def train_loop(exe, train_program, trainer_id):
    total_time = 0.
    for pass_id in xrange(conf.num_passes):
        train_pass_acc_evaluator.reset()
        start_time = time.time()
        total_samples = 0
        with profiler.profiler(
                "CPU", 'total',
                profile_path='./profile_res_%d' % trainer_id) as prof:
            for batch_id, data in enumerate(train_reader()):
                batch_start = time.time()
                cost_val, acc_val, size_val = exe.run(
                    train_program,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost, batch_acc_var, batch_size_var])
                train_pass_acc_evaluator.add(value=acc_val, weight=size_val)
                total_samples += float(size_val)
                if batch_id and batch_id % conf.log_period == 0:
                    print("Pass id: %d, batch id: %d, cost: %f, pass_acc: %f, "
                          "speed: %f, time: %f" %
                          (pass_id, batch_id, cost_val,
                           train_pass_acc_evaluator.eval(),
                           float(size_val) / (time.time() - batch_start),
                           time.time() - batch_start))
        end_time = time.time()
        total_time += (end_time - start_time)
        pass_test_acc = test(exe)
        print("Pass id: %d, test_acc: %f, speed: %f" %
              (pass_id, pass_test_acc,
               total_samples / (end_time - start_time)))
    print("Total train time: %f" % (total_time))
def _run_test_impl_(self, callback, feed, fetch, place,
                    use_parallel=False, use_nccl=False, use_gpu=False):
    """Run a single test and return the fetched values.

    Args:
        place(Place): the computation place.
        use_parallel(bool): whether to use parallel.for or not.

    Returns:
        Fetched numpy arrays.
    """
    if isinstance(fetch, basestring):
        fetch = [fetch]
    main = fluid.Program()
    startup = fluid.Program()
    # Fix the random seeds so runs are reproducible.
    main.random_seed = 10
    startup.random_seed = 10
    with fluid.program_guard(main, startup):
        generator = callback()
        # Automatically insert ParallelDo if use_parallel is True.
        if use_parallel:
            places = fluid.layers.get_places()
            pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
            data = next(generator)
            if isinstance(data, fluid.Variable):
                data = [data]
            with pd.do():
                ins = map(pd.read_input, data)
                if len(ins) == 1:
                    ins = ins[0]
                loss = generator.send(ins)  # patch input
                pd.write_output(loss)
            loss = pd()
        else:
            data = next(generator)
            loss = generator.send(data)
        self.assertIsNotNone(loss)
        avg_loss = fluid.layers.mean(loss)
        fluid.backward.append_backward(loss=avg_loss)

    exe = fluid.Executor(place)
    exe.run(startup)
    profile_type = 'GPU' if use_gpu else 'CPU'
    with profiler.profiler(profile_type, 'total', '/tmp/profiler'):
        return exe.run(main, feed=feed, fetch_list=fetch)
def train_loop(exe, trainer_prog, reader):
    exe.run(fluid.default_startup_program())
    for epoch_id in xrange(epoch_num):
        print "epoch_%d start" % epoch_id
        pass_start = time.time()
        i = 0
        with profiler.profiler(
                "All", 'total',
                profile_path="/usr/local/nvidia/lib64/tmp") as prof:
            for data in reader():
                i += 1
                lod_src_wordseq = to_lodtensor(
                    map(lambda x: x[0], data), place)
                lod_dst_wordseq = to_lodtensor(
                    map(lambda x: x[1], data), place)
                ret_average_cost = exe.run(
                    trainer_prog,
                    feed={
                        "src_wordseq": lod_src_wordseq,
                        "dst_wordseq": lod_dst_wordseq
                    },
                    fetch_list=[average_cost])
                average_ppl = math.exp(ret_average_cost[0])
                if i % 100 == 0:
                    print "step %d ppl: %.3f" % (i, average_ppl)
        print "total steps:", i
        spent = time.time() - pass_start
        print("Pass %d end, spent: %f, speed: %f" %
              (epoch_id, spent, 42068 / spent))
def net_profiler(self, state, profile_path='/tmp/profile'):
    enable_if_gpu = state == 'GPU' or state == "All"
    if enable_if_gpu and not core.is_compiled_with_cuda():
        return
    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        image = fluid.layers.data(name='x', shape=[784], dtype='float32')
        hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
        i = layers.zeros(shape=[1], dtype='int64')
        counter = fluid.layers.zeros(
            shape=[1], dtype='int64', force_cpu=True)
        until = layers.fill_constant([1], dtype='int64', value=10)
        data_arr = layers.array_write(hidden1, i)
        cond = fluid.layers.less_than(x=counter, y=until)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
            layers.array_write(hidden_n, i, data_arr)
            fluid.layers.increment(x=counter, value=1, in_place=True)
            layers.less_than(x=counter, y=until, cond=cond)

        hidden_n = layers.array_read(data_arr, i)
        hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
        predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
        label = fluid.layers.data(name='y', shape=[1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(cost)
        batch_size = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size)

    optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
    opts = optimizer.minimize(avg_cost, startup_program=startup_program)

    place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)

    pass_acc_calculator = fluid.average.WeightedAverage()
    with profiler.profiler(state, 'total', profile_path) as prof:
        for iter in range(10):
            if iter == 2:
                profiler.reset_profiler()
            x = np.random.random((32, 784)).astype("float32")
            y = np.random.randint(0, 10, (32, 1)).astype("int64")
            outs = exe.run(main_program,
                           feed={'x': x,
                                 'y': y},
                           fetch_list=[avg_cost, batch_acc, batch_size])
            acc = np.array(outs[1])
            b_size = np.array(outs[2])
            pass_acc_calculator.add(value=acc, weight=b_size)
            pass_acc = pass_acc_calculator.eval()
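Distilled from the test above, a minimal sketch of the profiling pattern most snippets in this section share: enter `profiler.profiler(state, sorted_key, profile_path)` as a context manager, then call `profiler.reset_profiler()` once warm-up is done so only steady-state iterations land in the report. The `run_one_batch` helper is hypothetical, standing in for one `exe.run(...)` step.

import paddle.fluid.profiler as profiler

# Minimal sketch, assuming a legacy fluid setup: profile everything,
# sort the report by total time, and write it to /tmp/profile.
with profiler.profiler('CPU', 'total', '/tmp/profile') as prof:
    for it in range(10):
        if it == 2:
            # Discard the first two (warm-up) iterations from the report.
            profiler.reset_profiler()
        run_one_batch()  # hypothetical: one executor step, as in net_profiler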
@contextlib.contextmanager  # requires `import contextlib`; without the
# decorator, a bare generator cannot be used in a `with` statement
def profile_context(profile=True):
    """Wrap the enclosed block in the profiler when profile is True."""
    if profile:
        with profiler.profiler('All', 'total',
                               './profile_dir/profile_file_tmp'):
            yield
    else:
        yield
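A hedged usage sketch for the helper above; `args`, `exe`, `main_program`, `feeder`, and `train_reader` are assumed names from a typical fluid training script, not definitions from this section. The point of the helper is that the loop body stays identical whether profiling is on or off.

# Hypothetical usage: profiling is toggled by a flag, the loop is unchanged.
with profile_context(profile=args.profile):
    for data in train_reader():
        exe.run(main_program, feed=feeder.feed(data))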
def train_loop(exe, trainer_prog):
    iters = 0
    ts = time.time()
    train_pass_acc = fluid.average.WeightedAverage()
    for pass_id in range(args.num_passes):
        # train
        start_time = time.time()
        num_samples = 0
        train_pass_acc.reset()

        def run_step(batch_id, data):
            img_data = np.array(
                map(lambda x: x[0].reshape(data_shape),
                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])
            loss, acc, b_size = exe.run(
                trainer_prog,
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size])
            return loss, acc, b_size

        if args.profile and args.task_index == 0:
            # warm-up
            for batch_id, data in enumerate(train_reader()):
                if batch_id > 5:
                    break
                run_step(batch_id, data)
            with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
                for batch_id, data in enumerate(train_reader()):
                    if batch_id > 5:
                        break
                    run_step(batch_id, data)

        for batch_id, data in enumerate(train_reader()):
            ts = time.time()
            loss, acc, b_size = run_step(batch_id, data)
            iters += 1
            num_samples += len(data)
            train_pass_acc.add(value=acc, weight=b_size)
            print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
                  "Speed = %.2f img/s" %
                  (pass_id, iters, loss, acc,
                   len(data) / (time.time() - ts))
                  )  # The accuracy is the accumulation of batches, but not the current batch.

        pass_elapsed = time.time() - start_time
        pass_train_acc = train_pass_acc.eval()
        pass_test_acc = test(exe)
        print("Task:%d Pass = %d, Training performance = %f imgs/s, "
              "Train accuracy = %f, Test accuracy = %f\n" %
              (args.task_index, pass_id, num_samples / pass_elapsed,
               pass_train_acc, pass_test_acc))
def main():
    args = parser.parse_args()
    print_arguments(args)
    if args.profile:
        if args.use_gpu:
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                train(args)
        else:
            with profiler.profiler("CPU", sorted_key='total') as cpuprof:
                train(args)
    else:
        train(args)
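Note the branching convention used here and in the `__main__` blocks further down: on GPU these scripts defer to `profiler.cuda_profiler`, which writes the CUDA trace to the named file (in `csv` mode here), while on CPU they fall back to the built-in `profiler.profiler` with the report sorted by total time.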
def net_profiler(self,
                 exe,
                 state,
                 tracer_option,
                 batch_range=None,
                 use_parallel_executor=False,
                 use_new_api=False):
    main_program, startup_program, avg_cost, batch_size, batch_acc = \
        self.build_program(compile_program=use_parallel_executor)
    exe.run(startup_program)

    profile_path = self.get_profile_path()
    if not use_new_api:
        with profiler.profiler(state, 'total', profile_path, tracer_option):
            pass_acc_calculator = fluid.average.WeightedAverage()
            for iter in range(10):
                if iter == 2:
                    profiler.reset_profiler()
                self.run_iter(exe, main_program,
                              [avg_cost, batch_acc, batch_size],
                              pass_acc_calculator)
    else:
        options = utils.ProfilerOptions(options={
            'state': state,
            'sorted_key': 'total',
            'tracer_level': tracer_option,
            'batch_range': [0, 10] if batch_range is None else batch_range,
            'profile_path': profile_path
        })
        with utils.Profiler(enabled=True, options=options) as prof:
            pass_acc_calculator = fluid.average.WeightedAverage()
            for iter in range(10):
                self.run_iter(exe, main_program,
                              [avg_cost, batch_acc, batch_size],
                              pass_acc_calculator)
                utils.get_profiler().record_step()
                if batch_range is None and iter == 2:
                    utils.get_profiler().reset()
    self.check_profile_result(profile_path)
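This test exercises both profiling interfaces: the legacy `profiler.profiler(state, sorted_key, profile_path, tracer_option)` context manager, where warm-up iterations are discarded manually via `reset_profiler()`, and the options-driven `utils.Profiler`, where a `batch_range` plus per-step `record_step()` calls let the profiler decide which iterations to record.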
def train_loop(exe, trainer_prog):
    iters = 0
    ts = time.time()
    for pass_id in range(args.num_passes):
        # train
        start_time = time.time()
        num_samples = 0
        accuracy.reset(exe)
        with profiler.profiler(args.device, 'total') as prof:
            for batch_id, data in enumerate(train_reader()):
                ts = time.time()
                img_data = np.array(
                    map(lambda x: x[0].reshape(data_shape),
                        data)).astype("float32")
                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                y_data = y_data.reshape([-1, 1])
                loss, acc = exe.run(
                    trainer_prog,
                    feed={"pixel": img_data,
                          "label": y_data},
                    fetch_list=[avg_cost] + accuracy.metrics)
                iters += 1
                num_samples += len(data)
                print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
                      "spent %f" %
                      (pass_id, iters, loss, acc, time.time() - ts)
                      )  # The accuracy is the accumulation of batches, but not the current batch.

        pass_elapsed = time.time() - start_time
        pass_train_acc = accuracy.eval(exe)
        pass_test_acc = test(exe)
        print("Pass = %d, Training performance = %f imgs/s, "
              "Train accuracy = %f, Test accuracy = %f\n" %
              (pass_id, num_samples / pass_elapsed, pass_train_acc,
               pass_test_acc))
def train_loop(exe, trainer_prog, trainer_id=0, reader=train_reader):
    embedding_name = 'emb'
    embedding_param = fluid.global_scope().find_var(
        embedding_name).get_tensor()
    embedding_param.set(word_vector_values, place)

    batch_id = 0
    for pass_id in xrange(num_passes):
        chunk_evaluator.reset(exe)
        start_time = time.time()
        with profiler.profiler(
                "CPU", 'total',
                profile_path="/usr/local/nvidia/lib64/tmp") as prof:
            for data in reader():
                cost, batch_precision, batch_recall, batch_f1_score = exe.run(
                    trainer_prog,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
                if batch_id % 5 == 0:
                    print("Pass " + str(pass_id) + ", Batch " +
                          str(batch_id) + ", Cost " + str(cost[0]) +
                          ", Precision " + str(batch_precision[0]) +
                          ", Recall " + str(batch_recall[0]) +
                          ", F1_score " + str(batch_f1_score[0]))
                batch_id = batch_id + 1

        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
        spent = time.time() - start_time
        print("pass_id: %d, precision: %f, recall: %f, f1: %f, "
              "spent: %f, speed: %f" %
              (pass_id, pass_precision, pass_recall, pass_f1_score,
               spent, 14987.0 / spent))

        pass_precision, pass_recall, pass_f1_score = test(
            exe, chunk_evaluator, inference_program, test_reader, place)
        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
              str(pass_precision) + " pass_recall:" + str(pass_recall) +
              " pass_f1_score:" + str(pass_f1_score))
def train_loop(use_gpu, trainer_prog, trainer_id=0):
    place = core.CPUPlace() if not use_gpu else core.CUDAPlace(0)
    iters = 0
    accuracy = fluid.average.WeightedAverage()
    start_time = time.time()
    num_samples = 0
    accuracy.reset()
    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
    exe = fluid.Executor(place)
    for pass_id in range(args.num_passes):
        # train
        start_time = time.time()
        num_samples = 0
        with profiler.profiler(
                "All", 'total',
                profile_path="/usr/local/nvidia/lib64/tmp") as prof:
            for batch_id, data in enumerate(train_reader()):
                batch_st = time.time()
                loss, acc, weight = exe.run(
                    trainer_prog,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost, batch_acc, batch_size_tensor])
                accuracy.add(value=acc, weight=weight)
                iters += 1
                num_samples += len(data)
                print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
                      "batch spent %f" %
                      (pass_id, iters, loss, acc, time.time() - batch_st)
                      )  # The accuracy is the accumulation of batches, but not the current batch.

        pass_elapsed = time.time() - start_time
        pass_train_acc = accuracy.eval()
        pass_test_acc = test(exe)
        print("Pass = %d, Training performance = %f imgs/s, "
              "Train accuracy = %f, Test accuracy = %f\n" %
              (pass_id, num_samples / pass_elapsed, pass_train_acc,
               pass_test_acc))
                      optimizer._global_learning_rate().numpy(),
                      total_loss / args.batch_size / args.log_period))
                total_loss = 0.0
            if total_step > 0 and total_step % args.save_model_period == 0:
                if fluid.dygraph.parallel.Env().dev_id == 0:
                    model_file = os.path.join(
                        args.save_model_dir, 'step_{}'.format(total_step))
                    fluid.save_dygraph(ocr_attention.state_dict(), model_file)
                    print('step_{}.pdparams saved!'.format(total_step))
            if total_step > 0 and total_step % args.eval_period == 0:
                ocr_attention.eval()
                evaluate(ocr_attention, test_reader, args.batch_size)
                ocr_attention.train()
            batch_id += 1


if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    if args.profile:
        if args.use_gpu:
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                train(args)
        else:
            with profiler.profiler("CPU", sorted_key='total') as cpuprof:
                train(args)
    else:
        train(args)
@contextlib.contextmanager  # requires `import contextlib`
def profile_context(profile=True):
    if profile:
        with profiler.profiler('All', 'total', '/tmp/paddingrnn.profile'):
            yield
    else:
        yield
def profile(args):
    """Profile the training process."""
    if not args.first_batches_to_skip < args.max_batch_num:
        raise ValueError("arg 'first_batches_to_skip' must be smaller than "
                         "'max_batch_num'.")
    if not args.first_batches_to_skip >= 0:
        raise ValueError(
            "arg 'first_batches_to_skip' must not be smaller than 0.")

    _, avg_cost, accuracy = stacked_lstmp_model(
        frame_dim=args.frame_dim,
        hidden_dim=args.hidden_dim,
        proj_dim=args.proj_dim,
        stacked_num=args.stacked_num,
        class_num=args.class_num,
        parallel=args.parallel)

    optimizer = fluid.optimizer.Adam(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.learning_rate,
            decay_steps=1879,
            decay_rate=1 / 1.2,
            staircase=True))
    optimizer.minimize(avg_cost)

    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    ltrans = [
        trans_add_delta.TransAddDelta(2, 2),
        trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
        trans_splice.TransSplice(5, 5),
        trans_delay.TransDelay(5)
    ]

    data_reader = reader.AsyncDataReader(
        args.feature_lst, args.label_lst, -1, split_sentence_threshold=1024)
    data_reader.set_transformers(ltrans)

    feature_t = fluid.LoDTensor()
    label_t = fluid.LoDTensor()

    # Note: `== 'None'` instead of the original `is 'None'`; identity
    # comparison against a string literal is unreliable.
    sorted_key = None if args.sorted_key == 'None' else args.sorted_key
    with profiler.profiler(args.device, sorted_key) as prof:
        frames_seen, start_time = 0, 0.0
        for batch_id, batch_data in enumerate(
                data_reader.batch_iterator(args.batch_size,
                                           args.minimum_batch_size)):
            if batch_id >= args.max_batch_num:
                break
            if args.first_batches_to_skip == batch_id:
                profiler.reset_profiler()
                start_time = time.time()
                frames_seen = 0
            # load data
            (features, labels, lod, _) = batch_data
            features = np.reshape(features, (-1, 11, 3, args.frame_dim))
            features = np.transpose(features, (0, 2, 1, 3))
            feature_t.set(features, place)
            feature_t.set_lod([lod])
            label_t.set(labels, place)
            label_t.set_lod([lod])
            frames_seen += lod[-1]

            outs = exe.run(
                fluid.default_main_program(),
                feed={"feature": feature_t,
                      "label": label_t},
                fetch_list=[avg_cost, accuracy]
                if args.print_train_acc else [],
                return_numpy=False)

            if args.print_train_acc:
                print("Batch %d acc: %f" %
                      (batch_id, lodtensor_to_ndarray(outs[1])[0]))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()

        time_consumed = time.time() - start_time
        frames_per_sec = frames_seen / time_consumed
        print("\nTime consumed: %f s, performance: %f frames/s." %
              (time_consumed, frames_per_sec))
def train_parallel_exe(args):
    class_dim = 1000
    image_shape = [3, 224, 224]

    if args.use_recordio:
        reader = fluid.layers.open_recordio_file(
            filename='./flowers_bs_12_3_224_224.recordio',
            shapes=[[-1, 3, 224, 224], [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'])
        image, label = fluid.layers.read_file(reader)
    else:
        image = fluid.layers.data(
            name='image', shape=image_shape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        place = fluid.CUDAPlace(0)
        feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
        train_reader = feeder.decorate_reader(
            paddle.batch(
                train() if args.use_fake_reader else flowers.train(),
                batch_size=args.batch_size_per_gpu),
            multi_devices=True)
        train_reader_iter = train_reader()
        if args.fix_data_in_gpu:
            data = train_reader_iter.next()
            feed_data = data

    prediction, avg_cost, accuracy, accuracy5 = net_conf(image, label,
                                                         class_dim)
    add_optimizer(args, avg_cost)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.allow_op_delay = True
    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = \
        fluid.BuildStrategy.ReduceStrategy.Reduce \
        if args.balance_parameter_opt_between_cards \
        else fluid.BuildStrategy.ReduceStrategy.AllReduce

    exe = fluid.ParallelExecutor(
        loss_name=avg_cost.name,
        use_cuda=True,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    time_record = []
    train_start = time.time()
    img_count = 0
    for batch_id in xrange(args.number_iteration):
        if args.do_profile and batch_id >= 5 and batch_id < 8:
            with profiler.profiler('All', 'total',
                                   '/tmp/profile_parallel_exe') as prof:
                if args.use_recordio:
                    exe.run([])
                else:
                    exe.run([],
                            feed=feed_data if args.fix_data_in_gpu else
                            train_reader_iter.next())
            continue

        if args.use_recordio:
            cost_val = exe.run([avg_cost.name] if
                               (batch_id + 1) % args.display_step == 0 else [])
        else:
            cost_val = exe.run(
                [avg_cost.name]
                if (batch_id + 1) % args.display_step == 0 else [],
                feed=feed_data
                if args.fix_data_in_gpu else train_reader_iter.next())
        img_count += args.batch_size

        if (batch_id + 1) % args.display_step == 0:
            train_stop = time.time()
            step_time = train_stop - train_start
            time_record.append(step_time)
            print("iter=%d, cost=%s, elapse=%f, img/sec=%f" %
                  ((batch_id + 1), np.array(cost_val[0]), step_time,
                   img_count / step_time))
            img_count = 0
            train_start = time.time()

    skip_time_record = args.skip_first_steps / args.display_step
    time_record[0:skip_time_record] = []

    if args.show_record_time:
        for i, ele in enumerate(time_record):
            print("iter:{0}, time consume:{1}".format(i, ele))

    img_count = (args.number_iteration -
                 args.skip_first_steps) * args.batch_size
    print("average time:{0}, img/sec:{1}".format(
        np.mean(time_record), img_count / np.sum(time_record)))
@contextlib.contextmanager  # requires `import contextlib`
def profile_context(profile=True, profiler_path='./seq2seq.profile'):
    if profile:
        with profiler.profiler('All', 'total', profiler_path):
            yield
    else:
        yield
def train_loop(exe, trainer_prog):
    iters = 0
    ts = time.time()
    train_pass_acc = fluid.average.WeightedAverage()
    acc_4passes = None
    converge_speed = None
    for pass_id in range(args.num_passes):
        # train
        start_time = time.time()
        num_samples = 0
        train_pass_acc.reset()

        def run_step(batch_id, data):
            img_data = np.array(
                map(lambda x: x[0].reshape(data_shape),
                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])
            loss, acc, b_size = exe.run(
                trainer_prog,
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size])
            return loss, acc, b_size

        if args.profile and args.task_index == 0:
            # warm-up
            for batch_id, data in enumerate(train_reader()):
                if batch_id > 5:
                    break
                run_step(batch_id, data)
            with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
                for batch_id, data in enumerate(train_reader()):
                    if batch_id > 5:
                        break
                    run_step(batch_id, data)

        for batch_id, data in enumerate(train_reader()):
            ts = time.time()
            loss, acc, b_size = run_step(batch_id, data)
            iters += 1
            num_samples += len(data)
            train_pass_acc.add(value=acc, weight=b_size)
            print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
                  "Speed = %.2f img/s" %
                  (pass_id, iters, loss, acc,
                   len(data) / (time.time() - ts))
                  )  # The accuracy is the accumulation of batches, but not the current batch.
            # record time-to-accuracy once acc_target is reached
            if args.acc_target and acc >= args.acc_target:
                converge_speed = time.time() - start_time

        pass_elapsed = time.time() - start_time
        pass_train_acc = train_pass_acc.eval()
        pass_test_acc = test(exe)
        if pass_id == 4:
            acc_4passes = pass_train_acc

        msgs = []
        msgs.append("pass = %d" % pass_id)
        # Parenthesize the division: `%` binds tighter than `/`, so the
        # original `"..." % num_samples / pass_elapsed` raised a TypeError.
        msgs.append("train_speed = %f" % (num_samples / pass_elapsed))
        msgs.append("train_accuracy = %f" % pass_train_acc)
        msgs.append("test_accuracy = %f" % pass_test_acc)
        if isinstance(acc_4passes, float):
            msgs.append("acc_4passes = %f" % acc_4passes)
        # converge_speed is a float (time delta), so check both numeric types.
        if isinstance(converge_speed, (int, float)):
            msgs.append("converge_speed = %d" % converge_speed)
        print("**metrics_data: " + ", ".join(msgs))
            sys.stdout.flush()
            batch_id += 1
            total_batch_num = total_batch_num + 1  # this is for benchmark
            # profiler tools for benchmark
            if args.profile and epoch == 0 and batch_id == 10:
                profiler.reset_profiler()
            elif args.profile and epoch == 0 and batch_id == 15:
                return
        if args.run_test and not args.run_ce:
            test(epoch)
        if args.save_checkpoints and not args.run_ce:
            checkpoints(epoch)
    if args.run_ce:
        print("kpis,g_train_cost,{}".format(np.mean(losses[0])))
        print("kpis,d_train_cost,{}".format(np.mean(losses[1])))
        print("kpis,duration,{}".format(t_time / args.epoch))


if __name__ == "__main__":
    args = parser.parse_args()
    print_arguments(args)
    if args.profile:
        if args.use_gpu:
            with profiler.profiler('All', 'total', args.profiler_path) as prof:
                train(args)
        else:
            with profiler.profiler("CPU", sorted_key='total') as cpuprof:
                train(args)
    else:
        train(args)
    sgd_optimizer.minimize(avg_cost)

    # The training data set.
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=51200),
        batch_size=conf.batch_size)

    # The testing data set.
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.test(word_dict), buf_size=51200),
        batch_size=conf.batch_size)

    if conf.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    exe.run(fluid.default_startup_program())
    print("Done Inferring.")


if __name__ == '__main__':
    args = parse_args()
    with profiler.profiler("GPU", 'total') as prof:
        main(args.dict_path)
def train():
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
    num_iterations = cfg.max_iter

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    model = model_builder.RCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
        use_pyreader=cfg.use_pyreader,
        use_random=False)
    model.build_model(image_shape)
    losses, keys = model.loss()
    loss = losses[0]
    fetch_list = [loss]

    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=exponential_with_warmup_decay(
            learning_rate=learning_rate,
            boundaries=boundaries,
            values=values,
            warmup_iter=500,
            warmup_factor=1.0 / 3.0),
        regularization=fluid.regularizer.L2Decay(0.0001),
        momentum=0.9)
    optimizer.minimize(loss)

    fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            return os.path.exists(
                os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=bool(cfg.use_gpu), loss_name=loss.name)

    if cfg.use_pyreader:
        train_reader = reader.train(
            batch_size=cfg.TRAIN.im_per_batch,
            total_batch_size=total_batch_size,
            padding_total=cfg.TRAIN.padding_minibatch,
            shuffle=False)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        train_reader = reader.train(
            batch_size=total_batch_size, shuffle=False)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def run(iterations):
        reader_time = []
        run_time = []
        total_images = 0
        for batch_id in range(iterations):
            start_time = time.time()
            data = next(train_reader())
            end_time = time.time()
            reader_time.append(end_time - start_time)

            start_time = time.time()
            if cfg.parallel:
                outs = train_exe.run(
                    fetch_list=[v.name for v in fetch_list],
                    feed=feeder.feed(data))
            else:
                outs = exe.run(fluid.default_main_program(),
                               fetch_list=[v.name for v in fetch_list],
                               feed=feeder.feed(data))
            end_time = time.time()
            run_time.append(end_time - start_time)
            total_images += len(data)
            print("Batch {:d}, loss {:.6f} ".format(batch_id,
                                                    np.mean(outs[0])))
        return reader_time, run_time, total_images

    def run_pyreader(iterations):
        reader_time = [0]
        run_time = []
        total_images = 0
        py_reader.start()
        try:
            for batch_id in range(iterations):
                start_time = time.time()
                if cfg.parallel:
                    outs = train_exe.run(
                        fetch_list=[v.name for v in fetch_list])
                else:
                    outs = exe.run(fluid.default_main_program(),
                                   fetch_list=[v.name for v in fetch_list])
                end_time = time.time()
                run_time.append(end_time - start_time)
                total_images += devices_num
                print("Batch {:d}, loss {:.6f} ".format(
                    batch_id, np.mean(outs[0])))
        except fluid.core.EOFException:
            py_reader.reset()
        return reader_time, run_time, total_images

    run_func = run if not cfg.use_pyreader else run_pyreader

    # warm-up
    run_func(2)
    # profiling
    start = time.time()
    if cfg.use_profile:
        with profiler.profiler('GPU', 'total', '/tmp/profile_file'):
            reader_time, run_time, total_images = run_func(num_iterations)
    else:
        reader_time, run_time, total_images = run_func(num_iterations)
    end = time.time()
    total_time = end - start
    print("Total time: {0}, reader time: {1} s, run time: {2} s, "
          "images/s: {3}".format(total_time, np.sum(reader_time),
                                 np.sum(run_time),
                                 total_images / total_time))
@contextlib.contextmanager  # requires `import contextlib`
def profile_context(profile=True):
    if profile:
        with profiler.profiler('All', 'total', args.profiler_path):
            yield
    else:
        yield
def profile(args):
    print args
    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    if not TrainTaskConfig.use_gpu:
        place = fluid.CPUPlace()
        dev_count = multiprocessing.cpu_count()
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(
        learning_rate=lr_scheduler.learning_rate,
        beta1=TrainTaskConfig.beta1,
        beta2=TrainTaskConfig.beta2,
        epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    # Disable all sorts for they will be done in the 1st batch.
    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type='none',
        shuffle=False,
        shuffle_batch=False,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    train_data = read_multiple(
        reader=train_data.batch_generator,
        count=dev_count if args.use_token_batch else 1)

    if dev_count > 1:
        build_strategy = fluid.BuildStrategy()
        build_strategy.gradient_scale_strategy = \
            fluid.BuildStrategy.GradientScaleStrategy.Customized
        train_exe = fluid.ParallelExecutor(
            use_cuda=TrainTaskConfig.use_gpu,
            loss_name=sum_cost.name,
            main_program=fluid.default_main_program(),
            build_strategy=build_strategy)

    print("Warming up ...")
    train_loop(exe if dev_count == 1 else train_exe,
               fluid.default_main_program(), False, 3, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict)

    print("\nProfiling ...")
    if dev_count == 1:
        with profiler.profiler('All', 'total', '/tmp/profile_file'):
            total_time, exec_time = train_loop(
                exe, fluid.default_main_program(), True, args.num_iters,
                train_data, dev_count, sum_cost, avg_cost, lr_scheduler,
                token_num, predict)
    else:
        total_time, exec_time = train_loop(
            train_exe, fluid.default_main_program(), True, args.num_iters,
            train_data, dev_count, sum_cost, avg_cost, lr_scheduler,
            token_num, predict)
    print("Elapsed time: total %f s, in executor %f s" %
          (total_time, exec_time))
    latencies = batch_times[args.skip_pass_num:]
    latency_avg = np.average(latencies)
    latency_std = np.std(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    wps_avg = np.average(wpses)
    wps_std = np.std(wpses)
    wps_pc01 = np.percentile(wpses, 1)

    # Benchmark output
    print('\nTotal passes (incl. warm-up): %d' % (total_passes))
    print('Total iterations (incl. warm-up): %d' % (all_iters))
    print('Total examples (incl. warm-up): %d' %
          (all_iters * args.batch_size))
    print('avg latency: %.5f, std latency: %.5f, 99pc latency: %.5f' %
          (latency_avg, latency_std, latency_pc99))
    print('avg wps: %.5f, std wps: %.5f, wps for 99pc latency: %.5f' %
          (wps_avg, wps_std, wps_pc01))


if __name__ == "__main__":
    args = parse_args()
    print_arguments(args)
    if args.profile:
        if args.device == 'GPU':
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                infer(args)
        else:
            with profiler.profiler('CPU', sorted_key='total') as cpuprof:
                infer(args)
    else:
        infer(args)
def net_profiler(self, state, use_parallel_executor=False):
    profile_path = os.path.join(tempfile.gettempdir(), "profile")
    open(profile_path, "w").write("")
    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        image = fluid.layers.data(name='x', shape=[784], dtype='float32')
        hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
        i = layers.zeros(shape=[1], dtype='int64')
        counter = fluid.layers.zeros(
            shape=[1], dtype='int64', force_cpu=True)
        until = layers.fill_constant([1], dtype='int64', value=10)
        data_arr = layers.array_write(hidden1, i)
        cond = fluid.layers.less_than(x=counter, y=until)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
            layers.array_write(hidden_n, i, data_arr)
            fluid.layers.increment(x=counter, value=1, in_place=True)
            layers.less_than(x=counter, y=until, cond=cond)

        hidden_n = layers.array_read(data_arr, i)
        hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
        predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
        label = fluid.layers.data(name='y', shape=[1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(cost)
        batch_size = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size)

    optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
    opts = optimizer.minimize(avg_cost, startup_program=startup_program)

    place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)
    if use_parallel_executor:
        pe = fluid.ParallelExecutor(
            state != 'CPU',
            loss_name=avg_cost.name,
            main_program=main_program)

    pass_acc_calculator = fluid.average.WeightedAverage()
    with profiler.profiler(state, 'total', profile_path) as prof:
        for iter in range(10):
            if iter == 2:
                profiler.reset_profiler()
            x = np.random.random((32, 784)).astype("float32")
            y = np.random.randint(0, 10, (32, 1)).astype("int64")

            if use_parallel_executor:
                pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
                continue
            outs = exe.run(main_program,
                           feed={'x': x,
                                 'y': y},
                           fetch_list=[avg_cost, batch_acc, batch_size])
            acc = np.array(outs[1])
            b_size = np.array(outs[2])
            pass_acc_calculator.add(value=acc, weight=b_size)
            pass_acc = pass_acc_calculator.eval()

    data = open(profile_path, 'rb').read()
    self.assertGreater(len(data), 0)
    profile_pb = profiler_pb2.Profile()
    profile_pb.ParseFromString(data)
    self.assertGreater(len(profile_pb.events), 0)
    for event in profile_pb.events:
        if event.type == profiler_pb2.Event.GPUKernel:
            if not event.detail_info and not event.name.startswith("MEM"):
                raise Exception(
                    "Kernel %s missing event. Has this kernel been recorded "
                    "by RecordEvent?" % event.name)
        elif event.type == profiler_pb2.Event.CPU and (
                event.name.startswith("Driver API") or
                event.name.startswith("Runtime API")):
            print("Warning: unregister", event.name)
@contextlib.contextmanager  # requires `import contextlib`
def profile_context(profile=True):
    if profile:
        with profiler.profiler('All', 'total', 'seq2seq.profile'):
            yield
    else:
        yield
def train_parallel_do(args):
    class_dim = 1000
    image_shape = [3, 224, 224]

    image = fluid.layers.data(
        name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    places = fluid.layers.get_places()
    pd = fluid.layers.ParallelDo(places, use_nccl=True)

    with pd.do():
        image_ = pd.read_input(image)
        label_ = pd.read_input(label)
        out = SE_ResNeXt(input=image_, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label_)
        avg_cost = fluid.layers.mean(x=cost)
        accuracy = fluid.layers.accuracy(input=out, label=label_)
        pd.write_output(avg_cost)
        pd.write_output(accuracy)

    avg_cost, accuracy = pd()
    avg_cost = fluid.layers.mean(x=avg_cost)
    # accuracy = fluid.layers.mean(x=accuracy)

    add_optimizer(args, avg_cost)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    train_reader = paddle.batch(
        train() if args.use_fake_reader else flowers.train(),
        batch_size=args.batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_reader_iter = train_reader()
    if args.fix_data_in_gpu:
        data = train_reader_iter.next()
        feed_data = feeder.feed(data)

    time_record = []
    img_count = 0
    train_start = time.time()
    for batch_id in range(args.number_iteration):
        if args.do_profile and batch_id >= 5 and batch_id < 8:
            with profiler.profiler('All', 'total',
                                   '/tmp/profile_parallel_do') as prof:
                exe.run(fluid.default_main_program(),
                        feed=feed_data if args.fix_data_in_gpu else
                        feeder.feed(train_reader_iter.next()),
                        fetch_list=[],
                        use_program_cache=True)
            continue

        cost_val = exe.run(fluid.default_main_program(),
                           feed=feed_data if args.fix_data_in_gpu else
                           feeder.feed(train_reader_iter.next()),
                           fetch_list=[avg_cost.name]
                           if (batch_id + 1) % args.display_step == 0 else [],
                           use_program_cache=True)
        img_count += args.batch_size

        if (batch_id + 1) % args.display_step == 0:
            train_stop = time.time()
            step_time = train_stop - train_start
            time_record.append(step_time)
            print("iter=%d, cost=%s, elapse=%f, img/sec=%f" %
                  ((batch_id + 1), np.array(cost_val[0]), step_time,
                   img_count / step_time))
            img_count = 0
            train_start = time.time()

    skip_time_record = args.skip_first_steps / args.display_step
    time_record[0:skip_time_record] = []

    if args.show_record_time:
        for i, ele in enumerate(time_record):
            print("iter:{0}, time consume:{1}".format(i, ele))

    img_count = (args.number_iteration -
                 args.skip_first_steps) * args.batch_size
    print("average time:{0}, img/sec:{1}".format(
        np.mean(time_record), img_count / np.sum(time_record)))
@contextlib.contextmanager  # requires `import contextlib`
def profile_context(profile=True):
    if profile:
        with profiler.profiler('All', 'total', '/tmp/profile_file2'):
            yield
    else:
        yield
def train(args, config, train_file_list, optimizer_method):
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    height = args.resize_h
    width = args.resize_w
    use_gpu = args.use_gpu
    use_pyramidbox = args.use_pyramidbox
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    num_iterations = args.num_iteration
    parallel = args.parallel

    num_classes = 2
    image_shape = [3, height, width]

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    with fluid.program_guard(train_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(
                py_reader)
            fetches = []
            network = PyramidBox(
                image=image,
                face_box=face_box,
                head_box=head_box,
                gt_label=gt_label,
                sub_network=use_pyramidbox)
            if use_pyramidbox:
                face_loss, head_loss, loss = network.train()
                fetches = [face_loss, head_loss]
            else:
                loss = network.vgg_ssd_loss()
                fetches = [loss]

            devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
            devices_num = len(devices.split(","))
            batch_size_per_device = batch_size // devices_num
            steps_per_pass = 12880 // batch_size
            boundaries = [
                steps_per_pass * 50, steps_per_pass * 80,
                steps_per_pass * 120, steps_per_pass * 140
            ]
            values = [
                learning_rate, learning_rate * 0.5, learning_rate * 0.25,
                learning_rate * 0.1, learning_rate * 0.01
            ]

            if optimizer_method == "momentum":
                optimizer = fluid.optimizer.Momentum(
                    learning_rate=fluid.layers.piecewise_decay(
                        boundaries=boundaries, values=values),
                    momentum=0.9,
                    regularization=fluid.regularizer.L2Decay(0.0005),
                )
            else:
                optimizer = fluid.optimizer.RMSProp(
                    learning_rate=fluid.layers.piecewise_decay(boundaries,
                                                               values),
                    regularization=fluid.regularizer.L2Decay(0.0005),
                )
            optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    start_pass = 0
    if pretrained_model:
        if pretrained_model.isdigit():
            start_pass = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " % (pretrained_model))
        if not os.path.exists(pretrained_model):
            raise ValueError("The pre-trained model path [%s] does not exist."
                             % (pretrained_model))

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_gpu, loss_name=loss.name, main_program=train_prog)

    train_reader = reader.train(
        config,
        train_file_list,
        batch_size_per_device,
        shuffle=False,
        use_multiprocessing=True,
        num_workers=8,
        max_queue=24)
    py_reader.decorate_paddle_reader(train_reader)

    def run(iterations):
        # global feed_data
        py_reader.start()
        run_time = []
        for batch_id in range(iterations):
            start_time = time.time()
            if parallel:
                fetch_vars = train_exe.run(
                    fetch_list=[v.name for v in fetches])
            else:
                fetch_vars = exe.run(train_prog, fetch_list=fetches)
            end_time = time.time()
            run_time.append(end_time - start_time)
            fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
            if not args.use_pyramidbox:
                print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0]))
            else:
                print("Batch {0}, face loss {1}, head loss {2}".format(
                    batch_id, fetch_vars[0], fetch_vars[1]))
        return run_time

    # warm-up
    run(2)
    # profiling
    start = time.time()
    if not parallel:
        with profiler.profiler('All', 'total', '/tmp/profile_file'):
            run_time = run(num_iterations)
    else:
        run_time = run(num_iterations)
    end = time.time()
    total_time = end - start
    print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
        total_time, total_time - np.sum(run_time), np.sum(run_time)))