# Attach this trainer to the FL scheduler and run the GRU4Rec training loop.
job.load_trainer_job(job_path, trainer_id)
job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer
trainer = FLTrainerFactory().create_fl_trainer(job)
trainer._current_ep = "127.0.0.1:{}".format(9000 + trainer_id)
place = fluid.CPUPlace()
trainer.start(place)

reader_builder = Gru4rec_Reader()
train_reader = reader_builder.reader(train_file_dir, place, batch_size=125)

output_folder = "model_node4"
epoch_i = 0
while not trainer.stop():
    epoch_i += 1
    train_step = 0
    for data in train_reader():
        ret_avg_cost = trainer.run(feed=data, fetch=["mean_0.tmp_0"])
        train_step += 1
        if train_step == trainer._step:
            break
    # Perplexity is derived from the cost of the last batch processed above.
    avg_ppl = np.exp(ret_avg_cost[0])
    newest_ppl = np.mean(avg_ppl)
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print("{} Epoch {} start train, train_step {}, ppl {}".format(
        stamp, epoch_i, train_step, newest_ppl))
    save_dir = (output_folder + "/epoch_%d") % epoch_i
    if trainer_id == 0:
        # Only the first trainer persists the model.
        print("start save")
        trainer.save_inference_program(save_dir)
    if epoch_i >= 5:
        break
# NOTE(review): the two lines below look like the tail of compute_privacy_budget,
# whose `def` line falls outside this chunk — original indentation could not be
# recovered; confirm against the full file.
E = 2 * epsilon * math.sqrt(step * sample_ratio)
print("({0}, {1})-DP".format(E, delta))

# Per-trainer checkpoint root; one subdirectory is created per epoch below.
output_folder = "model_node%d" % trainer_id
epoch_id = 0
step = 0
while not trainer.stop():
    epoch_id += 1
    if epoch_id > 10:
        break
    print("{} Epoch {} start train".format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
        epoch_id))
    for step_id, data in enumerate(train_reader()):
        acc = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0"])
        step += 1
    # Evaluate on the held-out reader after each epoch.
    acc_val = train_test(train_test_program=test_program,
                         train_test_reader=test_reader,
                         train_test_feed=feeder)
    print("Test with epoch %d, accuracy: %s" % (epoch_id, acc_val))
    # Report the cumulative (epsilon, delta) privacy budget for `step` updates.
    compute_privacy_budget(sample_ratio=0.001, epsilon=0.1, step=step,
                           delta=0.00001)
    save_dir = (output_folder + "/epoch_%d") % epoch_id
    # BUG FIX: save_dir was computed but the model was saved to output_folder
    # every epoch, overwriting the previous checkpoint; save into the
    # per-epoch directory instead.
    trainer.save_inference_program(save_dir)
# Build the runtime job for this trainer and connect it to the scheduler.
job = FLRunTimeJob()
job.load_trainer_job(job_path, trainer_id)
job._scheduler_ep = "127.0.0.1:9091"  # Inform the scheduler IP to trainer

trainer = FLTrainerFactory().create_fl_trainer(job)
use_cuda = False
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer._current_ep = "127.0.0.1:8192"
trainer.start(place=place)
trainer._logger.setLevel(logging.DEBUG)

# Each trainer consumes a distinct sample: skip `trainer_id` items, take the next.
g = reader()
for _ in range(trainer_id):
    next(g)
data = next(g)
print(data)

output_folder = "fl_model"
step_i = 0
while not trainer.stop():
    step_i += 1
    print("batch %d start train" % step_i)
    trainer.run(feed=data, fetch=[])
    if trainer_id == 0:
        # Only the first trainer persists the model.
        print("start saving model")
        trainer.save_inference_program(output_folder)
    if step_i >= 10:
        break
# Summary ###########
# One TensorBoard log directory per client.
data_writer = SummaryWriter(
    logdir=join(join(params["federated"]["logdir"], "data"),
                f"client_{trainer_id}"))

# Run #########
round_id = 0
while not trainer.stop():
    round_id += 1
    if round_id > params["federated"]["num_round"]:
        break
    # Local training: num_epoch passes over the local data per round.
    for e in range(params["federated"]["num_epoch"]):
        for data in train_reader():
            trainer.run(feeder.feed(data), fetch=job._target_names)
    train_metrics = metrics(trainer.exe, test_program, feeder, train_reader,
                            job._target_names)
    val_metrics = metrics(trainer.exe, test_program, feeder, val_reader,
                          job._target_names)
    if trainer_id == 0:
        test_metrics = metrics(trainer.exe, test_program, feeder, test_reader,
                               job._target_names)
    txt_log = "{} Round {} ".format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
        round_id)
    for idx in range(len(job._target_names)):
        metric_name = job._target_names[idx]
        txt_log += f"Train {metric_name}: {train_metrics[idx]} Val {metric_name}: {val_metrics[idx]}"
        data_writer.add_scalar(f"train/{metric_name}", train_metrics[idx],
                               round_id)
        data_writer.add_scalar(f"val/{metric_name}", val_metrics[idx],
                               round_id)
# Input placeholder for per-box crowd flags (variable length, LoD level 1).
is_crowd = fluid.layers.data(
    name='is_crowd', shape=[None, 1], dtype='int32', lod_level=1)

place = fluid.CUDAPlace(trainer_id)
feeder = fluid.DataFeeder(
    feed_list=[image, im_info, im_id, gt_bbox, gt_class, is_crowd],
    place=place)

output_folder = "5_model_node%d" % trainer_id
epoch_id = 0
step = 0
para_dir = "faster_rcnn_program"
while not trainer.stop():
    epoch_id += 1
    if epoch_id > 120:
        break
    print("epoch %d start train" % (epoch_id))
    # A fresh loader is built each epoch.
    loader_source = DataReader()
    data_loader = loader_source.test_loader()
    for step_id, data in enumerate(data_loader):
        acc = trainer.run(feeder.feed(data), fetch=['sum_0.tmp_0'])
        step += 1
        print("step: {}, loss: {}".format(step, acc))
    if trainer_id == 0:
        # Only the first trainer checkpoints, one directory per epoch.
        save_dir = (output_folder + "/epoch_%d") % epoch_id
        trainer.save(para_dir, save_dir)
acc_set.append(float(acc_np[0])) loss_set.append(float(loss[0])) acc_val_mean = np.array(acc_set).mean() avg_loss_mean = np.array(loss_set).mean() return acc_val_mean, avg_loss_mean output_folder = "models/model_node%d" % trainer_id epoch_id = 0 step = 0 while not trainer.stop(): epoch_id += 1 if epoch_id > 10: break print("{} Epoch {} start train".format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), epoch_id)) for step_id, data in enumerate(train_reader()): acc, loss = trainer.run(feeder.feed(data), fetch=["accuracy_0.tmp_0", "reduce_mean_0.tmp_0"]) step += 1 acc_val, avg_loss = train_test( train_test_program=test_program, train_test_reader=test_reader, train_test_feed=feeder) print("Test with epoch %d, accuracy: %s , loss: %s" % (epoch_id, acc_val, avg_loss)) save_dir = (output_folder + "/epoch_%d") % epoch_id trainer.save_inference_program(output_folder) print("{} Train is ended".format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))