def init_distributed_infer_env(self, exe, loss, role_maker=None, dirname=None):
    import paddle.distributed.fleet as fleet

    if fleet.fleet._runtime_handle is None:
        fleet.init(role_maker=role_maker)

        # a throwaway optimizer: minimize() is only called to build the
        # distributed (parameter-server) program, not to actually train
        fake_optimizer = paddle.optimizer.SGD()
        strategy = fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = fleet.distributed_optimizer(fake_optimizer, strategy=strategy)
        optimizer.minimize(loss, startup_program=self.origin_startup_program)

        if fleet.is_server():
            fleet.init_server(dirname=dirname)
            fleet.run_server()
        else:
            exe.run(paddle.static.default_startup_program())
            fleet.init_worker()
            self._init_dense_params(exe, dirname)

        # point the globals at the original programs (the original code first
        # assigned the default programs here, then immediately overwrote them;
        # the dead stores are dropped)
        global_startup_program = self.origin_startup_program
        global_main_program = self.origin_main_program
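# A minimal sketch of how init_distributed_infer_env might be driven,
# assuming the surrounding class is fleet's DistributedInfer utility
# (paddle.distributed.fleet.utils.ps_util); the program variables and the
# dirname below are illustrative, not taken from the snippet above:
#
#   exe = paddle.static.Executor(paddle.CPUPlace())
#   dist_infer = DistributedInfer(main_program=infer_main_program,
#                                 startup_program=infer_startup_program)
#   dist_infer.init_distributed_infer_env(exe, loss, dirname="./saved_params")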
def test_gradient_merge_optimizer(self):
    fleet.init(role_maker.PaddleCloudRoleMaker())

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 6)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()
def train(args):
    import logging
    log.setLevel(logging.DEBUG)
    log.info("start")

    worker_num = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
    num_devices = int(os.getenv("CPU_NUM", 10))

    data = load_raw_edges_fn(args.edge_path, args.undirected)
    edges = data[0]
    weights = data[1]
    node2idx = data[2]
    num_nodes = len(node2idx)

    model = DeepwalkModel(num_nodes, args.hidden_size, args.neg_num,
                          args.is_sparse, args.is_distributed, 1.)
    pyreader = model.pyreader
    loss = model.forward()

    # init fleet
    log.info("init_role")
    init_role()

    train_steps = math.ceil(1. * num_nodes * args.epoch / args.batch_size /
                            num_devices / worker_num)
    log.info("Train step: %s" % train_steps)

    if args.optimizer == "sgd":
        args.lr *= args.batch_size * args.walk_len * args.win_size

    optimization(args.lr, loss, train_steps, args.optimizer)

    # init and run server or worker
    if fleet.is_server():
        log.info("PS server mode")
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        log.info("start init worker")
        exe = F.Executor(F.CPUPlace())
        exe.run(F.default_startup_program())
        log.info("Startup done")
        fleet.init_worker()  # just the worker, load the sample
        log.info("init worker done")

        print("LEO num_nodes:", num_nodes, len(edges))
        edges_feat = {}
        edges_feat["weight"] = np.array(weights)
        graph = pgl.graph.Graph(num_nodes, edges, edge_feat=edges_feat)

        # bind gen
        gen_func = build_gen_func(args, graph)
        pyreader.decorate_tensor_provider(gen_func)

        train_prog(exe, F.default_main_program(), loss, pyreader, args,
                   train_steps)
        print("fleet try to stop worker\r\n")
        fleet.stop_worker()
        print("Game over\r\n")
def run_offline_infer(self):
    logger.info("Run Offline Infer Begin")
    place = paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)
    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    init_model_path = config.get("runner.init_model_path")
    model_mode = config.get("runner.model_mode", 0)
    if fleet.is_first_worker():
        fleet.load_model(init_model_path, mode=model_mode)
    fleet.barrier_worker()

    logger.info("Prepare Dataset Begin.")
    prepare_data_start_time = time.time()
    dataset = self.wait_and_prepare_dataset()
    prepare_data_end_time = time.time()
    logger.info("Prepare Dataset Done, using time {} second.".format(
        prepare_data_end_time - prepare_data_start_time))

    infer_start_time = time.time()
    self.dataset_offline_infer(dataset)
    infer_end_time = time.time()
    logger.info("Infer Dataset Done, using time {} second.".format(
        infer_end_time - infer_start_time))
def test_gradient_merge_optimizer(self):
    fleet.init(role_maker.PaddleCloudRoleMaker())

    x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
    cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = paddle.fluid.layers.mean(cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 0)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()
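# A plausible reading of the two sync-mode tests above: the first net has
# three fc layers, so the transpiled program emits one send op per
# weight/bias pair (sends == 6), while this net (square_error_cost over two
# data inputs) has no trainable parameters, hence sends == 0. In both cases
# the trailing send_barrier closes the sync-mode step, and no sgd op appears
# on the trainer side because updates run on the parameter servers.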
def test_communicator_sync(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
        "127.0.0.1:36001,127.0.0.2:36001"

    fleet.init(role_maker.PaddleCloudRoleMaker())
    avg_cost = self.net()

    optimizer = fluid.optimizer.SGD(0.01)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    fleet.init_worker()
    time.sleep(10)
    fleet.stop_worker()
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    # GPU parameter-server (gpubox) setup: bind sparse slots to the PSGPU
    gpus_env = os.getenv("FLAGS_selected_gpus")
    self.PSGPU = paddle.fluid.core.PSGPU()
    gpuslot = list(range(1, self.model.sparse_inputs_slots))
    print("gpuslot: {}".format(gpuslot))
    self.PSGPU.set_slot_vector(gpuslot)
    self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
    opt_info = paddle.fluid.default_main_program()._fleet_opt
    opt_info['stat_var_names'] = []

    for epoch in range(epochs):
        epoch_start_time = time.time()
        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif sync_mode == "gpubox":
            self.dataset_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
def run_online_worker(self):
    logger.info("Run Online Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    days = os.popen("echo -n " + self.config.get("runner.days")).read().split(" ")
    pass_per_day = int(self.config.get("runner.pass_per_day"))

    for day_index in range(len(days)):
        day = days[day_index]
        for pass_index in range(1, pass_per_day + 1):
            logger.info("Day: {} Pass: {} Begin.".format(day, pass_index))

            prepare_data_start_time = time.time()
            dataset = self.wait_and_prepare_dataset(day, pass_index)
            prepare_data_end_time = time.time()
            logger.info("Prepare Dataset Done, using time {} second.".format(
                prepare_data_end_time - prepare_data_start_time))

            train_start_time = time.time()
            self.dataset_train_loop(dataset, day, pass_index)
            train_end_time = time.time()
            logger.info("Train Dataset Done, using time {} second.".format(
                train_end_time - train_start_time))

            model_dir = "{}/{}/{}".format(save_model_path, day, pass_index)
            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                # save both a delta (mode=2) and a full (mode=0) inference model
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var,
                    mode=2)
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var,
                    mode=0)
def do_dataset_training(self, fleet):
    train_file_list = ctr_dataset_reader.prepare_fake_data()

    exe = self.get_executor()
    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    thread_num = 2
    batch_size = 128
    filelist = train_file_list

    # config dataset
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_use_var(self.feeds)
    dataset.set_batch_size(batch_size)
    dataset.set_thread(thread_num)
    dataset.set_filelist(filelist)
    dataset.set_pipe_command('python ctr_dataset_reader.py')
    dataset.load_into_memory()

    dataset.global_shuffle(fleet, 12)  # TODO: thread configure
    shuffle_data_size = dataset.get_shuffle_data_size(fleet)
    local_data_size = dataset.get_shuffle_data_size()
    data_size_list = fleet.util.all_gather(local_data_size)
    print('after global_shuffle data_size_list: ', data_size_list)
    print('after global_shuffle data_size: ', shuffle_data_size)

    for epoch_id in range(1):
        pass_start = time.time()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[self.avg_cost],
                               fetch_info=["cost"],
                               print_period=2,
                               debug=int(os.getenv("Debug", "0")))
        pass_time = time.time() - pass_start
    dataset.release_memory()

    if os.getenv("SAVE_MODEL") == "1":
        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(exe, model_dir,
                                   [feed.name for feed in self.feeds],
                                   self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)

    dirname = os.getenv("SAVE_DIRNAME", None)
    if dirname:
        fleet.save_persistables(exe, dirname=dirname)

    cache_dirname = os.getenv("SAVE_CACHE_DIRNAME", None)
    if cache_dirname:
        fleet.save_cache_model(cache_dirname)
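# Note on the shuffle bookkeeping above: get_shuffle_data_size(fleet) reports
# the example count across all trainers after the global shuffle, while the
# no-argument call reports only this worker's local share — which is why the
# local sizes are all_gather'ed and printed side by side for comparison.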
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    for epoch in range(epochs):
        epoch_start_time = time.time()
        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    fake_num_nodes = 1
    py_reader, loss = StaticSkipGramModel(
        fake_num_nodes,
        args.neg_num,
        args.embed_size,
        sparse_embedding=True,
        shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = build_graph(args)

        # bind gen
        train_ds = ShardedDataset(graph.nodes, args.epoch)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.cpu_batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)
        py_reader.set_batch_generator(lambda: data_loader)

        train_loss = train(exe, paddle.static.default_main_program(),
                           py_reader, loss)
        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
def test_ps_minimize(self):
    import paddle
    import paddle.distributed.fleet as fleet

    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["PADDLE_TRAINER_ID"] = "1"

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_slot = paddle.fluid.layers.data(name="slot", shape=[1], dtype='int64')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    emb = paddle.fluid.layers.embedding(
        input=input_slot, size=[10, 9], is_sparse=True)
    input_x = paddle.concat(x=[input_x, emb], axis=1)
    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    role = fleet.PaddleCloudRoleMaker(is_collective=False)
    fleet.init(role)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = False
    strategy.a_sync_configs = {"launch_barrier": False}

    optimizer = paddle.optimizer.SGD(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(paddle.static.default_startup_program())
    pe = fluid.ParallelExecutor(use_cuda=False, loss_name=avg_cost.name)
    compiled_prog = fluid.compiler.CompiledProgram(
        fluid.default_main_program())

    fleet.init_worker()
    fleet.fleet.save(dirname="/tmp", feed=['x', 'y'], fetch=[avg_cost])
    fleet.fleet.save(dirname="/tmp", feed=[input_x, input_y], fetch=[avg_cost])
    fleet.fleet.save(dirname="/tmp")

    fleet.load_model(path="/tmp", mode=0)
    fleet.load_model(path="/tmp", mode=1)
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)

        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        for epoch in range(args.epoch):
            train_loss = train(exe, paddle.static.default_main_program(),
                               data_loader, loss)
            log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)

        fleet.stop_worker()
def test_a_sync_optimizer_trainer(self):
    os.environ["TRAINING_ROLE"] = "TRAINER"
    import paddle.distributed.fleet as fleet

    main_program = paddle.fluid.Program()
    startup_program = paddle.fluid.Program()
    paddle.fluid.framework.switch_main_program(main_program)
    paddle.fluid.framework.switch_startup_program(startup_program)

    fleet.init(role_maker.PaddleCloudRoleMaker())
    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    prog = paddle.fluid.default_main_program()
    self.assertNotEqual(prog.global_block().ops[-1].type, "send_barrier")

    sends = 0
    sgds = 0
    for op in prog.global_block().ops:
        if op.type == "send":
            sends += 1
        if op.type == "sgd":
            sgds += 1
    self.assertEqual(sends, 7)
    self.assertEqual(sgds, 0)

    fleet.init_worker()
    time.sleep(8)
    fleet.stop_worker()
def do_pyreader_training(self, fleet):
    """
    do training using py_reader, fetching the loss variable each step

    Args:
        fleet(Fleet api): the fleet object of Parameter Server,
            define distribute training role
    """
    exe = self.get_executor()
    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    batch_size = 4
    train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
    self.reader.decorate_sample_list_generator(train_reader)

    for epoch_id in range(1):
        self.reader.start()
        try:
            pass_start = time.time()
            while True:
                loss_val = exe.run(program=fluid.default_main_program(),
                                   fetch_list=[self.avg_cost.name])
                loss_val = np.mean(loss_val)
                # TODO(randomly fail)
                # reduce_output = fleet.util.all_reduce(
                #     np.array(loss_val), mode="sum")
                # loss_all_trainer = fleet.util.all_gather(float(loss_val))
                # loss_val = float(reduce_output) / len(loss_all_trainer)
                message = "TRAIN ---> pass: {} loss: {}\n".format(
                    epoch_id, loss_val)
                fleet.util.print_on_rank(message, 0)
            pass_time = time.time() - pass_start
        except fluid.core.EOFException:
            self.reader.reset()

    dirname = os.getenv("SAVE_DIRNAME", None)
    if dirname:
        fleet.save_persistables(exe, dirname=dirname)

    model_dir = tempfile.mkdtemp()
    fleet.save_inference_model(exe, model_dir,
                               [feed.name for feed in self.feeds],
                               self.avg_cost)
    self.check_model_right(model_dir)
    shutil.rmtree(model_dir)
def do_dataset_training_queuedataset(self, fleet):
    train_file_list = ctr_dataset_reader.prepare_fake_data()

    exe = self.get_executor()
    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    thread_num = 2
    batch_size = 128
    filelist = train_file_list

    # config dataset
    dataset = paddle.distributed.QueueDataset()
    pipe_command = 'python ctr_dataset_reader.py'
    dataset.init(batch_size=batch_size,
                 use_var=self.feeds,
                 pipe_command=pipe_command,
                 thread_num=thread_num)
    dataset.set_filelist(filelist)

    for epoch_id in range(1):
        pass_start = time.time()
        dataset.set_filelist(filelist)
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[self.avg_cost],
                               fetch_info=["cost"],
                               print_period=2,
                               debug=int(os.getenv("Debug", "0")))
        pass_time = time.time() - pass_start

    if os.getenv("SAVE_MODEL") == "1":
        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(exe, model_dir,
                                   [feed.name for feed in self.feeds],
                                   self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)

    dirname = os.getenv("SAVE_DIRNAME", None)
    if dirname:
        fleet.save_persistables(exe, dirname=dirname)
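# Contrast with do_dataset_training above: QueueDataset streams examples
# straight from the filelist, so there is no load_into_memory /
# global_shuffle / release_memory step; each epoch just resets the filelist
# before calling train_from_dataset again.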
def run_trainer(self, role, strategy):
    place = fluid.core.CPUPlace()
    exe = fluid.Executor(place)

    fleet.init(role)
    avg_cost, x, z, y = self.net()
    optimizer = fluid.optimizer.SGD(0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    train_reader = paddle.batch(self.fake_reader(), batch_size=24)
    feeder = fluid.DataFeeder(place=place, feed_list=[x, z, y])

    for batch_id, data in enumerate(train_reader()):
        exe.run(fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[])

    fleet.stop_worker()
def test_communicator_async(self):
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_num=2,
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

    fleet.init(role)
    avg_cost = self.net()

    optimizer = fluid.optimizer.SGD(0.01)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"launch_barrier": False}
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    os.environ["TEST_MODE"] = "1"
    fleet.init_worker()
    time.sleep(10)
    fleet.stop_worker()
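# An assumption worth stating for the tests above: setting
# a_sync_configs["launch_barrier"] = False appears to skip the startup
# barrier across trainers, which is what lets a single-process test call
# init_worker() even though the role maker declares worker_num=2.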
def run_online_worker(self):
    logger.info("Run Online Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    self.online_intervals = get_online_pass_interval(
        self.split_interval, self.split_per_pass, False)
    if is_local(self.save_model_path) and self.save_model_path and (
            not os.path.exists(self.save_model_path)):
        os.makedirs(self.save_model_path)

    last_day, last_pass, last_path, xbox_base_key = get_last_save_model(
        self.save_model_path, self.hadoop_client)
    logger.info(
        "get_last_save_model last_day = {}, last_pass = {}, last_path = {}, "
        "xbox_base_key = {}".format(last_day, last_pass, last_path,
                                    xbox_base_key))
    if last_day != -1 and fleet.is_first_worker():
        load_model(last_path, 0, self.hadoop_client)
    fleet.barrier_worker()

    day = self.start_day
    infer_first = True
    while int(day) <= int(self.end_day):
        logger.info("training a new day {}, end_day = {}".format(
            day, self.end_day))
        if last_day != -1 and int(day) < last_day:
            day = get_next_day(day)
            continue

        # base_model_saved = False
        for pass_id in range(1, 1 + len(self.online_intervals)):
            print(last_day, day, last_pass, pass_id)
            if (last_day != -1 and int(day) == last_day) and (
                    last_pass != -1 and int(pass_id) <= last_pass):
                continue

            if self.save_first_base and fleet.is_first_worker():
                self.save_first_base = False
                last_base_day, last_base_path, tmp_xbox_base_key = \
                    get_last_save_xbox_base(self.save_model_path,
                                            self.hadoop_client)
                logger.info(
                    "get_last_save_xbox_base, last_base_day = {}, "
                    "last_base_path = {}, tmp_xbox_base_key = {}".format(
                        last_base_day, last_base_path, tmp_xbox_base_key))
                if int(day) > last_base_day:
                    xbox_base_key = int(time.time())
                    save_xbox_model(self.save_model_path, day, -1, self.exe,
                                    self.inference_feed_vars,
                                    self.inference_target_var,
                                    self.hadoop_client)
                    write_xbox_donefile(
                        output_path=self.save_model_path,
                        day=day,
                        pass_id=-1,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client)
                elif int(day) == last_base_day:
                    xbox_base_key = tmp_xbox_base_key
            fleet.barrier_worker()

            logger.info("training a new day = {} new pass = {}".format(
                day, pass_id))
            logger.info("Day:{}, Pass: {}, Prepare Dataset Begin.".format(
                day, pass_id))
            begin_train = time.time()
            begin = time.time()
            dataset = self.wait_and_prepare_dataset(day, pass_id)
            end = time.time()
            read_data_cost = (end - begin) / 60.0
            logger.info("Prepare Dataset Done, using time {} mins.".format(
                read_data_cost))

            infer_cost = 0
            infer_metric_cost = 0
            if infer_first:
                infer_first = False
            else:
                logger.info("Day:{}, Pass: {}, Inferring Dataset Begin.".format(
                    day, pass_id))
                begin = time.time()
                self.dataset_infer_loop(dataset, day, pass_id)
                end = time.time()
                infer_cost = (end - begin) / 60.0
                logger.info("Inferring Dataset Done, using time {} mins.".format(
                    infer_cost))
                begin = time.time()
                metric_str = get_global_metrics_str(fluid.global_scope(),
                                                    self.metric_list, "")
                logger.info("Day:{}, Pass: {}, Infer Global Metric: {}".format(
                    day, pass_id, metric_str))
                clear_metrics(fluid.global_scope(), self.metric_list,
                              self.metric_types)
                end = time.time()
                infer_metric_cost = (end - begin) / 60.0

            logger.info("Day:{}, Pass: {}, Training Dataset Begin.".format(
                day, pass_id))
            begin = time.time()
            self.dataset_train_loop(dataset, day, pass_id,
                                    self.need_train_dump)
            end = time.time()
            avg_cost = get_avg_cost_mins(end - begin)
            get_max_cost_mins(end - begin)
            get_min_cost_mins(end - begin)
            train_cost = avg_cost
            logger.info("Training Dataset Done, using time {} mins.".format(
                train_cost))

            begin = time.time()
            dataset.release_memory()
            end = time.time()
            release_cost = (end - begin) / 60.0

            begin = time.time()
            metric_str = get_global_metrics_str(fluid.global_scope(),
                                                self.metric_list, "")
            logger.info("Day:{}, Pass: {}, Train Global Metric: {}".format(
                day, pass_id, metric_str))
            clear_metrics(fluid.global_scope(), self.metric_list,
                          self.metric_types)
            end = time.time()
            metric_cost = (end - begin) / 60

            end_train = time.time()
            total_cost = (end_train - begin_train) / 60
            other_cost = (total_cost - read_data_cost - train_cost -
                          release_cost - metric_cost - infer_cost -
                          infer_metric_cost)
            log_str = "finished train epoch %d time cost:%s min job time cost" \
                      ":[read_data:%s min][train: %s min][metric: %s min]" \
                      "[release: %s min][infer:%s min][infer_metric: %s min]" \
                      "[other:%s min]" \
                      % (pass_id, total_cost, read_data_cost, train_cost,
                         metric_cost, release_cost, infer_cost,
                         infer_metric_cost, other_cost)
            logger.info(log_str)

            if self.need_infer_dump:
                prepare_data_start_time = time.time()
                dump_dataset = self.wait_and_prepare_infer_dataset(day, pass_id)
                prepare_data_end_time = time.time()
                logger.info(
                    "Prepare Infer Dump Dataset Done, using time {} second.".
                    format(prepare_data_end_time - prepare_data_start_time))

                dump_start_time = time.time()
                self.dataset_infer_loop(dump_dataset, day, pass_id, True)
                dump_end_time = time.time()
                logger.info(
                    "Infer Dump Dataset Done, using time {} second.".format(
                        dump_end_time - dump_start_time))
                dump_dataset.release_memory()

            if fleet.is_first_worker():
                if pass_id % self.checkpoint_per_pass == 0:
                    save_model(self.exe, self.save_model_path, day, pass_id)
                    write_model_donefile(
                        output_path=self.save_model_path,
                        day=day,
                        pass_id=pass_id,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client)
                if pass_id % self.save_delta_frequency == 0:
                    last_xbox_day, last_xbox_pass, last_xbox_path, _ = \
                        get_last_save_xbox(self.save_model_path,
                                           self.hadoop_client)
                    if int(day) < last_xbox_day or int(
                            day) == last_xbox_day and int(
                                pass_id) <= last_xbox_pass:
                        log_str = "delta model exists"
                        logger.info(log_str)
                    else:
                        save_xbox_model(self.save_model_path, day, pass_id,
                                        self.exe, self.inference_feed_vars,
                                        self.inference_target_var,
                                        self.hadoop_client)  # 1 delta
                        write_xbox_donefile(
                            output_path=self.save_model_path,
                            day=day,
                            pass_id=pass_id,
                            xbox_base_key=xbox_base_key,
                            client=self.hadoop_client,
                            hadoop_fs_name=self.hadoop_fs_name,
                            monitor_data=metric_str)
            fleet.barrier_worker()

        logger.info("shrink table")
        begin = time.time()
        fleet.shrink()
        end = time.time()
        logger.info("shrink table done, cost %s min" % ((end - begin) / 60.0))

        if fleet.is_first_worker():
            last_base_day, last_base_path, last_base_key = \
                get_last_save_xbox_base(self.save_model_path,
                                        self.hadoop_client)
            logger.info(
                "one epoch finishes, get_last_save_xbox, last_base_day = {}, "
                "last_base_path = {}, last_base_key = {}".format(
                    last_base_day, last_base_path, last_base_key))
            next_day = get_next_day(day)
            if int(next_day) <= last_base_day:
                logger.info("batch model/base xbox model exists")
            else:
                xbox_base_key = int(time.time())
                save_xbox_model(self.save_model_path, next_day, -1, self.exe,
                                self.inference_feed_vars,
                                self.inference_target_var,
                                self.hadoop_client)
                write_xbox_donefile(
                    output_path=self.save_model_path,
                    day=next_day,
                    pass_id=-1,
                    xbox_base_key=xbox_base_key,
                    client=self.hadoop_client,
                    hadoop_fs_name=self.hadoop_fs_name,
                    monitor_data=metric_str)
                save_batch_model(self.exe, self.save_model_path, next_day)
                write_model_donefile(
                    output_path=self.save_model_path,
                    day=next_day,
                    pass_id=-1,
                    xbox_base_key=xbox_base_key,
                    client=self.hadoop_client)
        fleet.barrier_worker()
        day = get_next_day(day)
def fit():
    EPOCH_NUM = 3
    BATCH_SIZE = 128
    type_size = 10

    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
    fleet.init(role)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    type_size = createDataList('F:/机器学习/CNN/train',
                               'D:/cnn/cnn.model.data' + "/")

    # data providers for training and testing
    train_reader = dataReader("D:/cnn/cnn.model.data/trainer.list")
    train_reader = paddle.batch(
        paddle.reader.shuffle(reader=train_reader, buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)
    test_reader = dataReader("D:/cnn/cnn.model.data/test.list")
    test_reader = paddle.batch(
        paddle.reader.shuffle(reader=test_reader, buf_size=BATCH_SIZE * 100),
        batch_size=BATCH_SIZE)

    data_shape = [3, 32, 32]
    paddle.enable_static()
    images = fluid.layers.data(name='images', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # build the classifier
    predict = networkConfiguration(images, type_size)

    # define loss and accuracy
    cost = fluid.layers.cross_entropy(input=predict, label=label)  # cross entropy
    avg_cost = fluid.layers.mean(cost)  # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)  # accuracy from prediction and label

    # clone the test program before attaching the optimizer
    test_program = fluid.default_main_program().clone(for_test=True)

    # define the optimizer
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        fleet.init_worker()

        ########## model training & evaluation ##########
        # create the executor
        use_cuda = False  # use_cuda=False runs on CPU
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        print("done")

        # define the data feeder
        feeder = fluid.DataFeeder(feed_list=[images, label], place=place)

        for pass_id in range(EPOCH_NUM):
            # training
            for batch_id, data in enumerate(train_reader()):  # iterate train_reader
                train_cost, train_acc = exe.run(
                    program=fluid.default_main_program(),  # run the main program
                    feed=feeder.feed(data),                # feed one batch
                    fetch_list=[avg_cost, acc])            # fetch loss and accuracy
                # print progress every 20 batches
                if batch_id % 20 == 0:
                    print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                          (pass_id, batch_id, train_cost[0], train_acc[0]))

            # testing
            test_costs = []  # per-batch test losses
            test_accs = []   # per-batch test accuracies
            for batch_id, data in enumerate(test_reader()):
                test_cost, test_acc = exe.run(
                    program=test_program,        # run the test program
                    feed=feeder.feed(data),      # feed the data
                    fetch_list=[avg_cost, acc])  # fetch loss and accuracy
                test_costs.append(test_cost[0])
                test_accs.append(test_acc[0])
            test_cost = sum(test_costs) / len(test_costs)  # mean test loss
            test_acc = sum(test_accs) / len(test_accs)     # mean test accuracy
            print('Test:%d, Cost:%0.5f, ACC:%0.5f' %
                  (pass_id, test_cost, test_acc))

        # save inside the worker branch, where exe is defined
        save(predict, "D:/cnn/cnn.model", exe)
# limitations under the License.

import os
import fleetx as X
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker

configs = X.parse_train_configs()
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

model = X.applications.MultiSlotCTR()
loader = model.load_multislot_from_file('./ctr_data/train_data')

dist_strategy = fleet.DistributedStrategy()
dist_strategy.a_sync = True  # async parameter-server mode (the original also
                             # set a_sync = False first; that dead store is dropped)

optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
optimizer.minimize(model.loss)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
else:
    fleet.init_worker()
    trainer = X.Trainer(fluid.CPUPlace())
    trainer.fit(model, loader, epoch=10)
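# A typical local launch for a parameter-server script like this one — a
# sketch; the flag values are illustrative and the script name ctr_train.py
# is assumed, not taken from the file above:
#
#   python -m paddle.distributed.launch --server_num=1 --worker_num=2 ctr_train.py
#
# The launcher sets TRAINING_ROLE and the PADDLE_* endpoint variables that
# PaddleCloudRoleMaker reads, so the same script runs as server or worker.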
def test_init_worker():
    """test_init_worker"""
    assert fleet.init_worker() is None
    print("{} ... ok".format(sys._getframe().f_code.co_name))
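# A companion check in the same style could exercise the worker barrier —
# a sketch, assuming the same module-level fleet setup as test_init_worker:
#
#   def test_barrier_worker():
#       """test_barrier_worker"""
#       assert fleet.barrier_worker() is None
#       print("{} ... ok".format(sys._getframe().f_code.co_name))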
def test_init_worker(self):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    if fleet.is_worker():
        fleet.init_worker()
def main(args):
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    data = pgl.dataset.RedditDataset(args.normalize, args.symmetry)
    log.info("Preprocess finish")
    log.info("Train Examples: %s" % len(data.train_index))
    log.info("Val Examples: %s" % len(data.val_index))
    log.info("Test Examples: %s" % len(data.test_index))
    log.info("Num nodes %s" % data.graph.num_nodes)
    log.info("Num edges %s" % data.graph.num_edges)
    log.info("Average Degree %s" % np.mean(data.graph.indegree()))

    graph = data.graph
    train_index = data.train_index
    val_index = data.val_index
    test_index = data.test_index
    train_label = data.train_label
    val_label = data.val_label
    test_label = data.test_label

    loss, acc = build_net(
        input_size=data.feature.shape[-1],
        num_class=data.num_classes,
        hidden_size=args.hidden_size,
        num_layers=len(args.samples))
    test_program = paddle.static.default_main_program().clone(for_test=True)

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    optimizer = paddle.fluid.optimizer.Adam(learning_rate=args.lr)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(loss)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    else:
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        train_ds = ShardedDataset(train_index, train_label)
        valid_ds = ShardedDataset(val_index, val_label)
        test_ds = ShardedDataset(test_index, test_label)

        collate_fn = partial(batch_fn, graph=graph, samples=args.samples)

        train_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)
        valid_loader = Dataloader(
            valid_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)
        test_loader = Dataloader(
            test_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        compiled_prog, cpu_num = setup_compiled_prog(loss)

        for epoch in tqdm.tqdm(range(args.epoch)):
            train_loss, train_acc = run(train_loader, data.feature, exe,
                                        compiled_prog, loss, acc,
                                        phase="train", cpu_num=cpu_num)

            valid_loss, valid_acc = run(valid_loader, data.feature, exe,
                                        test_program, loss, acc,
                                        phase="valid", cpu_num=1)
            log.info("Epoch %s Valid-Loss %s Valid-Acc %s" %
                     (epoch, valid_loss, valid_acc))

            test_loss, test_acc = run(test_loader, data.feature, exe,
                                      test_program, loss, acc,
                                      phase="test", cpu_num=1)
            log.info("Epoch %s Test-Loss %s Test-Acc %s" %
                     (epoch, test_loss, test_acc))

        fleet.stop_worker()
def test_communicator_ps_gpu(self):
    with open("test_communicator_ps_gpu.txt", "w") as f:
        data = "1 0.6 1 0.7\n"
        f.write(data)

    os.environ["PADDLE_PSERVER_NUMS"] = "2"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["POD_IP"] = "127.0.0.1"
    os.environ["PADDLE_PORT"] = "36001"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = \
        "127.0.0.1:36001,127.0.0.2:36001"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
        "127.0.0.1:36002,127.0.0.2:36002"
    os.environ["TRAINING_ROLE"] = "TRAINER"
    os.environ["FLAGS_selected_gpus"] = "0"

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    slots_vars = [x, y]

    cost = fluid.layers.square_error_cost(input=x, label=y)
    avg_cost = fluid.layers.mean(cost)

    optimizer = fluid.optimizer.Adam(0.01)
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {
        "launch_barrier": False,
        "use_ps_gpu": 1,
    }

    startup_program = paddle.static.Program()
    main_program = paddle.static.Program()

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    dataset = paddle.distributed.InMemoryDataset()
    dataset.init(batch_size=32,
                 thread_num=1,
                 pipe_command="cat",
                 use_var=slots_vars)
    dataset.set_filelist(["test_communicator_ps_gpu.txt"])
    dataset.set_date("20211111")
    dataset.load_into_memory(is_shuffle=True)

    os.environ["TEST_MODE"] = "1"
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_program)
    main_program._fleet_opt = {"stat_var_names": [x.name]}
    fleet.init_worker()

    try:
        exe.train_from_dataset(main_program, dataset)
    except ImportError as e:
        pass
    except Exception as e:
        self.assertTrue(False)

    time.sleep(10)
    fleet.stop_worker()
    os.remove("./test_communicator_ps_gpu.txt")
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()

    fleet.init()

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(num_nodes, args.neg_num, args.embed_size,
                               sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)

        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        cpu_num = int(os.environ.get('CPU_NUM', 1))
        if cpu_num > 1:
            # data-parallel execution over multiple CPU places
            parallel_places = [paddle.CPUPlace()] * cpu_num
            exec_strategy = paddle.static.ExecutionStrategy()
            exec_strategy.num_threads = cpu_num
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.reduce_strategy = \
                paddle.static.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = paddle.static.CompiledProgram(
                paddle.static.default_main_program()).with_data_parallel(
                    loss_name=loss.name,
                    places=parallel_places,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
        else:
            compiled_prog = paddle.static.default_main_program()

        for epoch in range(args.epoch):
            train_loss = train(exe, compiled_prog, data_loader, loss)
            log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)

        fleet.stop_worker()

        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())