def run_online_worker(self):
    """Run the online (streaming) training loop over days and passes.

    For each (day, pass) pair: wait for and build the dataset, run one
    training pass over it, and — on the first worker of a distributed job —
    save inference models under ``runner.model_save_path``.

    Side effects: dumps this worker's main/startup programs to prototxt
    files, runs the startup program, and initializes the fleet worker.
    """
    logger.info("Run Online Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    # Dump the compiled programs for debugging, one file pair per worker.
    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    # NOTE(review): the shell round-trip presumably exists so that
    # ``runner.days`` values like "{20200101..20200107}" get brace-expanded
    # by the shell — confirm; if the config is always a plain
    # space-separated string, a simple ``.split()`` (no subprocess) would do.
    days = os.popen("echo -n " +
                    self.config.get("runner.days")).read().split(" ")
    pass_per_day = int(self.config.get("runner.pass_per_day"))

    for day in days:  # iterate values directly instead of range(len(...))
        for pass_index in range(1, pass_per_day + 1):
            logger.info("Day: {} Pass: {} Begin.".format(day, pass_index))

            prepare_data_start_time = time.time()
            dataset = self.wait_and_prepare_dataset(day, pass_index)
            prepare_data_end_time = time.time()
            logger.info(
                "Prepare Dataset Done, using time {} second.".format(
                    prepare_data_end_time - prepare_data_start_time))

            train_start_time = time.time()
            self.dataset_train_loop(dataset, day, pass_index)
            train_end_time = time.time()
            logger.info(
                "Train Dataset Done, using time {} second.".format(
                    train_end_time - train_start_time))

            model_dir = "{}/{}/{}".format(save_model_path, day, pass_index)
            # The original evaluated this identical guard twice in a row,
            # once per save call; evaluate it once and save both variants.
            if (fleet.is_first_worker() and save_model_path
                    and is_distributed_env()):
                feed_names = [feed.name for feed in self.input_data]
                # mode=2 and mode=0 write different model flavors to the
                # same dir — presumably delta vs full model; confirm against
                # fleet.save_inference_model documentation.
                fleet.save_inference_model(
                    self.exe, model_dir, feed_names,
                    self.inference_target_var, mode=2)
                fleet.save_inference_model(
                    self.exe, model_dir, feed_names,
                    self.inference_target_var, mode=0)
def run_worker(self):
    """Run epoch-based training on a GPU-box (PSGPU) worker.

    Initializes the executor and PSGPU sparse-table runtime, then trains
    ``runner.epochs`` epochs, dispatching each epoch to the loop matching
    ``runner.sync_mode`` / ``runner.reader_type``. On the first worker of a
    distributed job, saves an inference model after every epoch.

    Side effects: dumps this worker's main/startup programs to prototxt
    files, runs the startup program, initializes the fleet worker and the
    PSGPU device set from ``FLAGS_selected_gpus``.
    """
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    # Dump the compiled programs for debugging, one file pair per worker.
    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    # PSGPU setup: register the sparse slot ids (1..slots-1; slot 0 is
    # presumably the label/show-click slot — confirm) and bind the GPUs
    # assigned to this worker.
    gpus_env = os.getenv("FLAGS_selected_gpus")
    self.PSGPU = paddle.fluid.core.PSGPU()
    gpuslot = list(range(1, self.model.sparse_inputs_slots))  # already ints
    logger.info("gpuslot: {}".format(gpuslot))  # was a bare print()
    self.PSGPU.set_slot_vector(gpuslot)
    self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
    opt_info = paddle.fluid.default_main_program()._fleet_opt
    opt_info['stat_var_names'] = []

    for epoch in range(epochs):
        epoch_start_time = time.time()
        # Dispatch order matters: sync_mode overrides reader_type.
        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif sync_mode == "gpubox":
            self.dataset_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            # `is None` instead of `== None` (identity comparison idiom)
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if (fleet.is_first_worker() and save_model_path
                and is_distributed_env()):
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
def run_worker(self):
    """Run epoch-based training on a CPU/GPU parameter-server worker.

    Trains ``runner.epochs`` epochs, dispatching each epoch to the training
    loop matching ``runner.sync_mode`` / ``runner.reader_type``. On the
    first worker of a distributed job, saves an inference model after every
    epoch under ``runner.model_save_path``.

    Side effects: dumps this worker's main/startup programs to prototxt
    files, runs the startup program, and initializes the fleet worker.
    """
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    # Dump the compiled programs for debugging, one file pair per worker.
    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    for epoch in range(epochs):
        epoch_start_time = time.time()
        # Dispatch order matters: sync_mode "heter" overrides reader_type.
        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            # `is None` instead of `== None` (identity comparison idiom)
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if (fleet.is_first_worker() and save_model_path
                and is_distributed_env()):
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)