def save_xbox_model(output_path, day, pass_id, exe, feed_vars, target_vars,
                    client):
    """
    Save an xbox inference model and upload it to HDFS.

    Args:
        output_path(str): output path
        day(str|int): training day
        pass_id(str|int): training pass id; -1 means the day-level base model
        exe(Executor): executor used to save the inference model
        feed_vars(list): feed var list for inference model
        target_vars(list): target var list for inference model
        client(HDFSClient): hadoop client
    """
    if pass_id != -1:
        mode = 1
        suffix_name = "/%s/delta-%s/" % (day, pass_id)
        model_path = output_path.rstrip("/") + suffix_name
    else:
        mode = 2
        suffix_name = "/%s/base/" % day
        model_path = output_path.rstrip("/") + suffix_name
    fleet.save_inference_model(
        exe, model_path, [feed.name for feed in feed_vars],
        target_vars,
        mode=mode)
    if not is_local(model_path):
        client.upload("./dnn_plugin", model_path)
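
# --- Usage sketch (not part of the original source) ---
# save_xbox_model lays models out as <output>/<day>/delta-<pass>/ for
# per-pass saves and <output>/<day>/base/ for the day-level base model.
# The helper below is a hypothetical, standalone re-statement of that path
# logic; it needs no fleet/HDFS setup and can be run directly:
def _xbox_model_path(output_path, day, pass_id):
    # pass_id == -1 selects the base model, anything else a delta model
    if pass_id != -1:
        return output_path.rstrip("/") + "/%s/delta-%s/" % (day, pass_id)
    return output_path.rstrip("/") + "/%s/base/" % day

assert _xbox_model_path("hdfs:/out/", "20230101", 3) == "hdfs:/out/20230101/delta-3/"
assert _xbox_model_path("hdfs:/out/", "20230101", -1) == "hdfs:/out/20230101/base/"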
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    # set up the GPU parameter server (gpubox): register the sparse slots
    # and the devices selected through FLAGS_selected_gpus
    gpus_env = os.getenv("FLAGS_selected_gpus")
    self.PSGPU = paddle.fluid.core.PSGPU()
    gpuslot = list(range(1, self.model.sparse_inputs_slots))
    print("gpuslot: {}".format(gpuslot))
    self.PSGPU.set_slot_vector(gpuslot)
    self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
    opt_info = paddle.fluid.default_main_program()._fleet_opt
    opt_info['stat_var_names'] = []

    for epoch in range(epochs):
        epoch_start_time = time.time()

        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif sync_mode == "gpubox":
            self.dataset_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
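
# --- Illustrative sketch (assumption, not original code) ---
# The gpubox path above derives its device list from FLAGS_selected_gpus,
# which is expected to hold a comma-separated list of device ids:
import os

os.environ.setdefault("FLAGS_selected_gpus", "0,1")  # illustrative value
gpu_ids = [int(s) for s in os.environ["FLAGS_selected_gpus"].split(",")]
print(gpu_ids)  # -> [0, 1]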
def run_online_worker(self):
    logger.info("Run Online Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    # let the shell expand runner.days (e.g. a space-separated list or a
    # brace range) into individual day strings
    days = os.popen("echo -n " +
                    self.config.get("runner.days")).read().split(" ")
    pass_per_day = int(self.config.get("runner.pass_per_day"))

    for day_index in range(len(days)):
        day = days[day_index]
        for pass_index in range(1, pass_per_day + 1):
            logger.info("Day: {} Pass: {} Begin.".format(day, pass_index))

            prepare_data_start_time = time.time()
            dataset = self.wait_and_prepare_dataset(day, pass_index)
            prepare_data_end_time = time.time()
            logger.info("Prepare Dataset Done, using time {} second.".format(
                prepare_data_end_time - prepare_data_start_time))

            train_start_time = time.time()
            self.dataset_train_loop(dataset, day, pass_index)
            train_end_time = time.time()
            logger.info("Train Dataset Done, using time {} second.".format(
                train_end_time - train_start_time))

            # save the inference model for this pass
            model_dir = "{}/{}/{}".format(save_model_path, day, pass_index)
            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var,
                    mode=2)
        # after the last pass of the day, save again with mode=0
        # (reuses the final pass's model_dir)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var,
                mode=0)
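
# --- Illustrative sketch (assumption, not original code) ---
# runner.days is passed through `echo -n` so the shell can expand it; a
# space-separated value such as "20230101 20230102" therefore becomes a
# Python list (a brace range like "{20230101..20230103}" also works where
# /bin/sh supports brace expansion):
import os

days = os.popen("echo -n " + "20230101 20230102").read().split(" ")
print(days)  # -> ['20230101', '20230102']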
def do_dataset_training(self, fleet):
    train_file_list = ctr_dataset_reader.prepare_fake_data()

    exe = self.get_executor()
    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    thread_num = 2
    batch_size = 128
    filelist = train_file_list

    # config dataset
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_use_var(self.feeds)
    dataset.set_batch_size(batch_size)
    dataset.set_thread(thread_num)
    dataset.set_filelist(filelist)
    dataset.set_pipe_command('python ctr_dataset_reader.py')
    dataset.load_into_memory()

    dataset.global_shuffle(fleet, 12)  # TODO: thread configure
    shuffle_data_size = dataset.get_shuffle_data_size(fleet)
    local_data_size = dataset.get_shuffle_data_size()
    data_size_list = fleet.util.all_gather(local_data_size)
    print('after global_shuffle data_size_list: ', data_size_list)
    print('after global_shuffle data_size: ', shuffle_data_size)

    for epoch_id in range(1):
        pass_start = time.time()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[self.avg_cost],
                               fetch_info=["cost"],
                               print_period=2,
                               debug=int(os.getenv("Debug", "0")))
        pass_time = time.time() - pass_start
    dataset.release_memory()

    if os.getenv("SAVE_MODEL") == "1":
        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(exe, model_dir,
                                   [feed.name for feed in self.feeds],
                                   self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)

    dirname = os.getenv("SAVE_DIRNAME", None)
    if dirname:
        fleet.save_persistables(exe, dirname=dirname)

    cache_dirname = os.getenv("SAVE_CACHE_DIRNAME", None)
    if cache_dirname:
        fleet.save_cache_model(cache_dirname)
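
# --- Illustrative sketch (assumption, not original code) ---
# The test above is driven by environment variables; setting them before
# the run exercises each optional save path (values are illustrative):
import os

os.environ["Debug"] = "1"                            # verbose train_from_dataset
os.environ["SAVE_MODEL"] = "1"                       # save + verify inference model
os.environ["SAVE_DIRNAME"] = "/tmp/ctr_persist"      # also save persistables
os.environ["SAVE_CACHE_DIRNAME"] = "/tmp/ctr_cache"  # also save the cache model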
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    for epoch in range(epochs):
        epoch_start_time = time.time()

        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
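
# --- Illustrative sketch (assumption, not original code) ---
# run_worker reads its behavior from a runner config; the keys below are
# the ones it queries above, with made-up example values:
config = {
    "runner.use_gpu": 0,                   # CPUPlace when falsy
    "runner.model_save_path": "output_model",
    "runner.reader_type": "QueueDataset",  # or "DataLoader" / "RecDataset" / None
    "runner.epochs": 2,
    "runner.sync_mode": "async",           # "heter" routes to heter_train_loop
}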
def do_pyreader_training(self, fleet):
    """
    Do training with a py_reader, fetching avg_cost each step.

    Args:
        fleet(Fleet api): the fleet object of Parameter Server,
            define distribute training role
    """
    exe = self.get_executor()
    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    batch_size = 4
    train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
    self.reader.decorate_sample_list_generator(train_reader)

    for epoch_id in range(1):
        self.reader.start()
        try:
            pass_start = time.time()
            while True:
                loss_val = exe.run(program=fluid.default_main_program(),
                                   fetch_list=[self.avg_cost.name])
                loss_val = np.mean(loss_val)
                # TODO(randomly fail)
                # reduce_output = fleet.util.all_reduce(
                #     np.array(loss_val), mode="sum")
                # loss_all_trainer = fleet.util.all_gather(float(loss_val))
                # loss_val = float(reduce_output) / len(loss_all_trainer)
                message = "TRAIN ---> pass: {} loss: {}\n".format(
                    epoch_id, loss_val)
                fleet.util.print_on_rank(message, 0)
            # note: the loop only exits via EOFException, so this never runs
            pass_time = time.time() - pass_start
        except fluid.core.EOFException:
            self.reader.reset()

    dirname = os.getenv("SAVE_DIRNAME", None)
    if dirname:
        fleet.save_persistables(exe, dirname=dirname)

    model_dir = tempfile.mkdtemp()
    fleet.save_inference_model(exe, model_dir,
                               [feed.name for feed in self.feeds],
                               self.avg_cost)
    self.check_model_right(model_dir)
    shutil.rmtree(model_dir)
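
# --- Illustrative sketch (assumption, not original code) ---
# decorate_sample_list_generator expects a batched reader: a callable that
# returns a generator of sample lists. fake_ctr_reader (defined elsewhere in
# the test) follows this shape; _tiny_reader is a hypothetical stand-in:
import paddle

def _tiny_reader():
    def reader():
        for i in range(8):
            yield [i], [i % 2]  # one (feature, label) sample
    return reader

batched = paddle.batch(_tiny_reader(), batch_size=4)
for batch in batched():
    print(len(batch))  # -> 4, printed twice (8 samples / batch_size 4)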
def do_dataset_training_queuedataset(self, fleet):
    train_file_list = ctr_dataset_reader.prepare_fake_data()

    exe = self.get_executor()
    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    thread_num = 2
    batch_size = 128
    filelist = train_file_list

    # config dataset
    dataset = paddle.distributed.QueueDataset()
    pipe_command = 'python ctr_dataset_reader.py'
    dataset.init(batch_size=batch_size,
                 use_var=self.feeds,
                 pipe_command=pipe_command,
                 thread_num=thread_num)
    dataset.set_filelist(filelist)

    for epoch_id in range(1):
        pass_start = time.time()
        dataset.set_filelist(filelist)
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[self.avg_cost],
                               fetch_info=["cost"],
                               print_period=2,
                               debug=int(os.getenv("Debug", "0")))
        pass_time = time.time() - pass_start

    if os.getenv("SAVE_MODEL") == "1":
        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(exe, model_dir,
                                   [feed.name for feed in self.feeds],
                                   self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)

    dirname = os.getenv("SAVE_DIRNAME", None)
    if dirname:
        fleet.save_persistables(exe, dirname=dirname)
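
# --- Note (assumption, not original code) ---
# Unlike the InMemoryDataset test above, QueueDataset streams samples from
# the filelist, so there is no load_into_memory() / global_shuffle() /
# release_memory() phase; re-issuing set_filelist inside the epoch loop
# restarts the stream for the next pass.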