def run_online_worker(self): logger.info("Run Online Worker Begin") use_cuda = int(config.get("runner.use_gpu")) place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() self.exe = paddle.static.Executor(place) with open("./{}_worker_main_program.prototxt".format( fleet.worker_index()), 'w+') as f: f.write(str(paddle.static.default_main_program())) with open("./{}_worker_startup_program.prototxt".format( fleet.worker_index()), 'w+') as f: f.write(str(paddle.static.default_startup_program())) self.exe.run(paddle.static.default_startup_program()) fleet.init_worker() save_model_path = self.config.get("runner.model_save_path") if save_model_path and (not os.path.exists(save_model_path)): os.makedirs(save_model_path) days = os.popen("echo -n " + self.config.get("runner.days")).read().split(" ") pass_per_day = int(self.config.get("runner.pass_per_day")) for day_index in range(len(days)): day = days[day_index] for pass_index in range(1, pass_per_day + 1): logger.info("Day: {} Pass: {} Begin.".format(day, pass_index)) prepare_data_start_time = time.time() dataset = self.wait_and_prepare_dataset(day, pass_index) prepare_data_end_time = time.time() logger.info( "Prepare Dataset Done, using time {} second.".format(prepare_data_end_time - prepare_data_start_time)) train_start_time = time.time() self.dataset_train_loop(dataset, day, pass_index) train_end_time = time.time() logger.info( "Train Dataset Done, using time {} second.".format(train_end_time - train_start_time)) model_dir = "{}/{}/{}".format(save_model_path, day, pass_index) if fleet.is_first_worker() and save_model_path and is_distributed_env(): fleet.save_inference_model( self.exe, model_dir, [feed.name for feed in self.input_data], self.inference_target_var, mode=2) if fleet.is_first_worker() and save_model_path and is_distributed_env(): fleet.save_inference_model( self.exe, model_dir, [feed.name for feed in self.input_data], self.inference_target_var, mode=0)
def run_offline_infer(self):
    logger.info("Run Offline Infer Begin")
    place = paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)
    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    init_model_path = config.get("runner.init_model_path")
    model_mode = config.get("runner.model_mode", 0)
    if fleet.is_first_worker():
        fleet.load_model(init_model_path, mode=model_mode)
    fleet.barrier_worker()

    logger.info("Prepare Dataset Begin.")
    prepare_data_start_time = time.time()
    dataset = self.wait_and_prepare_dataset()
    prepare_data_end_time = time.time()
    logger.info("Prepare Dataset Done, using time {} second.".format(
        prepare_data_end_time - prepare_data_start_time))

    infer_start_time = time.time()
    self.dataset_offline_infer(dataset)
    infer_end_time = time.time()
    logger.info("Infer Dataset Done, using time {} second.".format(
        infer_end_time - infer_start_time))
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    # GPUBox setup: bind the sparse feature slot ids to the PSGPU
    # instance and initialize it on the selected GPU cards
    gpus_env = os.getenv("FLAGS_selected_gpus")
    self.PSGPU = paddle.fluid.core.PSGPU()
    gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)]
    print("gpuslot: {}".format(gpuslot))
    self.PSGPU.set_slot_vector(gpuslot)
    self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
    opt_info = paddle.fluid.default_main_program()._fleet_opt
    opt_info['stat_var_names'] = []

    for epoch in range(epochs):
        epoch_start_time = time.time()

        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif sync_mode == "gpubox":
            self.dataset_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and \
                is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    for epoch in range(epochs):
        epoch_start_time = time.time()

        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and \
                is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
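# A hypothetical runner configuration for run_worker above, written as a
# Python dict only for illustration: the key names come from the
# config.get(...) calls in the two run_worker variants, while the values
# are made-up examples rather than defaults from any real config file.
example_runner_config = {
    "runner.use_gpu": 0,                        # 0 -> CPUPlace, 1 -> CUDAPlace(0)
    "runner.model_save_path": "output_model",   # per-epoch inference models go here
    "runner.reader_type": "QueueDataset",       # or "DataLoader" / "RecDataset" / None
    "runner.epochs": 2,
    "runner.sync_mode": "heter",                # "heter" selects heter_train_loop
}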
def main(args): paddle.set_device("cpu") paddle.enable_static() fleet.init() fake_num_nodes = 1 py_reader, loss = StaticSkipGramModel( fake_num_nodes, args.neg_num, args.embed_size, sparse_embedding=True, shared_embedding=args.shared_embedding) optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True) dist_strategy = fleet.DistributedStrategy() dist_strategy.a_sync = True optimizer = fleet.distributed_optimizer(optimizer, dist_strategy) optimizer.minimize(loss) # init and run server or worker if fleet.is_server(): fleet.init_server() fleet.run_server() if fleet.is_worker(): place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(paddle.static.default_startup_program()) fleet.init_worker() graph = build_graph(args) # bind gen train_ds = ShardedDataset(graph.nodes, args.epoch) collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size, args.neg_num, args.neg_sample_type) data_loader = Dataloader(train_ds, batch_size=args.cpu_batch_size, shuffle=True, num_workers=args.sample_workers, collate_fn=collate_fn) py_reader.set_batch_generator(lambda: data_loader) train_loss = train(exe, paddle.static.default_main_program(), py_reader, loss) fleet.stop_worker() if fleet.is_first_worker(): fleet.save_persistables(exe, "./model", paddle.static.default_main_program())
def main(args):
    paddle.enable_static()
    paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)
    fleet.init(is_collective=True)

    graph = load(args.dataset)

    loss = StaticSkipGramModel(
        graph.num_nodes,
        args.neg_num,
        args.embed_size,
        num_emb_part=args.num_emb_part,
        shared_embedding=args.shared_embedding)

    optimizer = F.optimizer.Adam(args.learning_rate)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.sharding = True
    dist_strategy.sharding_configs = {
        "segment_anchors": None,
        "sharding_segment_strategy": "segment_broadcast_MB",
        "segment_broadcast_MB": 32,
        "sharding_degree": int(paddle.distributed.get_world_size()),
    }
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    # bind gen
    train_ds = ShardedDataset(graph.nodes)
    collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                               args.neg_num, args.neg_sample_type)
    data_loader = Dataloader(
        train_ds,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.sample_workers,
        collate_fn=collate_fn)

    for epoch in range(args.epoch):
        train_loss = train(exe, paddle.static.default_main_program(),
                           data_loader, loss)
        log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)

    fleet.stop_worker()
    if fleet.is_first_worker():
        fleet.save_persistables(exe, "./model",
                                paddle.static.default_main_program())
def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps):
    step = 0
    node2vec_pyreader.start()
    while True:
        try:
            begin_time = time.time()
            loss_val, = exe.run(program, fetch_list=[loss])
            log.info("step %s: loss %.5f speed: %.5f s/step" %
                     (step, np.mean(loss_val), time.time() - begin_time))
            step += 1
        except F.core.EOFException:
            node2vec_pyreader.reset()

        if step % args.steps_per_save == 0 or step == train_steps:
            if fleet.is_first_worker():
                model_save_dir = args.save_path
                model_path = os.path.join(model_save_dir, str(step))
                if not os.path.exists(model_save_dir):
                    os.makedirs(model_save_dir)
                fleet.save_persistables(exe, model_path)

        if step == train_steps:
            break
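# Hypothetical driver for train_prog above, a sketch under assumptions:
# `loss` and `node2vec_pyreader` are built elsewhere (as in the main()
# functions in this file), and `args.train_steps` is an assumed field,
# not something train_prog reads itself.
def run_training(exe, loss, node2vec_pyreader, args):
    # run the startup program once, then join the fleet as a worker
    exe.run(paddle.static.default_startup_program())
    fleet.init_worker()
    # train until train_steps, checkpointing every args.steps_per_save steps
    train_prog(exe, paddle.static.default_main_program(), loss,
               node2vec_pyreader, args, args.train_steps)
    fleet.stop_worker()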
def main(args):
    paddle.set_device("cpu")
    paddle.enable_static()
    fleet.init()

    if args.num_nodes is None:
        num_nodes = load(args.dataset).num_nodes
    else:
        num_nodes = args.num_nodes

    loss = StaticSkipGramModel(
        num_nodes, args.neg_num, args.embed_size, sparse=True)

    optimizer = F.optimizer.Adam(args.learning_rate, lazy_mode=True)
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.a_sync = True
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
    optimizer.minimize(loss)

    # init and run server or worker
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    if fleet.is_worker():
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
        exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        graph = load(args.dataset)
        # bind gen
        train_ds = ShardedDataset(graph.nodes)
        collate_fn = BatchRandWalk(graph, args.walk_len, args.win_size,
                                   args.neg_num, args.neg_sample_type)
        data_loader = Dataloader(
            train_ds,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.sample_workers,
            collate_fn=collate_fn)

        # use a multi-threaded compiled program when more than one CPU core
        # is available via the CPU_NUM environment variable
        cpu_num = int(os.environ.get('CPU_NUM', 1))
        if cpu_num > 1:
            parallel_places = [paddle.CPUPlace()] * cpu_num
            exec_strategy = paddle.static.ExecutionStrategy()
            exec_strategy.num_threads = cpu_num
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.reduce_strategy = \
                paddle.static.BuildStrategy.ReduceStrategy.Reduce
            compiled_prog = paddle.static.CompiledProgram(
                paddle.static.default_main_program()).with_data_parallel(
                    loss_name=loss.name,
                    places=parallel_places,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)
        else:
            compiled_prog = paddle.static.default_main_program()

        for epoch in range(args.epoch):
            train_loss = train(exe, compiled_prog, data_loader, loss)
            log.info("Running epoch:%s\t train_loss:%.6f", epoch, train_loss)

        fleet.stop_worker()
        if fleet.is_first_worker():
            fleet.save_persistables(exe, "./model",
                                    paddle.static.default_main_program())
def test_is_first_worker():
    """test_is_first_worker"""
    assert fleet.is_first_worker() == True
    print("{} ... ok".format(sys._getframe().f_code.co_name))
def test_is_first_worker(self):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    if fleet.is_first_worker():
        print("test fleet first worker done.")
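# A minimal, self-contained sketch (not code from this repo) of the
# first-worker gating pattern used throughout the trainers above: only
# worker 0 writes the checkpoint, and every worker then waits at a barrier
# so none of them starts the next pass before the save finishes.
import paddle
import paddle.distributed.fleet as fleet


def save_if_first_worker(exe, save_dir):
    # fleet.is_first_worker() returns True only on worker 0 of the job
    if fleet.is_first_worker():
        fleet.save_persistables(exe, save_dir,
                                paddle.static.default_main_program())
    # all workers synchronize here before continuing
    fleet.barrier_worker()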
def run_online_worker(self):
    logger.info("Run Online Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    self.online_intervals = get_online_pass_interval(
        self.split_interval, self.split_per_pass, False)
    if is_local(self.save_model_path) and self.save_model_path and (
            not os.path.exists(self.save_model_path)):
        os.makedirs(self.save_model_path)

    last_day, last_pass, last_path, xbox_base_key = get_last_save_model(
        self.save_model_path, self.hadoop_client)
    logger.info(
        "get_last_save_model last_day = {}, last_pass = {}, last_path = {}, xbox_base_key = {}".
        format(last_day, last_pass, last_path, xbox_base_key))
    if last_day != -1 and fleet.is_first_worker():
        load_model(last_path, 0, self.hadoop_client)
    fleet.barrier_worker()

    day = self.start_day
    infer_first = True
    while int(day) <= int(self.end_day):
        logger.info("training a new day {}, end_day = {}".format(
            day, self.end_day))
        if last_day != -1 and int(day) < last_day:
            day = get_next_day(day)
            continue
        # base_model_saved = False
        for pass_id in range(1, 1 + len(self.online_intervals)):
            print(last_day, day, last_pass, pass_id)
            if (last_day != -1 and int(day) == last_day) and (
                    last_pass != -1 and int(pass_id) <= last_pass):
                continue

            if self.save_first_base and fleet.is_first_worker():
                self.save_first_base = False
                last_base_day, last_base_path, tmp_xbox_base_key = \
                    get_last_save_xbox_base(self.save_model_path,
                                            self.hadoop_client)
                logger.info(
                    "get_last_save_xbox_base, last_base_day = {}, last_base_path = {}, tmp_xbox_base_key = {}".
                    format(last_base_day, last_base_path, tmp_xbox_base_key))
                if int(day) > last_base_day:
                    xbox_base_key = int(time.time())
                    save_xbox_model(self.save_model_path, day, -1, self.exe,
                                    self.inference_feed_vars,
                                    self.inference_target_var,
                                    self.hadoop_client)
                    write_xbox_donefile(
                        output_path=self.save_model_path,
                        day=day,
                        pass_id=-1,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client)
                elif int(day) == last_base_day:
                    xbox_base_key = tmp_xbox_base_key
            fleet.barrier_worker()

            logger.info("training a new day = {} new pass = {}".format(
                day, pass_id))
            logger.info("Day:{}, Pass: {}, Prepare Dataset Begin.".format(
                day, pass_id))
            begin_train = time.time()
            begin = time.time()
            dataset = self.wait_and_prepare_dataset(day, pass_id)
            end = time.time()
            read_data_cost = (end - begin) / 60.0
            logger.info("Prepare Dataset Done, using time {} mins.".format(
                read_data_cost))

            infer_cost = 0
            infer_metric_cost = 0
            if infer_first:
                infer_first = False
            else:
                logger.info("Day:{}, Pass: {}, Infering Dataset Begin.".
                            format(day, pass_id))
                begin = time.time()
                self.dataset_infer_loop(dataset, day, pass_id)
                end = time.time()
                infer_cost = (end - begin) / 60.0
                logger.info("Infering Dataset Done, using time {} mins.".
                            format(infer_cost))
                begin = time.time()
                metric_str = get_global_metrics_str(fluid.global_scope(),
                                                    self.metric_list, "")
                logger.info("Day:{}, Pass: {}, Infer Global Metric: {}".
                            format(day, pass_id, metric_str))
                clear_metrics(fluid.global_scope(), self.metric_list,
                              self.metric_types)
                end = time.time()
                infer_metric_cost = (end - begin) / 60.0

            logger.info("Day:{}, Pass: {}, Training Dataset Begin.".format(
                day, pass_id))
            begin = time.time()
            self.dataset_train_loop(dataset, day, pass_id,
                                    self.need_train_dump)
            end = time.time()
            avg_cost = get_avg_cost_mins(end - begin)
            get_max_cost_mins(end - begin)
            get_min_cost_mins(end - begin)
            train_cost = avg_cost
            logger.info("Training Dataset Done, using time {} mins.".format(
                train_cost))

            begin = time.time()
            dataset.release_memory()
            end = time.time()
            release_cost = (end - begin) / 60.0

            begin = time.time()
            metric_str = get_global_metrics_str(fluid.global_scope(),
                                                self.metric_list, "")
            logger.info("Day:{}, Pass: {}, Train Global Metric: {}".format(
                day, pass_id, metric_str))
            clear_metrics(fluid.global_scope(), self.metric_list,
                          self.metric_types)
            end = time.time()
            metric_cost = (end - begin) / 60
            end_train = time.time()
            total_cost = (end_train - begin_train) / 60
            other_cost = total_cost - read_data_cost - train_cost - \
                release_cost - metric_cost - infer_cost - infer_metric_cost
            log_str = "finished train epoch %d time cost:%s min job time cost" \
                      ":[read_data:%s min][train: %s min][metric: %s min][release: %s min]" \
                      "[infer:%s min][infer_metric: %s min][other:%s min]" \
                      % (pass_id, total_cost, read_data_cost, train_cost,
                         metric_cost, release_cost, infer_cost,
                         infer_metric_cost, other_cost)
            logger.info(log_str)

            if self.need_infer_dump:
                prepare_data_start_time = time.time()
                dump_dataset = self.wait_and_prepare_infer_dataset(day,
                                                                   pass_id)
                prepare_data_end_time = time.time()
                logger.info(
                    "Prepare Infer Dump Dataset Done, using time {} second.".
                    format(prepare_data_end_time - prepare_data_start_time))
                dump_start_time = time.time()
                self.dataset_infer_loop(dump_dataset, day, pass_id, True)
                dump_end_time = time.time()
                logger.info(
                    "Infer Dump Dataset Done, using time {} second.".format(
                        dump_end_time - dump_start_time))
                dump_dataset.release_memory()

            if fleet.is_first_worker():
                if pass_id % self.checkpoint_per_pass == 0:
                    save_model(self.exe, self.save_model_path, day, pass_id)
                    write_model_donefile(
                        output_path=self.save_model_path,
                        day=day,
                        pass_id=pass_id,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client)
                if pass_id % self.save_delta_frequency == 0:
                    last_xbox_day, last_xbox_pass, last_xbox_path, _ = \
                        get_last_save_xbox(self.save_model_path,
                                           self.hadoop_client)
                    if int(day) < last_xbox_day or int(
                            day) == last_xbox_day and int(
                                pass_id) <= last_xbox_pass:
                        log_str = "delta model exists"
                        logger.info(log_str)
                    else:
                        save_xbox_model(self.save_model_path, day, pass_id,
                                        self.exe, self.inference_feed_vars,
                                        self.inference_target_var,
                                        self.hadoop_client)  # 1 delta
                        write_xbox_donefile(
                            output_path=self.save_model_path,
                            day=day,
                            pass_id=pass_id,
                            xbox_base_key=xbox_base_key,
                            client=self.hadoop_client,
                            hadoop_fs_name=self.hadoop_fs_name,
                            monitor_data=metric_str)
            fleet.barrier_worker()

        logger.info("shrink table")
        begin = time.time()
        fleet.shrink()
        end = time.time()
        logger.info("shrink table done, cost %s min" % ((end - begin) / 60.0))

        if fleet.is_first_worker():
            last_base_day, last_base_path, last_base_key = \
                get_last_save_xbox_base(self.save_model_path,
                                        self.hadoop_client)
            logger.info(
                "one epoch finishes, get_last_save_xbox, last_base_day = {}, last_base_path = {}, last_base_key = {}".
                format(last_base_day, last_base_path, last_base_key))
            next_day = get_next_day(day)
            if int(next_day) <= last_base_day:
                logger.info("batch model/base xbox model exists")
            else:
                xbox_base_key = int(time.time())
                save_xbox_model(self.save_model_path, next_day, -1, self.exe,
                                self.inference_feed_vars,
                                self.inference_target_var,
                                self.hadoop_client)
                write_xbox_donefile(
                    output_path=self.save_model_path,
                    day=next_day,
                    pass_id=-1,
                    xbox_base_key=xbox_base_key,
                    client=self.hadoop_client,
                    hadoop_fs_name=self.hadoop_fs_name,
                    monitor_data=metric_str)
                save_batch_model(self.exe, self.save_model_path, next_day)
                write_model_donefile(
                    output_path=self.save_model_path,
                    day=next_day,
                    pass_id=-1,
                    xbox_base_key=xbox_base_key,
                    client=self.hadoop_client)
        fleet.barrier_worker()
        day = get_next_day(day)