Beispiel #1
0
def save_xbox_model(output_path, day, pass_id, exe, feed_vars, target_vars,
                    client):
    """
    save xbox model

    Args:
        output_path(str): output path
        day(str|int): training day
        pass_id(str|int): training pass id
        exe(Executor)
        feed_vars(list): feed var list for inference model
        target_vars: target var list for inference model
        client(HDFSClient): hadoop client
    """
    if pass_id != -1:
        mode = 1
        suffix_name = "/%s/delta-%s/" % (day, pass_id)
        model_path = output_path.rstrip("/") + suffix_name
    else:
        mode = 2
        suffix_name = "/%s/base/" % day
        model_path = output_path.rstrip("/") + suffix_name
    fleet.save_inference_model(
        exe,
        model_path, [feed.name for feed in feed_vars],
        target_vars,
        mode=mode)
    if not is_local(model_path):
        client.upload("./dnn_plugin", model_path)
    def run_worker(self):
        logger.info("Run Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and (not os.path.exists(save_model_path)):
            os.makedirs(save_model_path)

        reader_type = self.config.get("runner.reader_type", None)
        epochs = int(self.config.get("runner.epochs"))
        sync_mode = self.config.get("runner.sync_mode")

        gpus_env = os.getenv("FLAGS_selected_gpus")
        self.PSGPU = paddle.fluid.core.PSGPU()
        gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)]
        print("gpuslot: {}".format(gpuslot))
        self.PSGPU.set_slot_vector(gpuslot)
        self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
        opt_info = paddle.fluid.default_main_program()._fleet_opt
        opt_info['stat_var_names'] = []
        for epoch in range(epochs):
            epoch_start_time = time.time()

            if sync_mode == "heter":
                self.heter_train_loop(epoch)
            elif sync_mode == "gpubox":
                self.dataset_train_loop(epoch)
            elif reader_type == "QueueDataset":
                self.dataset_train_loop(epoch)
            elif reader_type == "DataLoader":
                self.dataloader_train_loop(epoch)
            elif reader_type == None or reader_type == "RecDataset":
                self.recdataset_train_loop(epoch)

            epoch_time = time.time() - epoch_start_time
            epoch_speed = self.example_nums / epoch_time
            logger.info(
                "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                    epoch, epoch_time, epoch_speed, self.count_method))
            self.train_result_dict["speed"].append(epoch_speed)

            model_dir = "{}/{}".format(save_model_path, epoch)
            if fleet.is_first_worker(
            ) and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var)
Beispiel #3
0
    def run_online_worker(self):
        logger.info("Run Online Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and (not os.path.exists(save_model_path)):
            os.makedirs(save_model_path)

        days = os.popen("echo -n " + self.config.get("runner.days")).read().split(" ")
        pass_per_day = int(self.config.get("runner.pass_per_day"))

        for day_index in range(len(days)):
            day = days[day_index]
            for pass_index in range(1, pass_per_day + 1):
                logger.info("Day: {} Pass: {} Begin.".format(day, pass_index))
                
                prepare_data_start_time = time.time()
                dataset = self.wait_and_prepare_dataset(day, pass_index)
                prepare_data_end_time = time.time()
                logger.info(
                    "Prepare Dataset Done, using time {} second.".format(prepare_data_end_time - prepare_data_start_time))
                
                train_start_time = time.time()
                self.dataset_train_loop(dataset, day, pass_index)
                train_end_time = time.time()
                logger.info(
                    "Train Dataset Done, using time {} second.".format(train_end_time - train_start_time))
            
                model_dir = "{}/{}/{}".format(save_model_path, day, pass_index)

                if fleet.is_first_worker() and save_model_path and is_distributed_env():
                    fleet.save_inference_model(
                        self.exe, model_dir,
                        [feed.name for feed in self.input_data],
                        self.inference_target_var,
                        mode=2)

            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var,
                    mode=0)
Beispiel #4
0
    def do_dataset_training(self, fleet):
        train_file_list = ctr_dataset_reader.prepare_fake_data()

        exe = self.get_executor()
        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        thread_num = 2
        batch_size = 128
        filelist = train_file_list

        # config dataset
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_use_var(self.feeds)
        dataset.set_batch_size(128)
        dataset.set_thread(2)
        dataset.set_filelist(filelist)
        dataset.set_pipe_command('python ctr_dataset_reader.py')
        dataset.load_into_memory()

        dataset.global_shuffle(fleet, 12)  ##TODO: thread configure
        shuffle_data_size = dataset.get_shuffle_data_size(fleet)
        local_data_size = dataset.get_shuffle_data_size()
        data_size_list = fleet.util.all_gather(local_data_size)
        print('after global_shuffle data_size_list: ', data_size_list)
        print('after global_shuffle data_size: ', shuffle_data_size)

        for epoch_id in range(1):
            pass_start = time.time()
            exe.train_from_dataset(program=fluid.default_main_program(),
                                   dataset=dataset,
                                   fetch_list=[self.avg_cost],
                                   fetch_info=["cost"],
                                   print_period=2,
                                   debug=int(os.getenv("Debug", "0")))
            pass_time = time.time() - pass_start
        dataset.release_memory()

        if os.getenv("SAVE_MODEL") == "1":
            model_dir = tempfile.mkdtemp()
            fleet.save_inference_model(exe, model_dir,
                                       [feed.name for feed in self.feeds],
                                       self.avg_cost)
            self.check_model_right(model_dir)
            shutil.rmtree(model_dir)

        dirname = os.getenv("SAVE_DIRNAME", None)
        if dirname:
            fleet.save_persistables(exe, dirname=dirname)

        cache_dirname = os.getenv("SAVE_CACHE_DIRNAME", None)
        if cache_dirname:
            fleet.save_cache_model(cache_dirname)
Beispiel #5
0
    def run_worker(self):
        logger.info("Run Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open(
                "./{}_worker_main_program.prototxt".format(
                    fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open(
                "./{}_worker_startup_program.prototxt".format(
                    fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and not os.path.exists(save_model_path):
            os.makedirs(save_model_path)

        reader_type = self.config.get("runner.reader_type", None)
        epochs = int(self.config.get("runner.epochs"))
        sync_mode = self.config.get("runner.sync_mode")

        for epoch in range(epochs):
            epoch_start_time = time.time()

            if sync_mode == "heter":
                self.heter_train_loop(epoch)
            elif reader_type == "QueueDataset":
                self.dataset_train_loop(epoch)
            elif reader_type == "DataLoader":
                self.dataloader_train_loop(epoch)
            elif reader_type == None or reader_type == "RecDataset":
                self.recdataset_train_loop(epoch)

            epoch_time = time.time() - epoch_start_time
            epoch_speed = self.example_nums / epoch_time
            logger.info(
                "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                    epoch, epoch_time, epoch_speed, self.count_method))
            self.train_result_dict["speed"].append(epoch_speed)

            model_dir = "{}/{}".format(save_model_path, epoch)
            if fleet.is_first_worker(
            ) and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name
                     for feed in self.input_data], self.inference_target_var)
    def do_pyreader_training(self, fleet):
        """
        do training using dataset, using fetch handler to catch variable
        Args:
            fleet(Fleet api): the fleet object of Parameter Server, define distribute training role
        """
        exe = self.get_executor()
        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        batch_size = 4
        train_reader = paddle.batch(fake_ctr_reader(), batch_size=batch_size)
        self.reader.decorate_sample_list_generator(train_reader)

        for epoch_id in range(1):
            self.reader.start()
            try:
                pass_start = time.time()
                while True:
                    loss_val = exe.run(program=fluid.default_main_program(),
                                       fetch_list=[self.avg_cost.name])
                    loss_val = np.mean(loss_val)
                    # TODO(randomly fail)
                    #   reduce_output = fleet.util.all_reduce(
                    #       np.array(loss_val), mode="sum")
                    #   loss_all_trainer = fleet.util.all_gather(float(loss_val))
                    #   loss_val = float(reduce_output) / len(loss_all_trainer)
                    message = "TRAIN ---> pass: {} loss: {}\n".format(
                        epoch_id, loss_val)
                    fleet.util.print_on_rank(message, 0)

                pass_time = time.time() - pass_start
            except fluid.core.EOFException:
                self.reader.reset()

        dirname = os.getenv("SAVE_DIRNAME", None)
        if dirname:
            fleet.save_persistables(exe, dirname=dirname)

        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(exe, model_dir,
                                   [feed.name for feed in self.feeds],
                                   self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)
    def do_dataset_training_queuedataset(self, fleet):
        train_file_list = ctr_dataset_reader.prepare_fake_data()

        exe = self.get_executor()
        exe.run(fluid.default_startup_program())
        fleet.init_worker()

        thread_num = 2
        batch_size = 128
        filelist = train_file_list

        # config dataset
        dataset = paddle.distributed.QueueDataset()
        pipe_command = 'python ctr_dataset_reader.py'

        dataset.init(batch_size=batch_size,
                     use_var=self.feeds,
                     pipe_command=pipe_command,
                     thread_num=thread_num)

        dataset.set_filelist(filelist)

        for epoch_id in range(1):
            pass_start = time.time()
            dataset.set_filelist(filelist)
            exe.train_from_dataset(program=fluid.default_main_program(),
                                   dataset=dataset,
                                   fetch_list=[self.avg_cost],
                                   fetch_info=["cost"],
                                   print_period=2,
                                   debug=int(os.getenv("Debug", "0")))
            pass_time = time.time() - pass_start

        if os.getenv("SAVE_MODEL") == "1":
            model_dir = tempfile.mkdtemp()
            fleet.save_inference_model(exe, model_dir,
                                       [feed.name for feed in self.feeds],
                                       self.avg_cost)
            self.check_model_right(model_dir)
            shutil.rmtree(model_dir)

        dirname = os.getenv("SAVE_DIRNAME", None)
        if dirname:
            fleet.save_persistables(exe, dirname=dirname)