Example #1
    def startup(self, context):
        """Startup stage: run the server, or load / save base models on the worker."""
        if fleet.is_server():
            fleet.run_server()
            context['status'] = 'wait'
            return
        stdout_str = ""
        self._train_pass = util.TimeTrainPass(self.global_config)
        if not self.global_config['cold_start']:
            cost_printer = util.CostPrinter(
                util.print_cost, {
                    'master': True,
                    'log_format': 'load model cost %s sec',
                    'stdout': stdout_str
                })
            self.print_log(
                "going to load model %s" %
                self._train_pass._checkpoint_model_path, {'master': True})
            # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= self._train_pass.date()
            #    and config.reqi_dnn_plugin_pass >= self._pass_id:
            #    fleet.load_one_table(0, self._train_pass._checkpoint_model_path)
            # else:
            fleet.init_server(self._train_pass._checkpoint_model_path, mode=0)
            cost_printer.done()
        if self.global_config['save_first_base']:
            self.print_log("save_first_base=True", {'master': True})
            self.print_log("going to save xbox base model", {
                'master': True,
                'stdout': stdout_str
            })
            self._train_pass._base_key = int(time.time())
            stdout_str += self.save_xbox_model(self._train_pass.date(), 0,
                                               self._train_pass._base_key, "")
        context['status'] = 'begin_day'

    def processor_register(self):
        role = MPISymetricRoleMaker()
        fleet.init(role)

        if fleet.is_server():
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)
            self.regist_context_processor('server_pass', self.server)
        else:
            self.regist_context_processor('uninit', self.instance)
            self.regist_context_processor('init_pass', self.init)
            self.regist_context_processor('train_pass', self.train)
            self.regist_context_processor('terminal_pass', self.terminal)
    def init(self, context):
        """R
        """
        self.model.train_net()
        optimizer = self.model.optimizer()

        optimizer = fleet.distributed_optimizer(optimizer,
                                                strategy={"use_cvm": False})
        optimizer.minimize(self.model.get_avg_cost())

        if fleet.is_server():
            context['status'] = 'server_pass'
        else:
            self.fetch_vars = []
            self.fetch_alias = []
            self.fetch_period = self.model.get_fetch_period()

            metrics = self.model.get_metrics()
            if metrics:
                self.fetch_vars = list(metrics.values())
                self.fetch_alias = list(metrics.keys())
            context['status'] = 'train_pass'
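
All of these examples share one launch pattern: every process calls fleet.init() with a role maker, then branches on fleet.is_server() to either serve parameters or train. A minimal sketch of that skeleton using only the fleet calls seen above; the import paths assume Paddle 1.x pslib mode:

from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker

fleet.init(MPISymetricRoleMaker())   # one process per MPI rank

if fleet.is_server():
    fleet.init_server()              # optionally pass a checkpoint path, as in startup() above
    fleet.run_server()               # blocks and serves parameters
else:
    fleet.init_worker()              # connect the trainer to the servers
    # ... build the net, run training passes ...
    fleet.stop_worker()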
Example #4
        "join_5.w_0", "join_5.b_0", "join_6.w_0", "join_6.b_0", "join_7.w_0",
        "join_7.b_0"
    ]
    common_save_params = [
        "common.batch_size", "common.batch_sum", "common.batch_square_sum",
        "common_0.w_0", "common_0.b_0", "common_1.w_0", "common_1.b_0",
        "common_2.w_0", "common_2.b_0", "common_3.w_0", "common_3.b_0",
        "common_4.w_0", "common_4.b_0", "common_5.w_0", "common_5.b_0",
        "common_6.w_0", "common_6.b_0", "common_7.w_0", "common_7.b_0"
    ]
    update_save_params = [
        "fc_0.w_0", "fc_0.b_0", "fc_1.w_0", "fc_1.b_0", "fc_2.w_0", "fc_2.b_0",
        "fc_3.w_0", "fc_3.b_0", "fc_4.w_0", "fc_4.b_0", "fc_5.w_0", "fc_5.b_0"
    ]

    if fleet.is_server():
        fleet.run_server()
    elif fleet.is_worker():
        with fluid.scope_guard(scope3):
            exe.run(update_model._startup_program)
        with fluid.scope_guard(scope2):
            exe.run(join_common_model._startup_program)

        configs = {
            "fs.default.name": config.fs_name,
            "hadoop.job.ugi": config.fs_ugi
        }
        hdfs_client = HDFSClient("$HADOOP_HOME", configs)

        save_first_base = config.save_first_base
        path = config.train_data_path
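
Example #4 reaches HDFS through HDFSClient, built from the cluster address (fs.default.name) and credentials (hadoop.job.ugi). A short sketch of how such a client is typically used to stage a model directory; the paths are hypothetical and the method names (is_exist, makedirs, upload) follow Paddle 1.x's fleet HDFS utility, so treat them as assumptions:

from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient

configs = {
    "fs.default.name": "hdfs://nameservice",  # hypothetical cluster address
    "hadoop.job.ugi": "user,passwd",          # hypothetical credentials
}
hdfs_client = HDFSClient("$HADOOP_HOME", configs)

if not hdfs_client.is_exist("/checkpoints"):      # probe the remote directory
    hdfs_client.makedirs("/checkpoints")
hdfs_client.upload("/checkpoints/model_0",        # remote target
                   "./local_model_0")             # local source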
Example #5
    def init(self, context):
        """R
        """
        role_maker = None
        if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
            afs_config = self.global_config['io']['afs']
            role_maker = GeneralRoleMaker(
                hdfs_name=afs_config['fs_name'],
                hdfs_ugi=afs_config['fs_ugi'],
                path=self.global_config['output_path'] + "/gloo",
                init_timeout_seconds=1200,
                run_timeout_seconds=1200)
        fleet.init(role_maker)
        data_var_list = []
        data_var_name_dict = {}
        runnable_scope = []
        runnable_cost_op = []
        context['status'] = 'startup'

        for executor in self.global_config['executor']:
            scope = fluid.Scope()
            self._exector_context[executor['name']] = {}
            self._exector_context[executor['name']]['scope'] = scope
            self._exector_context[
                executor['name']]['model'] = model_basic.create(executor)
            model = self._exector_context[executor['name']]['model']
            self._metrics.update(model.get_metrics())
            runnable_scope.append(scope)
            runnable_cost_op.append(model.get_cost_op())
            for var in model._data_var:
                if var.name in data_var_name_dict:
                    continue
                data_var_list.append(var)
                data_var_name_dict[var.name] = var

        optimizer = model_basic.YamlModel.build_optimizer({
            'metrics': self._metrics,
            'optimizer_conf': self.global_config['optimizer']
        })
        optimizer.minimize(runnable_cost_op, runnable_scope)
        for executor in self.global_config['executor']:
            scope = self._exector_context[executor['name']]['scope']
            model = self._exector_context[executor['name']]['model']
            program = model._build_param['model']['train_program']
            if not executor['is_update_sparse']:
                prog_id = str(id(model.get_cost_op().block.program))
                program._fleet_opt["program_configs"][prog_id]["push_sparse"] = []
            if 'train_thread_num' not in executor:
                executor['train_thread_num'] = self.global_config[
                    'train_thread_num']
            with fluid.scope_guard(scope):
                self._exe.run(model._build_param['model']['startup_program'])
            model.dump_model_program('./')

        # server init done
        if fleet.is_server():
            return 0

        self._dataset = {}
        for dataset_item in self.global_config['dataset']['data_list']:
            dataset_item['data_vars'] = data_var_list
            dataset_item.update(self.global_config['io']['afs'])
            dataset_item["batch_size"] = self.global_config['batch_size']
            self._dataset[dataset_item[
                'name']] = dataset.FluidTimeSplitDataset(dataset_item)
        # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
        #    util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
        fleet.init_worker()
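    # The init() above isolates each executor's parameters by running its
    # startup program inside its own fluid.Scope. A minimal sketch of that
    # pattern (program names hypothetical):
    #
    #     scope_a, scope_b = fluid.Scope(), fluid.Scope()
    #     with fluid.scope_guard(scope_a):
    #         exe.run(startup_program_a)   # parameters created in scope_a
    #     with fluid.scope_guard(scope_b):
    #         exe.run(startup_program_b)   # an independent second parameter set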
    def do_training(self, args=None):
        """do training"""
        avg_cost = self.net()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        fleet.init()
        # optimizer
        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
        # wrap with fleet.distributed_optimizer to apply the distributed
        # strategy for multi-node optimization
        optimizer = fleet.distributed_optimizer(
            optimizer,
            strategy={"fleet_desc_file": "./thirdparty/pslib/fleet_desc.prototxt"})
        optimizer.minimize(avg_cost)
        train_info = []
        # start the parameter server
        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # start the worker
        if fleet.is_worker():
            train_data_path = 'thirdparty/data/dist_data/pslib/train_data'
            train_data_files = []
            for filename in os.listdir(train_data_path):
                train_data_files.append(os.path.join(train_data_path, filename))
            # fleet dataset
            label = fluid.layers.data(name="click", shape=[-1, 1], dtype="int64", lod_level=1, append_batch_size=False)
            data = fluid.layers.data(name="1", shape=[1], dtype="int64", lod_level=1)
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
            dataset.set_use_var([label, data])
            dataset.set_pipe_command("./python/bin/python ./thirdparty/pslib/dataset_generator.py")
            dataset.set_batch_size(32)
            dataset.set_thread(3)
            dataset.set_filelist(train_data_files)
            # load the data into memory
            dataset.load_into_memory()
            # shuffle locally
            dataset.local_shuffle()
            # initialize the worker
            fleet.init_worker()
            exe.run(fluid.default_startup_program())
            PASS_NUM = 1
            for pass_id in range(PASS_NUM):
                var_dict = {"loss": avg_cost}
                global var_dict

                class FetchVars(fluid.executor.FetchHandler):
                    def __init__(self, var_dict=None, period_secs=2):
                        super(FetchVars, self).__init__(var_dict, period_secs=2)

                    def handler(self, res_dict):
                        train_info.extend(res_dict["loss"])
                        print(train_info)

                exe.train_from_dataset(
                    program=fluid.default_main_program(),
                    dataset=dataset,
                    fetch_handler=FetchVars(var_dict))
            dataset.release_memory()
            fleet.shrink_sparse_table()
            fleet.shrink_dense_table(0.01, 11)
            fleet.print_table_stat(0)
            fleet.clear_one_table(0)
            fleet.clear_model()
        fleet.stop_worker()
        return train_info
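
The pipe command above runs ./thirdparty/pslib/dataset_generator.py, which is not shown on this page. Such a script is typically a MultiSlotDataGenerator that parses one input line per sample and emits the slots registered via set_use_var ("click" and "1" here). A minimal sketch, assuming a whitespace-separated "label feature-ids..." line format:

import paddle.fluid.incubate.data_generator as dg

class SampleGenerator(dg.MultiSlotDataGenerator):
    def generate_sample(self, line):
        def reader():
            fields = line.strip().split()
            label = [int(fields[0])]                 # slot "click"
            feasigns = [int(x) for x in fields[1:]]  # slot "1"
            yield [("click", label), ("1", feasigns)]
        return reader

if __name__ == "__main__":
    # the dataset pipes each file through this process via stdin
    SampleGenerator().run_from_stdin()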