Example #1
0
    def end_day(self, context):
        """Finish the current training day and hand over to the next one.

        Sets ``context['status']`` to ``'begin_day'``, shrinks the sparse
        table and every executor's model, saves an xbox base model and a batch
        model under the next day's date, stores the freshly generated base key
        on the train pass, and finally waits on the worker barrier.

        Args:
            context: mutable mapping; its ``'status'`` entry is overwritten.
        """
        # Neither value is used below; the reads are kept so this method
        # touches the train-pass object exactly as it always has.
        _today = self._train_pass.date()
        _current_pass = self._train_pass._pass_id
        # The base key is the wall-clock second at which the day is closed.
        new_base_key = int(time.time())
        context['status'] = 'begin_day'

        util.rank0_print("shrink table")
        shrink_timer = util.CostPrinter(
            util.print_cost,
            {'master': True, 'log_format': 'shrink table done, cost %s sec'},
        )
        fleet.shrink_sparse_table()
        for executor_name in self._exector_context:
            entry = self._exector_context[executor_name]
            shrink_params = {
                'scope': entry['scope'],
                'decay': self.global_config['optimizer']['dense_decay_rate'],
            } if False else None
            # Build the argument inline so lookups happen in the same order as
            # the call: model first, then scope, then the decay rate.
            entry['model'].shrink({
                'scope': entry['scope'],
                'decay': self.global_config['optimizer']['dense_decay_rate'],
            })
        shrink_timer.done()

        # Both models are filed under tomorrow's date with pass index 0.
        tomorrow = self._train_pass.date(delta_day=1)
        util.rank0_print("going to save xbox base model")
        self.save_xbox_model(tomorrow, 0, new_base_key, "")
        util.rank0_print("going to save batch model")
        self.save_model(tomorrow, 0, new_base_key)
        self._train_pass._base_key = new_base_key
        fleet._role_maker._barrier_worker()
Example #2
0
 def startup(self, context):
     """Bring this node up as a server or prepare it as a worker.

     On a server node this runs the server, sets ``context['status']`` to
     ``'wait'`` and returns. On any other node it creates the train-pass
     tracker, loads the checkpoint model unless ``cold_start`` is configured,
     saves a first xbox base model when ``save_first_base`` is configured,
     and sets ``context['status']`` to ``'begin_day'``.

     Args:
         context: mutable mapping; its ``'status'`` entry is overwritten.
     """
     if fleet.is_server():
         fleet.run_server()
         context['status'] = 'wait'
         return

     log_buffer = ""
     self._train_pass = util.TimeTrainPass(self.global_config)
     train_pass = self._train_pass

     cold_start = self.global_config['cold_start']
     if not cold_start:
         load_timer = util.CostPrinter(
             util.print_cost,
             {'master': True, 'log_format': 'load model cost %s sec', 'stdout': log_buffer},
         )
         checkpoint_path = train_pass._checkpoint_model_path
         self.print_log("going to load model %s" % checkpoint_path, {'master': True})
         # Disabled alternative kept from the original author:
         # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= self._train_pass.date()
         #    and config.reqi_dnn_plugin_pass >= self._pass_id:
         #    fleet.load_one_table(0, self._train_pass._checkpoint_model_path)
         # else:
         fleet.init_server(checkpoint_path, mode=0)
         load_timer.done()

     if self.global_config['save_first_base']:
         self.print_log("save_first_base=True", {'master': True})
         self.print_log("going to save xbox base model", {'master': True, 'stdout': log_buffer})
         # The first base key is the wall-clock second at startup.
         train_pass._base_key = int(time.time())
         log_buffer += self.save_xbox_model(train_pass.date(), 0, train_pass._base_key, "")

     context['status'] = 'begin_day'
Example #3
0
 def save_model(self, day, pass_index, base_key):
     """Save the batch (checkpoint) model and record training progress.

     Args:
         day: day value used to build the model path and the progress record.
         pass_index: pass number; values below 1 select save mode 3,
             anything else selects save mode 0.
         base_key: base key written into the progress record.

     Returns:
         The path generated for the ``'batch_model'`` entry.
     """
     timer = util.CostPrinter(
         util.print_cost,
         {'master': True, 'log_format': 'save model cost %s sec'},
     )
     path_args = {'day': day, 'pass_id': pass_index}
     target_path = self._path_generator.generate_path('batch_model', path_args)

     # Per the original author's notes: mode 0 just saves everything, mode 3
     # ("batch_model") bumps unseen_day and saves everything.
     mode = 3 if pass_index < 1 else 0

     util.rank0_print("going to save_model %s" % target_path)
     fleet.save_persistables(None, target_path, mode=mode)

     # Only the first worker writes the progress record.
     if fleet._role_maker.is_first_worker():
         self._train_pass.save_train_progress(day, pass_index, base_key, target_path, is_checkpoint=True)

     timer.done()
     return target_path
Example #4
0
    def train_pass(self, context):
        """Run one training pass: load, shuffle, train, release, checkpoint.

        Steps, in order:
          1. Load every dataset in ``self._dataset`` for the current pass
             window, then wait on the worker barrier.
          2. Globally shuffle each loaded dataset.
          3. When ``prefetch_data`` is configured, start preloading the next
             pass window for every dataset.
          4. Run each configured executor on the dataset named by its
             ``'dataset_name'``, then release the datasets' memory.
          5. Save a checkpoint when the train pass reports this pass as a
             checkpoint pass.
          6. Log and write the timing summary, wait on the barrier, then
             decide what happens next.

        Args:
            context: mutable mapping. ``'status'`` is set to ``'end_day'``
                when this was the last pass of the day; otherwise
                ``'is_exit'`` is set to ``True`` when
                ``self._train_pass.next()`` returns a falsy value.
        """
        stdout_str = ""
        day = self._train_pass.date()
        pass_id = self._train_pass._pass_id
        base_key = self._train_pass._base_key
        pass_time = self._train_pass._current_train_time.strftime("%Y%m%d%H%M")
        self.print_log("    ==== begin delta:%s ========" % pass_id, {'master': True, 'stdout': stdout_str})
        train_begin_time = time.time()

        # --- 1. load the data of this pass into memory -----------------------
        cost_printer = util.CostPrinter(util.print_cost,
                                        {'master': True, 'log_format': 'load into memory done, cost %s sec',
                                         'stdout': stdout_str})
        current_dataset = {}
        for name in self._dataset:
            current_dataset[name] = self._dataset[name].load_dataset({
                'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
                'begin_time': pass_time, 'time_window_min': self._train_pass._interval_per_pass
            })
        fleet._role_maker._barrier_worker()
        cost_printer.done()

        # --- 2. global shuffle -----------------------------------------------
        util.rank0_print("going to global shuffle")
        cost_printer = util.CostPrinter(util.print_cost, {
            'master': True, 'stdout': stdout_str,
            'log_format': 'global shuffle done, cost %s sec'})
        for name in current_dataset:
            current_dataset[name].global_shuffle(fleet, self.global_config['dataset']['shuffle_thread'])
        cost_printer.done()
        # str(dataset.get_shuffle_data_size(fleet))
        fleet._role_maker._barrier_worker()

        # --- 3. optionally prefetch the next pass window ---------------------
        if self.global_config['prefetch_data']:
            next_pass_time = (self._train_pass._current_train_time +
                              datetime.timedelta(minutes=self._train_pass._interval_per_pass)).strftime("%Y%m%d%H%M")
            for name in self._dataset:
                self._dataset[name].preload_dataset({
                    'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
                    'begin_time': next_pass_time, 'time_window_min': self._train_pass._interval_per_pass
                })

        # --- 4. train, then release the loaded data --------------------------
        fleet._role_maker._barrier_worker()
        pure_train_begin = time.time()
        for executor in self.global_config['executor']:
            self.run_executor(executor, current_dataset[executor['dataset_name']], stdout_str)
        cost_printer = util.CostPrinter(util.print_cost,
                                        {'master': True, 'log_format': 'release_memory cost %s sec'})
        for name in current_dataset:
            current_dataset[name].release_memory()
        # Finish the printer here, as every other timed section does, so the
        # reported cost covers only the release loop.
        cost_printer.done()
        # Note: the "pure train" span deliberately keeps including the release
        # step, as it did before.
        pure_train_cost = time.time() - pure_train_begin

        # --- 5. checkpoint ---------------------------------------------------
        if self._train_pass.is_checkpoint_pass(pass_id):
            self.save_model(day, pass_id, base_key)

        # --- 6. timing summary and next-state decision -----------------------
        train_end_time = time.time()
        train_cost = train_end_time - train_begin_time
        other_cost = train_cost - pure_train_cost
        log_str = "finished train day %s pass %s time cost:%s sec job time cost:" % (day, pass_id, train_cost)
        for executor in self._exector_context:
            log_str += '[' + executor + ':' + str(self._exector_context[executor]['cost']) + ']'
        log_str += '[other_cost:' + str(other_cost) + ']'
        util.rank0_print(log_str)
        stdout_str += util.now_time_str() + log_str
        sys.stdout.write(stdout_str)
        fleet._role_maker._barrier_worker()

        if pass_id == self._train_pass.max_pass_num_day():
            context['status'] = 'end_day'
            return
        if not self._train_pass.next():
            context['is_exit'] = True
Example #5
0
    def save_xbox_model(self, day, pass_index, xbox_base_key, monitor_data):
        """Save an xbox base or delta model and append its done-file record.

        ``pass_index < 1`` saves a base model (save mode 2, path keys
        ``'xbox_base'`` / ``'xbox_base_done'``) and uses ``xbox_base_key`` as
        the record id. Any other value saves a delta model (save mode 1, path
        keys ``'xbox_delta'`` / ``'xbox_delta_done'``) and uses the current
        Unix time, as a string, as the record id.

        After the sparse save, the method optionally saves the cache model
        (when ``save_cache_model`` is configured), lets the first worker dump
        and copy the inference layers of every executor that declares
        ``'layer_for_inference'``, and lets the first worker append one JSON
        line to the done file. For delta models the first worker also records
        train progress with ``is_checkpoint=False``.

        Args:
            day: day value used to build the model and done-file paths.
            pass_index: pass number; below 1 means "base model".
            xbox_base_key: base key stored in the done-file record.
            monitor_data: value stored verbatim under ``'monitor_data'``.

        Returns:
            str: always ``""``; ``stdout_str`` is handed to the cost printers
            but is never reassigned in this method.
        """
        stdout_str = ""
        xbox_patch_id = str(int(time.time()))
        util.rank0_print("begin save delta model")

        # --- sparse model ----------------------------------------------------
        cost_printer = util.CostPrinter(util.print_cost, {'master': True,
                                                          'log_format': 'save xbox model cost %s sec',
                                                          'stdout': stdout_str})
        if pass_index < 1:
            save_mode = 2
            xbox_patch_id = xbox_base_key
            model_path = self._path_generator.generate_path('xbox_base', {'day': day})
            xbox_model_donefile = self._path_generator.generate_path('xbox_base_done', {'day': day})
        else:
            save_mode = 1
            model_path = self._path_generator.generate_path('xbox_delta', {'day': day, 'pass_id': pass_index})
            xbox_model_donefile = self._path_generator.generate_path('xbox_delta_done', {'day': day})
        fleet.save_persistables(None, model_path, mode=save_mode)
        cost_printer.done()

        # --- cache model (optional) ------------------------------------------
        cost_printer = util.CostPrinter(util.print_cost, {'master': True,
                                                          'log_format': 'save cache model cost %s sec',
                                                          'stdout': stdout_str})
        model_file_handler = fs.FileHandler(self.global_config['io']['afs'])
        save_cache_model = self.global_config['save_cache_model']
        if save_cache_model:
            cache_save_num = fleet.save_cache_model(None, model_path, mode=save_mode)
            model_file_handler.write(
                "file_prefix:part\npart_num:16\nkey_num:%d\n" % cache_save_num,
                model_path + '/000_cache/sparse_cache.meta', 'w')
        cost_printer.done()
        # The key count exists only when the cache model was actually saved;
        # logging it unconditionally raised UnboundLocalError otherwise.
        if save_cache_model:
            util.rank0_print("save xbox cache model done, key_num=%s" % cache_save_num)

        # --- dense / inference layers (first worker only) --------------------
        save_env_param = {
            'executor': self._exe,
            'save_combine': True
        }
        cost_printer = util.CostPrinter(util.print_cost, {'master': True,
                                                          'log_format': 'save dense model cost %s sec',
                                                          'stdout': stdout_str})
        if fleet._role_maker.is_first_worker():
            for executor in self.global_config['executor']:
                if 'layer_for_inference' not in executor:
                    continue
                executor_name = executor['name']
                model = self._exector_context[executor_name]['model']
                # The same parameter dict is reused and updated per executor.
                save_env_param['inference_list'] = executor['layer_for_inference']
                save_env_param['scope'] = self._exector_context[executor_name]['scope']
                model.dump_inference_param(save_env_param)
                for dnn_layer in executor['layer_for_inference']:
                    model_file_handler.cp(dnn_layer['save_file_name'],
                                          model_path + '/dnn_plugin/' + dnn_layer['save_file_name'])
        fleet._role_maker._barrier_worker()
        cost_printer.done()

        # --- done-file record ------------------------------------------------
        # NOTE(review): "record_count" and "partition_type" are hard-coded
        # strings here; presumably placeholders expected by the consumer of
        # the done file -- confirm before changing.
        xbox_done_info = {
            "id": xbox_patch_id,
            "key": xbox_base_key,
            "ins_path": "",
            "ins_tag": "feasign",
            "partition_type": "2",
            "record_count": "111111",
            "monitor_data": monitor_data,
            "mpi_size": str(fleet.worker_num()),
            "input": model_path.rstrip("/") + "/000",
            "job_id": util.get_env_value("JOB_ID"),
            "job_name": util.get_env_value("JOB_NAME")
        }
        if fleet._role_maker.is_first_worker():
            model_file_handler.write(json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
            # Only delta models update the train-progress record here.
            if pass_index > 0:
                self._train_pass.save_train_progress(day, pass_index, xbox_base_key, model_path, is_checkpoint=False)
        fleet._role_maker._barrier_worker()
        return stdout_str