Example #1
    def parse(self, *args, **kwargs):
        x_data = []
        y_data = []
        use_raw = kwargs.get("use_raw")
        for f in args:
            # skip missing or empty file arguments
            if not (f and FileOps.exists(f)):
                continue
            with open(f) as fin:
                lines = [line.strip() for line in fin]
            if self.process_func:
                res = list(map(self.process_func, lines))
            else:
                res = [line.split() for line in lines]
            for tup in res:
                if not len(tup):
                    continue
                if use_raw:
                    # keep the whole record as one sample
                    x_data.append(tup)
                else:
                    # first field is the sample, second (if any) the label
                    x_data.append(tup[0])
                    if not self.is_test_data:
                        y_data.append(tup[1] if len(tup) > 1 else 0)
        self.x = np.array(x_data)
        self.y = np.array(y_data)
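
A minimal usage sketch for this text parser, assuming Sedna's TxtDataParse wrapper (sedna.datasources), whose constructor takes a data_type and a func preprocessor; the file name and the per-line preprocessor here are hypothetical:

    from sedna.datasources import TxtDataParse

    def split_sample(line):
        # hypothetical preprocessor: "img_001.png 1" -> ("img_001.png", "1")
        return tuple(line.split())

    train_data = TxtDataParse(data_type="train", func=split_sample)
    train_data.parse("train.txt")
    # train_data.x holds the samples, train_data.y the labels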
Example #2
    def parse(self, *args, **kwargs):
        x_data = []
        y_data = []
        label = kwargs.pop("label", "")
        usecols = kwargs.get("usecols", "")
        if usecols and isinstance(usecols, str):
            usecols = usecols.split(",")
        if len(usecols):
            # make sure the label column survives the column projection
            if label and label not in usecols:
                usecols.append(label)
            kwargs["usecols"] = usecols
        for f in args:
            if isinstance(f, (dict, list)):
                res = self.parse_json(f, **kwargs)
            else:
                if not (f and FileOps.exists(f)):
                    continue
                # remaining kwargs are forwarded to pandas
                res = pd.read_csv(f, **kwargs)
            if self.process_func and callable(self.process_func):
                res = self.process_func(res)
            if label:
                if label not in res.columns:
                    continue
                y_data.append(res[label])
                res.drop(label, axis=1, inplace=True)
            x_data.append(res)
        if not x_data:
            return
        self.x = pd.concat(x_data)
        # guard: y_data is empty when no label column was requested
        self.y = pd.concat(y_data) if y_data else None
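
Because unrecognized keyword arguments are forwarded to pd.read_csv, callers can pass pandas options straight through. A sketch, assuming Sedna's CSVDataParse wrapper (the file name and column name are hypothetical):

    from sedna.datasources import CSVDataParse

    train_data = CSVDataParse(data_type="train")
    train_data.parse("train.csv", label="target", sep=",")
    # train_data.x: feature DataFrame, train_data.y: the "target" column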
Example #3
    def train(self, train_data, valid_data=None, **kwargs):
        if callable(self.estimator):
            # lazily instantiate the estimator class
            self.estimator = self.estimator()
        if self.fine_tune and FileOps.exists(self.model_save_path):
            self.finetune()
        self.has_load = True
        # forward only the kwargs that estimator.train actually accepts
        varkw = self.parse_kwargs(self.estimator.train, **kwargs)
        return self.estimator.train(train_data=train_data,
                                    valid_data=valid_data,
                                    **varkw)
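
parse_kwargs presumably filters the incoming kwargs down to what estimator.train can accept; a standalone sketch of that pattern using inspect (the helper name mirrors the call above, but the body is an assumption):

    import inspect

    def parse_kwargs(func, **kwargs):
        # keep only keyword arguments the target function declares;
        # pass everything through if it takes **kwargs itself
        params = inspect.signature(func).parameters
        if any(p.kind == inspect.Parameter.VAR_KEYWORD
               for p in params.values()):
            return kwargs
        return {k: v for k, v in kwargs.items() if k in params}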
Example #4
    def save(self, model_url="", model_name=None):
        mname = model_name or self.model_name
        if os.path.isfile(self.model_save_path):
            self.model_save_path, mname = os.path.split(self.model_save_path)

        FileOps.clean_folder([self.model_save_path], clean=False)
        model_path = FileOps.join_path(self.model_save_path, mname)
        self.estimator.save(model_path)
        if model_url and FileOps.exists(model_path):
            FileOps.upload(model_path, model_url)
            model_path = model_url
        return model_path
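
The os.path.split call above is what lets save() accept either a directory or a full file path in model_save_path; a quick illustration:

    import os

    # a full file path splits into (directory, file name)
    folder, mname = os.path.split("/models/v1/model.pb")
    assert (folder, mname) == ("/models/v1", "model.pb")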
Example #5
    def run(self):
        while self.run_flag:
            # poll the hot-update config on a fixed interval
            time.sleep(self.check_time)
            conf = FileOps.download(self.hot_update_conf)
            if not (conf and FileOps.exists(conf)):
                continue
            with open(conf, "r") as fin:
                try:
                    conf_msg = json.load(fin)
                    model_msg = conf_msg["model_config"]
                    latest_version = str(model_msg["model_update_time"])
                    model = FileOps.download(
                        model_msg["model_path"],
                        FileOps.join_path(self.temp_path,
                                          f"model.{latest_version}"))
                except (json.JSONDecodeError, KeyError):
                    LOGGER.error(f"fail to parse model hot update config: "
                                 f"{self.hot_update_conf}")
                    continue
            if not (model and FileOps.exists(model)):
                continue
            # nothing to do if the advertised version is already loaded
            if latest_version == self.version:
                continue
            self.version = latest_version
            with self.MODEL_MANIPULATION_SEM:
                LOGGER.info(f"Update model start with version {self.version}")
                try:
                    self.production_estimator.load(model)
                    status = K8sResourceKindStatus.COMPLETED.value
                    LOGGER.info(f"Update model complete "
                                f"with version {self.version}")
                except Exception as e:
                    LOGGER.error(f"fail to update model: {e}")
                    status = K8sResourceKindStatus.FAILED.value
                if self.callback:
                    self.callback(task_info=None, status=status, kind="deploy")
            gc.collect()
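
For reference, the hot-update config this loop expects looks roughly like the following (inferred from the keys read above; the path and timestamp are illustrative):

    conf_msg = {
        "model_config": {
            # where to fetch the new model from (illustrative URL)
            "model_path": "s3://models/latest/model.pb",
            # doubles as the version tag compared against self.version
            "model_update_time": 1700000000,
        }
    }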
Example #6
    def load(self, task_index_url=None):
        """
        Load task detail (tasks/models, etc.) from the task index file.
        It is loaded automatically during the `inference` and `evaluation`
        phases.

        Parameters
        ----------
        task_index_url : str
            task index file path, default self.task_index_url.
        """

        if task_index_url:
            self.task_index_url = task_index_url
        assert FileOps.exists(self.task_index_url), \
            f"Task index missing: {self.task_index_url}"
        task_index = FileOps.load(self.task_index_url)
        self.extractor = task_index['extractor']
        if isinstance(self.extractor, str):
            self.extractor = FileOps.load(self.extractor)
        self.task_groups = task_index['task_groups']
        self.models = [task.model for task in self.task_groups]
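
The index file load() consumes is expected to look roughly like this (inferred from the keys accessed above; SimpleNamespace stands in for Sedna's task-group objects, which expose a model attribute):

    from types import SimpleNamespace

    group = SimpleNamespace(model="/kb/models/task_0.pb")
    task_index = {
        "extractor": "/kb/extractor.pkl",  # may also be a loaded object
        "task_groups": [group],
    }
    models = [task.model for task in task_index["task_groups"]]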
Example #7
    def train(self,
              train_data,
              valid_data=None,
              post_process=None,
              action="initial",
              **kwargs):
        """
        Fit to update the knowledge base with training data.

        Parameters
        ----------
        train_data : BaseDataSource
            Train data, see `sedna.datasources.BaseDataSource` for more detail.
        valid_data : BaseDataSource
            Valid data, BaseDataSource or None.
        post_process : function
            a function or registered method; callback invoked after
            `estimator` training.
        action : str
            `update` or `initial` the knowledge base
        kwargs : Dict
            parameters for `estimator` training, Like:
            `early_stopping_rounds` in Xgboost.XGBClassifier

        Returns
        -------
        train_history : object
        """

        callback_func = None
        if post_process is not None:
            callback_func = ClassFactory.get_cls(ClassType.CALLBACK,
                                                 post_process)
        res, task_index_url = self.estimator.train(
            train_data=train_data, valid_data=valid_data, **kwargs
        )  # todo: distinguish incremental update from full overwrite

        if isinstance(task_index_url, str) and FileOps.exists(task_index_url):
            task_index = FileOps.load(task_index_url)
        else:
            task_index = task_index_url

        extractor = task_index['extractor']
        task_groups = task_index['task_groups']

        model_upload_key = {}
        for task in task_groups:
            model_file = task.model.model
            save_model = FileOps.join_path(self.config.output_url,
                                           os.path.basename(model_file))
            if model_file not in model_upload_key:
                model_upload_key[model_file] = FileOps.upload(
                    model_file, save_model)
            model_file = model_upload_key[model_file]

            try:
                model = self.kb_server.upload_file(save_model)
            except Exception as err:
                self.log.error(
                    f"Upload task model of {model_file} fail: {err}")
                model = set_backend(
                    estimator=self.estimator.estimator.base_model)
                model.load(model_file)
            task.model.model = model

            for _task in task.tasks:
                sample_dir = FileOps.join_path(
                    self.config.output_url,
                    f"{_task.samples.data_type}_{_task.entry}.sample")
                _task.samples.save(sample_dir)
                try:
                    sample_dir = self.kb_server.upload_file(sample_dir)
                except Exception as err:
                    self.log.error(
                        f"Upload task samples of {_task.entry} fail: {err}")
                _task.samples.data_url = sample_dir

        save_extractor = FileOps.join_path(
            self.config.output_url,
            KBResourceConstant.TASK_EXTRACTOR_NAME.value)
        extractor = FileOps.dump(extractor, save_extractor)
        try:
            extractor = self.kb_server.upload_file(extractor)
        except Exception as err:
            self.log.error(f"Upload task extractor fail: {err}")
        task_info = {"task_groups": task_groups, "extractor": extractor}
        fd, name = tempfile.mkstemp()
        os.close(fd)  # close the raw descriptor; the file is rewritten by name
        FileOps.dump(task_info, name)

        index_file = self.kb_server.update_db(name)
        if not index_file:
            self.log.error(f"KB update Fail !")
            index_file = name
        FileOps.upload(index_file, self.config.task_index)

        task_info_res = self.estimator.model_info(
            self.config.task_index, relpath=self.config.data_path_prefix)
        self.report_task_info(None, K8sResourceKindStatus.COMPLETED.value,
                              task_info_res)
        self.log.info(f"Lifelong learning Train task Finished, "
                      f"KB idnex save in {self.config.task_index}")
        return callback_func(self.estimator, res) if callback_func else res
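
A post_process hook is resolved through ClassFactory, so it has to be registered first; a minimal sketch, assuming Sedna's sedna.common.class_factory registry (the alias and callback body are hypothetical):

    from sedna.common.class_factory import ClassFactory, ClassType

    @ClassFactory.register(ClassType.CALLBACK, alias="eval_after_train")
    def eval_after_train(estimator, train_result):
        # inspect or post-process the freshly trained estimator here
        return train_result

    # then: train(train_data, post_process="eval_after_train")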
Example #8
    def _get_db_index(self):
        _index_path = FileOps.join_path(self.save_dir, self.kb_index)
        if not FileOps.exists(_index_path):  # todo: get from kb
            pass
        return _index_path