def parse(self, *args, **kwargs):
    x_data = []
    y_data = []
    use_raw = kwargs.get("use_raw")
    for f in args:
        if not (f and FileOps.exists(f)):
            continue
        with open(f) as fin:
            if self.process_func:
                res = list(map(self.process_func,
                               [line.strip() for line in fin.readlines()]))
            else:
                res = [line.strip().split() for line in fin.readlines()]
        for tup in res:
            if not len(tup):
                continue
            if use_raw:
                # keep the whole parsed tuple as the sample
                x_data.append(tup)
            else:
                x_data.append(tup[0])
                if not self.is_test_data:
                    # second field is the label; default to 0 if absent
                    y_data.append(tup[1] if len(tup) > 1 else 0)
    self.x = np.array(x_data)
    self.y = np.array(y_data)
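# Usage sketch for the TXT parser above. Assumptions: the method belongs to
# TxtDataParse in sedna.datasources, constructed with a data_type and an
# optional per-line process function; "train.txt" and _to_pair are
# hypothetical. Each line is expected to hold "sample label" fields
# separated by whitespace.
def _example_txt_parse():
    from sedna.datasources import TxtDataParse

    def _to_pair(line):
        # hypothetical hook: split "path label" into a tuple
        return tuple(line.split())

    train_data = TxtDataParse(data_type="train", func=_to_pair)
    train_data.parse("train.txt", use_raw=False)
    return train_data.x, train_data.y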
def parse(self, *args, **kwargs):
    x_data = []
    y_data = []
    label = kwargs.pop("label") if "label" in kwargs else ""
    usecols = kwargs.get("usecols", "")
    if usecols and isinstance(usecols, str):
        # allow a comma-separated string, e.g. usecols="a,b,c"
        usecols = usecols.split(",")
    if len(usecols):
        if label and label not in usecols:
            usecols.append(label)
        kwargs["usecols"] = usecols
    for f in args:
        if isinstance(f, (dict, list)):
            res = self.parse_json(f, **kwargs)
        else:
            if not (f and FileOps.exists(f)):
                continue
            res = pd.read_csv(f, **kwargs)
        if self.process_func and callable(self.process_func):
            res = self.process_func(res)
        if label:
            if label not in res.columns:
                continue
            y_data.append(res[label])
            res.drop(label, axis=1, inplace=True)
        x_data.append(res)
    if not x_data:
        return
    self.x = pd.concat(x_data)
    # guard: pd.concat raises on an empty list when no label was given
    if y_data:
        self.y = pd.concat(y_data)
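# Usage sketch for the CSV parser above. Assumptions: the method belongs to
# CSVDataParse in sedna.datasources; file and column names are hypothetical.
# `label` names the target column, and `usecols` may be passed as a
# comma-separated string that is split before reaching pandas.read_csv.
def _example_csv_parse():
    from sedna.datasources import CSVDataParse

    data = CSVDataParse(data_type="train")
    data.parse("train.csv", label="species", usecols="sepal_l,sepal_w")
    return data.x, data.y  # feature DataFrame, label Series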
def train(self, train_data, valid_data=None, **kwargs):
    if callable(self.estimator):
        self.estimator = self.estimator()
    if self.fine_tune and FileOps.exists(self.model_save_path):
        self.finetune()
    self.has_load = True
    varkw = self.parse_kwargs(self.estimator.train, **kwargs)
    return self.estimator.train(train_data=train_data,
                                valid_data=valid_data,
                                **varkw)
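# A minimal stdlib sketch of what `parse_kwargs` above plausibly does — it
# appears to filter **kwargs down to the keywords the target function
# accepts. Hypothetical, for illustration; not the project's implementation.
import inspect

def parse_kwargs_sketch(func, **kwargs):
    # keep only the keyword arguments func accepts, so callers may pass a
    # superset of options without tripping unexpected-keyword errors
    params = inspect.signature(func).parameters
    if any(p.kind is inspect.Parameter.VAR_KEYWORD
           for p in params.values()):
        return kwargs  # func already accepts **kwargs
    return {k: v for k, v in kwargs.items() if k in params}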
def save(self, model_url="", model_name=None): mname = model_name or self.model_name if os.path.isfile(self.model_save_path): self.model_save_path, mname = os.path.split(self.model_save_path) FileOps.clean_folder([self.model_save_path], clean=False) model_path = FileOps.join_path(self.model_save_path, mname) self.estimator.save(model_path) if model_url and FileOps.exists(model_path): FileOps.upload(model_path, model_url) model_path = model_url return model_path
def run(self):
    while self.run_flag:
        time.sleep(self.check_time)
        conf = FileOps.download(self.hot_update_conf)
        if not (conf and FileOps.exists(conf)):
            continue
        with open(conf, "r") as fin:
            try:
                conf_msg = json.load(fin)
                model_msg = conf_msg["model_config"]
                latest_version = str(model_msg["model_update_time"])
                model = FileOps.download(
                    model_msg["model_path"],
                    FileOps.join_path(self.temp_path,
                                      f"model.{latest_version}"))
            except (json.JSONDecodeError, KeyError):
                LOGGER.error(f"failed to parse model hot update config: "
                             f"{self.hot_update_conf}")
                continue
        if not (model and FileOps.exists(model)):
            continue
        if latest_version == self.version:
            # already serving this version, nothing to do
            continue
        self.version = latest_version
        with self.MODEL_MANIPULATION_SEM:
            LOGGER.info(f"Update model start with version {self.version}")
            try:
                self.production_estimator.load(model)
                status = K8sResourceKindStatus.COMPLETED.value
                LOGGER.info(f"Update model complete "
                            f"with version {self.version}")
            except Exception as e:
                LOGGER.error(f"failed to update model: {e}")
                status = K8sResourceKindStatus.FAILED.value
        if self.callback:
            self.callback(task_info=None, status=status, kind="deploy")
        gc.collect()
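# Example of the hot-update config shape consumed above, inferred from the
# keys the loop reads (values are hypothetical):
EXAMPLE_HOT_UPDATE_CONF = {
    "model_config": {
        "model_update_time": 1650000000,  # stringified into the version
        "model_path": "s3://bucket/models/model.pb",  # fetched to temp_path
    }
}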
def load(self, task_index_url=None):
    """
    Load task details (tasks/models etc.) from the task index file.
    It is loaded automatically during the `inference` and `evaluation`
    phases.

    Parameters
    ----------
    task_index_url : str
        Task index file path; defaults to `self.task_index_url`.
    """
    if task_index_url:
        self.task_index_url = task_index_url
    if not FileOps.exists(self.task_index_url):
        raise FileNotFoundError(
            f"Task index missing: {self.task_index_url}")
    task_index = FileOps.load(self.task_index_url)
    self.extractor = task_index['extractor']
    if isinstance(self.extractor, str):
        # the extractor may be stored as a path to a serialized object
        self.extractor = FileOps.load(self.extractor)
    self.task_groups = task_index['task_groups']
    self.models = [task.model for task in self.task_groups]
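# Example of the task index shape consumed above, inferred from the keys
# accessed (values are hypothetical): `extractor` may be an in-memory object
# or a path to a serialized one, and each task group carries a `.model`.
EXAMPLE_TASK_INDEX = {
    "extractor": "/kb/extractor.pkl",
    "task_groups": [],  # list of task-group objects, each with `.model`
}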
def train(self, train_data, valid_data=None, post_process=None,
          action="initial", **kwargs):
    """
    Fit to update the knowledge base with training data.

    Parameters
    ----------
    train_data : BaseDataSource
        Train data, see `sedna.datasources.BaseDataSource` for more detail.
    valid_data : BaseDataSource
        Valid data, BaseDataSource or None.
    post_process : function
        Function or a registered method, callback after `estimator`
        training.
    action : str
        `update` or `initial` the knowledge base.
    kwargs : Dict
        Parameters for `estimator` training, e.g.
        `early_stopping_rounds` in Xgboost.XGBClassifier.

    Returns
    -------
    train_history : object
    """
    callback_func = None
    if post_process is not None:
        callback_func = ClassFactory.get_cls(
            ClassType.CALLBACK, post_process)
    res, task_index_url = self.estimator.train(
        train_data=train_data,
        valid_data=valid_data,
        **kwargs
    )
    # TODO: distinguish incremental update from full overwrite
    if isinstance(task_index_url, str) and FileOps.exists(task_index_url):
        task_index = FileOps.load(task_index_url)
    else:
        task_index = task_index_url
    extractor = task_index['extractor']
    task_groups = task_index['task_groups']

    # upload each task model once, then register it with the KB server;
    # fall back to a locally loaded backend if registration fails
    model_upload_key = {}
    for task in task_groups:
        model_file = task.model.model
        save_model = FileOps.join_path(self.config.output_url,
                                       os.path.basename(model_file))
        if model_file not in model_upload_key:
            model_upload_key[model_file] = FileOps.upload(model_file,
                                                          save_model)
        model_file = model_upload_key[model_file]
        try:
            model = self.kb_server.upload_file(save_model)
        except Exception as err:
            self.log.error(
                f"Upload task model of {model_file} failed: {err}")
            model = set_backend(
                estimator=self.estimator.estimator.base_model)
            model.load(model_file)
        task.model.model = model
        for _task in task.tasks:
            sample_dir = FileOps.join_path(
                self.config.output_url,
                f"{_task.samples.data_type}_{_task.entry}.sample")
            task.samples.save(sample_dir)
            try:
                sample_dir = self.kb_server.upload_file(sample_dir)
            except Exception as err:
                self.log.error(
                    f"Upload task samples of {_task.entry} failed: {err}")
            _task.samples.data_url = sample_dir

    save_extractor = FileOps.join_path(
        self.config.output_url,
        KBResourceConstant.TASK_EXTRACTOR_NAME.value)
    extractor = FileOps.dump(extractor, save_extractor)
    try:
        extractor = self.kb_server.upload_file(extractor)
    except Exception as err:
        self.log.error(f"Upload task extractor failed: {err}")
    task_info = {"task_groups": task_groups, "extractor": extractor}
    fd, name = tempfile.mkstemp()
    FileOps.dump(task_info, name)

    index_file = self.kb_server.update_db(name)
    if not index_file:
        self.log.error("KB update failed!")
        index_file = name
    FileOps.upload(index_file, self.config.task_index)
    task_info_res = self.estimator.model_info(
        self.config.task_index, relpath=self.config.data_path_prefix)
    self.report_task_info(None, K8sResourceKindStatus.COMPLETED.value,
                          task_info_res)
    self.log.info(f"Lifelong learning train task finished, "
                  f"KB index saved to {self.config.task_index}")
    return callback_func(self.estimator, res) if callback_func else res
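# The sample and extractor uploads above share one pattern: push an artifact
# to the KB server and keep the local copy if the upload fails. A
# hypothetical helper (not part of the class) could factor that out:
def upload_or_keep(kb_server, path, log):
    """Try kb_server.upload_file(path); on failure, keep the local path."""
    try:
        return kb_server.upload_file(path)
    except Exception as err:
        log.error(f"Upload of {path} failed, keeping local copy: {err}")
        return path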
def _get_db_index(self):
    _index_path = FileOps.join_path(self.save_dir, self.kb_index)
    if not FileOps.exists(_index_path):
        # TODO: get from kb
        pass
    return _index_path
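# Hypothetical sketch of the TODO above (assumption: the KB index could be
# fetched from a remote knowledge-base URL such as a `kb_url` attribute;
# illustrative only, not the project's implementation):
#
#   if not FileOps.exists(_index_path):
#       FileOps.download(
#           FileOps.join_path(self.kb_url, self.kb_index), _index_path)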