def _update_model(self, session, model_name, progress, values):
    values['last_update_datetime'] = util.get_now_datetime()
    values['progress'] = progress
    affected = session.query(ModelEntity) \
        .filter(ModelEntity.name == model_name) \
        .update(values)
    if affected != 1:
        # raising here lets the session roll back the whole transaction
        raise Exception(f"Update error, affected rows={affected}")
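# A minimal usage sketch (hypothetical caller, not from the source): because
# _update_model raises when the affected row count is not exactly 1, and
# db.open_session is assumed to roll back on exception, a partial update never
# commits. The third argument is the step type recorded as `progress`, matching
# how add_train_process_step calls it below.
def _example_mark_running(self, model_name):
    with db.open_session() as s:
        self._update_model(s, model_name, TrainStep.Types.Load,
                           {"status": ModelStatusType.Running})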
def escaped_time_by_seconds(self):
    # "escaped" means elapsed: seconds from creation to finish (or to now, if still running)
    if self.status in [ModelStatusType.Succeed, ModelStatusType.Failed]:
        if self.finish_datetime is None:
            raise Exception(
                f"Internal error, model name = {self.name} train finished but has no finish_datetime.")
        escaped = util.datetime_diff(self.finish_datetime, self.create_datetime)
    else:
        escaped = util.datetime_diff(util.get_now_datetime(), self.create_datetime)
    return escaped
def escaped_time(self):
    # same as escaped_time_by_seconds, but rendered as human-readable text by minute
    if self.status in [ModelStatusType.Succeed, ModelStatusType.Failed]:
        if self.finish_datetime is None:
            raise Exception(
                "Internal error, train finished but has no finish_datetime.")
        escaped = util.datetime_diff_human_format_by_minute(
            self.finish_datetime, self.create_datetime)
    else:
        escaped = util.datetime_diff_human_format_by_minute(
            util.get_now_datetime(), self.create_datetime)
    return escaped
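# Hedged sketch of the two util helpers used above (assumed behavior, not the
# actual implementation): datetime_diff is assumed to return whole seconds
# between two datetimes, and datetime_diff_human_format_by_minute to render the
# difference as minute-granularity display text.
from datetime import datetime

def datetime_diff(end: datetime, start: datetime) -> int:
    # whole seconds elapsed from start to end
    return int((end - start).total_seconds())

def datetime_diff_human_format_by_minute(end: datetime, start: datetime) -> str:
    total_minutes = datetime_diff(end, start) // 60
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"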
def train_process_terminated(self, model_name):
    with db.open_session() as s:
        # The monitor polls the database every second, so termination may be observed
        # while handle_models is still finishing.
        # 1. check status: only a Running model may be changed to finished
        model = self.model_dao.find_by_name(s, model_name)
        if model.status == ModelStatusType.Running:
            _now = util.get_now_datetime()
            properties = {
                "status": ModelStatusType.Failed,
                "finish_datetime": _now,
                "last_update_datetime": _now
            }
            self.model_dao.update_model_by_name(s, model_name, properties)
        else:
            log.warning(f"Train process is already finished, model = {model_name}")
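# Hypothetical sketch of the one-second monitor the comment above refers to;
# the name _poll_train_process and the `process` handle (a subprocess.Popen)
# are illustrative, not from this module.
import time

def _poll_train_process(self, process, model_name, interval=1):
    while process.poll() is None:  # child still running
        time.sleep(interval)
    # The child may have exited cleanly (Persist already set Succeed) or
    # crashed; train_process_terminated only flips Running models to Failed.
    self.train_process_terminated(model_name)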
def create_temporary_model(self, session, model_name, no_experiment, input_features,
                           experiment_conf: ExperimentConf,
                           train_job_conf: TrainJobConf):
    # 1. create a temporary model
    now = util.get_now_datetime()
    extension = {
        "experiment_conf": experiment_conf.to_dict(),
        "train_job_conf": train_job_conf.to_dict(),
    }
    gbm_model = ModelEntity(name=model_name,
                            framework=train_job_conf.framework,
                            dataset_name=experiment_conf.dataset_name,
                            no_experiment=no_experiment,
                            inputs=input_features,
                            task_type=experiment_conf.task_type,
                            model_path=util.model_dir(experiment_conf.dataset_name, model_name),
                            status=ModelStatusType.Running,
                            train_job_name=train_job_conf.name,
                            extension=extension,
                            create_datetime=now,
                            last_update_datetime=now)
    session.add(gbm_model)
def add_predict_process_step(self, model_name: str, job_name: str, step: JobStep):
    step_type = step.type
    with db.open_session() as s:
        # 1. check that the temporary model exists (require_by_name raises if not)
        model = self.model_dao.require_by_name(s, model_name)

        # 2. check the event type: one record per type
        messages = s.query(MessageEntity).filter(
            MessageEntity.author == job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                raise Exception(f"Event type = {step_type} already exists.")

        # 3. create a new message
        content = util.dumps(step.to_dict())
        message = MessageEntity(id=util.short_uuid(),
                                author=job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)
def add_train_process_step(self, train_job_name, req_dict):
    # [1]. read & check params
    step_type = util.require_in_dict(req_dict, 'type', str)
    step_status = util.require_in_dict(req_dict, 'status', str)
    step_extension = util.get_from_dict(req_dict, 'extension', dict)

    if step_type not in [TrainStep.Types.Load, TrainStep.Types.Optimize,
                         TrainStep.Types.OptimizeStart, TrainStep.Types.Persist,
                         TrainStep.Types.Evaluate, TrainStep.Types.FinalTrain,
                         TrainStep.Types.Searched]:
        raise ValueError(f"Unknown step type = {step_type}")

    if step_status not in [JobStep.Status.Succeed, JobStep.Status.Failed]:
        raise ValueError(f"Unknown status = {step_status}")

    # [2]. save message
    with db.open_session() as s:
        # [2.1]. check that the temporary model exists
        model = self.model_dao.find_by_train_job_name(s, train_job_name)
        model_name = model.name

        # [2.2]. check the event type: one record per type, except for the
        # optimize steps, which may occur once per trail
        messages = s.query(MessageEntity).filter(MessageEntity.author == train_job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                if step_type not in [TrainStep.Types.OptimizeStart, TrainStep.Types.Optimize]:
                    raise Exception(f"Event type = {step_type} already exists.")

        # [2.3]. create a new message
        content = util.dumps(req_dict)
        message = MessageEntity(id=util.short_uuid(),
                                author=train_job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)

        # [2.4]. handle the train event
        current_progress = model.progress
        # todo check in code body
        self._check_progress_change(step_type, current_progress)

        # update the model according to the step type; a failed step also finishes the model
        if step_type == TrainStep.Types.Evaluate:
            if step_status == JobStep.Status.Succeed:
                self._update_model(s, model_name, step_type,
                                   {"performance": step_extension['performance']})
            else:
                self._update_model(s, model_name, step_type,
                                   {"status": ModelStatusType.Failed,
                                    "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.Load:
            if step_status == JobStep.Status.Succeed:
                self._update_model(s, model_name, step_type,
                                   {"status": ModelStatusType.Running})
            else:
                self._update_model(s, model_name, step_type,
                                   {"status": ModelStatusType.Failed,
                                    "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.OptimizeStart:
            pass
            # train_trail_no = step_extension.get('trail_no')
            # if train_trail_no is None or not isinstance(train_trail_no, int):
            #     raise ValueError(f"Param trail_no must be an int, got: {train_trail_no}")
            # # update trail number
            # self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no})
        elif step_type == TrainStep.Types.Optimize:
            train_trail_no = step_extension.get('trail_no')
            # update trails: load the current list and append the new one
            trails = model.trails
            if model.trails is None:
                trails = []
            trails.append(step_extension)
            self._update_model(s, model_name, step_type,
                               {"train_trail_no": train_trail_no,
                                "score": step_extension.get('reward'),
                                "trails": trails})
        elif step_type == TrainStep.Types.Persist:
            model_file_size = step_extension['model_file_size']
            self._update_model(s, model_name, step_type,
                               {"model_file_size": model_file_size,
                                "status": ModelStatusType.Succeed,
                                "finish_datetime": util.get_now_datetime()})
        else:
            self._update_model(s, model_name, step_type, {})
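# _check_progress_change is called above but not defined in this section; a
# minimal sketch of what such a guard might enforce (assumed ordering, not the
# actual implementation). _STEP_ORDER and the method body are illustrative.
_STEP_ORDER = [TrainStep.Types.Load, TrainStep.Types.OptimizeStart,
               TrainStep.Types.Optimize, TrainStep.Types.Searched,
               TrainStep.Types.FinalTrain, TrainStep.Types.Evaluate,
               TrainStep.Types.Persist]

def _check_progress_change(self, step_type, current_progress):
    if current_progress is None:
        return  # first event of this train job
    # Repeated optimize steps land at the same index, so only a step strictly
    # earlier than the recorded progress is rejected.
    if _STEP_ORDER.index(step_type) < _STEP_ORDER.index(current_progress):
        raise ValueError(f"Step {step_type} arrived after {current_progress}, out of order")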
def _create_temporary_dataset(self, source_type, file_path, took, sample_conf: SampleConf):
    # 1. choose names; use a long name for the analyze job
    now = util.get_now_datetime()
    file_name = P.basename(file_path)
    temporary_dataset_name = self.choose_temporary_dataset_name(file_name)
    analyze_job_name = util.analyze_data_job_name(util.cut_suffix(file_name), now)
    file_size = P.getsize(file_path)

    # 2. create record
    td = DatasetEntity(name=temporary_dataset_name,
                       file_size=file_size,
                       is_temporary=True,
                       status=DatasetEntity.Status.Created,
                       source_type=source_type,
                       file_path=file_path,
                       file_name=file_name,
                       create_datetime=now,
                       last_update_datetime=now)
    with db.open_session() as s:
        s.add(td)

    # 3. send file transfer step
    if source_type == DatasetEntity.SourceType.Upload:
        step = JobStep(type=AnalyzeStep.Types.Upload,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)
    elif source_type == DatasetEntity.SourceType.Import:
        step = JobStep(type=AnalyzeStep.Types.Copy,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)

    # 4. create analyze config
    conf = AnalyzeJobConf(job_name=analyze_job_name,
                          dataset_name=temporary_dataset_name,
                          sample_conf=sample_conf,
                          path=file_path,
                          temporary_dataset=True,
                          label_col=None)

    # 5. start a new process
    analyze_config_string = util.dumps(conf.to_dict())
    logger.info(f"Analyze job conf: {analyze_config_string}")
    python_executable = sys.executable
    temporary_dataset_dir = util.temporary_dataset_dir(temporary_dataset_name)
    os.makedirs(temporary_dataset_dir, exist_ok=True)
    std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")
    command = f"nohup {python_executable} {util.script_path('analyze_job.py')} " \
              f"--file_path={file_path} " \
              f"--job_name={analyze_job_name} " \
              f"--dataset_name={temporary_dataset_name} " \
              f"--sample_strategy={sample_conf.sample_strategy} " \
              f"--n_rows={self.replace_None(sample_conf.n_rows)} " \
              f"--percentage={self.replace_None(sample_conf.percentage)} " \
              f"--server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1 &"

    logger.info(f"Run analyze job command: \n{command}")
    logger.info(f"Log file:\ntail -f {std_log}")
    # JobManager.instance().run_job(job)
    os.system(command)

    return temporary_dataset_name, analyze_job_name
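# The os.system call above relies on nohup and shell redirection; a hedged
# alternative sketch using subprocess.Popen (illustrative helper name, same CLI
# flags assumed) avoids shell quoting issues and detaches the child explicitly.
import subprocess

def _start_analyze_job(args, std_log):
    # args: full argv list, e.g. [sys.executable, util.script_path('analyze_job.py'),
    #       f"--job_name={analyze_job_name}", ...]
    with open(std_log, 'w') as log_file:
        # the child inherits the log fd; start_new_session detaches it like nohup
        subprocess.Popen(args, stdout=log_file, stderr=subprocess.STDOUT,
                         start_new_session=True)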
def add_analyze_process_step(self, dataset_name, analyze_job_name, step: JobStep):
    step_type = step.type
    with db.open_session() as s:
        # 1.1. check that the dataset exists
        d = s.query(DatasetEntity).filter(
            DatasetEntity.name == dataset_name).first()
        if d is None:
            raise EntityNotExistsException(DatasetEntity, dataset_name)

        # 1.2. check the event type: one record per type
        messages = s.query(MessageEntity).filter(
            MessageEntity.author == analyze_job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                raise Exception(f"Event type = {step_type} already exists.")

    # 2. handle event
    with db.open_session() as s:
        # 2.1. create a new message
        content = util.dumps(step.to_dict())
        message = MessageEntity(id=util.short_uuid(),
                                author=analyze_job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)

        # 2.2. handle the analyze event
        if step_type == AnalyzeStep.Types.Analyzed:
            # update the temporary dataset
            # todo handle failed analyze
            if step.status == JobStep.Status.Succeed:
                hints = step.extension.pop("hints")
                d_stats = DatasetStats.load_dict(step.extension)
                features_str = [f.to_dict() for f in d_stats.features]
                update_fields = {
                    "has_header": d_stats.has_header,
                    "extension": step.extension,
                    "n_cols": d_stats.n_cols,
                    "n_rows": d_stats.n_rows,
                    "features": features_str,
                    "hints": hints,
                    "feature_summary": d_stats.feature_summary.to_dict(),
                    "status": DatasetEntity.Status.Analyzed
                }
            else:
                update_fields = {"status": DatasetEntity.Status.Failed}
            self.dataset_dao.update_by_name(s, dataset_name, update_fields)
        elif step_type == AnalyzeStep.Types.PatchCorrelation:
            # 1. check the dataset status: correlation can only be computed for an analyzed dataset
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            if dataset.status != AnalyzeStep.Types.Analyzed:
                raise ValueError(
                    f"Dataset {dataset_name} status is not {AnalyzeStep.Types.Analyzed}.")
            request_label_col = step.extension.get("label_col")
            if request_label_col != dataset.label_col:
                raise ValueError(
                    f"Dataset {dataset_name} label col is {dataset.label_col} "
                    f"but received result is for {request_label_col}")

            # 2. read the extension
            corr_dict = step.extension.get('corr')

            # 3. load & update features
            features = dataset.to_dataset_stats().features
            for f in features:
                correlation = corr_dict.get(f.name)
                f.correlation = FeatureCorrelation(
                    value=correlation,
                    status=FeatureCorrelation.calc_status(correlation,
                                                          request_label_col == f.name))

            # 4. sort features by absolute correlation, descending
            features = sorted(features,
                              key=lambda f: abs(f.correlation.value),
                              reverse=True)
            feature_dict_list = [f.to_dict() for f in features]

            # 5. push back to the database
            self.dataset_dao.update_by_name(s, dataset_name,
                                            {"features": feature_dict_list})
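# The "one record per event type" check appears in add_predict_process_step,
# add_train_process_step, and add_analyze_process_step; a hypothetical shared
# helper (not in the source) could centralize it:
def _require_step_type_unused(self, session, author, step_type, allow_repeat=()):
    messages = session.query(MessageEntity).filter(
        MessageEntity.author == author).all()
    for m in messages:
        if step_type == util.loads(m.content).get('type') and step_type not in allow_repeat:
            raise Exception(f"Event type = {step_type} already exists.")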
def test_human_date():
    # human_datetime should render the current time as a non-empty display string
    rendered = util.human_datetime(util.get_now_datetime())
    print(rendered)
    assert isinstance(rendered, str) and len(rendered) > 0