def brevity_dataset_pagination(self, req_dict):
    """Return a page of datasets in a brief (summary) representation.

    Args:
        req_dict: request parameters. Supports ``page_num`` (default 1),
            ``page_size`` (default 10), optional ``query`` filter string,
            ``order_by`` (default "create_datetime") and ``order``
            (default "desc").

    Returns:
        Tuple ``(datasets, total)`` where ``datasets`` is a list of dicts
        (heavy fields stripped) and ``total`` is the overall match count.

    Raises:
        ValueError: if ``order_by``/``order`` is unsupported or the paging
            parameters are out of range.
    """
    # 1. read params
    page_num = util.require_in_dict(req_dict, 'page_num', int, default=1)
    page_size = util.require_in_dict(req_dict, 'page_size', int, default=10)
    query = util.get_from_dict(req_dict, 'query', str)
    order_by = util.require_in_dict(req_dict, 'order_by', str, default="create_datetime")
    order = util.require_in_dict(req_dict, 'order', str, default="desc")

    allow_order_by_fields = ["create_datetime", "n_experiments", "size", "n_rows", "n_cols"]
    if order_by not in allow_order_by_fields:
        raise ValueError(f"Order by field should in {','.join(allow_order_by_fields)}, but input is: {order_by}")

    allow_order_strategies = ["desc", "asc"]
    if order not in allow_order_strategies:
        raise ValueError(f"order strategy should in {','.join(allow_order_strategies)}, but input is: {order}")

    # page numbers are 1-based; message fixed — the check enforces >= 1,
    # but the original message claimed "> 1"
    if page_num < 1:
        raise ValueError("Param page_num should >= 1 .")
    # fix: original compared `page_size < 0`, which let page_size == 0 slip
    # through even though the message demands "> 0"
    if page_size < 1:
        raise ValueError("Param page_size should > 0 .")

    def _handle(model_dao, session, dataset: DatasetEntity):
        # Convert the entity to a trimmed dict suitable for list display.
        d = util.sqlalchemy_obj_to_dict(dataset)
        d['file_path'] = util.relative_path(dataset.file_path)
        d['create_datetime'] = util.to_timestamp(dataset.create_datetime)
        d['n_experiments'] = model_dao.query_n_experiment(session, dataset.name)
        # Heavy fields are not needed in the brief listing.
        del d['features']
        del d['feature_summary']
        del d['extension']
        return d

    # 2. query
    with db.open_session() as s:
        datasets, total = self.dataset_dao.pagination(s, page_num, page_size, query, order_by, order)
        datasets = [_handle(self.model_dao, s, d) for d in datasets]
        return datasets, total
def predict(self, dataset_name, model_name, req_dict: dict):
    """Launch an asynchronous batch-predict job for a trained model.

    Args:
        dataset_name: name of the dataset the model was trained on.
        model_name: name of the model to predict with.
        req_dict: request parameters; requires ``file_path`` (relative to
            the data dir) and ``upload_took``, optionally ``reserved_cols``
            (columns to copy into the prediction output).

    Returns:
        The generated predict-job name; the job itself runs in a detached
        background process.

    Raises:
        ValueError: if the input file does not exist or is not a regular file.
    """
    # 1. read params
    file_path = util.require_in_dict(req_dict, 'file_path', str)
    reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
    upload_took = util.require_in_dict(req_dict, 'upload_took', float)
    if reserved_cols is None or len(reserved_cols) < 1:
        reserved_cols_str = ""
    else:
        reserved_cols_str = ",".join(reserved_cols)

    # 2. check params — both lookups raise if the entity is missing
    with db.open_session() as s:
        self.model_dao.require_by_name(s, model_name).to_model_bean()
        dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()

    predict_job_name = util.predict_job_name(dataset_name)

    abs_file_path = P.join(consts.DATA_DIR, file_path)
    if not P.exists(abs_file_path):
        raise ValueError(f"Input file not exists: {abs_file_path}")
    if not P.isfile(abs_file_path):
        raise ValueError(f"Input file is not file: {abs_file_path}")

    # 3. add upload step (the upload already happened client-side; record it)
    upload_extension = {"file_size": P.getsize(abs_file_path)}
    upload_step = JobStep(type=PredictStepType.Upload,
                          status=JobStep.Status.Succeed,
                          took=upload_took,
                          datetime=util.get_now_long(),
                          extension=upload_extension)
    self.add_predict_process_step(model_name, predict_job_name, upload_step)

    # 4. execute command
    model_dir = util.model_dir(dataset_name, model_name)
    predict_log_path = P.join(model_dir, f"{predict_job_name}.log")
    # For headerless datasets, pass the stored feature names as defaults.
    if not dataset_stats.has_header:
        default_headers = ",".join([f.name for f in dataset_stats.features])
    else:
        default_headers = None
    # SECURITY NOTE(review): this builds a shell string that interpolates
    # request-supplied `file_path` (via abs_file_path) — a crafted path could
    # inject shell commands. Prefer subprocess.Popen([...]) with an argument
    # list; left as-is here to avoid changing runtime behavior.
    command = f"nohup {sys.executable} {consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py --input_file_path={abs_file_path} --reserved_cols={reserved_cols_str} --model_name={model_name} --dataset_name={dataset_name} --job_name={predict_job_name} --has_header={dataset_stats.has_header} --default_headers={default_headers} --server_portal={consts.SERVER_PORTAL} 1>{predict_log_path} 2>&1 &"

    logger.info(f"Run analyze job command: \n{command}")
    logger.info(f"Log file:\ntail -f {predict_log_path}")

    os.system(command)

    return predict_job_name
def add_train_process_step(self, train_job_name, req_dict):
    """Record one training-process step event and update the model's state.

    Each step type may appear only once per train job (Optimize /
    OptimizeStart are the exceptions: they repeat per trail). The raw event
    is persisted as a MessageEntity, then the model row is updated according
    to the step type/status.

    Args:
        train_job_name: the train job this event belongs to.
        req_dict: event payload with 'type', 'status' and optional
            'extension' dict (step-type-specific fields).

    Raises:
        ValueError: unknown step type or status.
        Exception: a non-repeatable step type was already recorded.
    """
    # [1]. read & check params
    step_type = util.require_in_dict(req_dict, 'type', str)
    step_status = util.require_in_dict(req_dict, 'status', str)
    step_extension = util.get_from_dict(req_dict, 'extension', dict)

    if step_type not in [TrainStep.Types.Load, TrainStep.Types.Optimize, TrainStep.Types.OptimizeStart,
                         TrainStep.Types.Persist, TrainStep.Types.Evaluate, TrainStep.Types.FinalTrain,
                         TrainStep.Types.Searched]:
        raise ValueError(f"Unknown step type = {step_type}")

    if step_status not in [JobStep.Status.Succeed, JobStep.Status.Failed]:
        raise ValueError(f"Unknown status = {step_status}")

    # [2]. save message
    with db.open_session() as s:
        # [2.1]. check temporary model exists
        model = self.model_dao.find_by_train_job_name(s, train_job_name)
        model_name = model.name
        # [2.2]. check event type, one type one record
        # (Optimize/OptimizeStart may repeat — one per search trail)
        messages = s.query(MessageEntity).filter(MessageEntity.author == train_job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                if step_type not in [TrainStep.Types.OptimizeStart, TrainStep.Types.Optimize]:
                    raise Exception(f"Event type = {step_type} already exists .")
        # [2.3]. create a new message recording the raw event payload
        content = util.dumps(req_dict)
        message = MessageEntity(id=util.short_uuid(), author=train_job_name, content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)
        # [2.4]. handle analyze event
        current_progress = model.progress
        # todo check in code body
        # Validates the step transition against the model's current progress.
        self._check_progress_change(step_type, current_progress)
        # add failed status
        if step_type == TrainStep.Types.Evaluate:
            if step_status == JobStep.Status.Succeed:
                # Evaluate succeeded: store the performance metrics.
                self._update_model(s, model_name, step_type, {"performance": step_extension['performance']})
            else:
                self._update_model(s, model_name, step_type, {"status": ModelStatusType.Failed,
                                                             "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.Load:
            if step_status == JobStep.Status.Succeed:
                # Data loaded: training is now running.
                self._update_model(s, model_name, step_type, {"status": ModelStatusType.Running})
            else:
                self._update_model(s, model_name, step_type, {"status": ModelStatusType.Failed,
                                                             "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.OptimizeStart:
            pass
            # train_trail_no = step_extension.get('trail_no')
            # if train_trail_no is None or not isinstance(train_trail_no, int):
            #     raise ValueError(f"Param trail_no can not be None and should be int but is : {train_trail_no}")
            # # upload trail number
            # self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no})
        elif step_type == TrainStep.Types.Optimize:
            train_trail_no = step_extension.get('trail_no')
            # update trails
            # load current trail and append new
            trails = model.trails
            if model.trails is None:
                trails = []
            trails.append(step_extension)
            self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no,
                                                          "score": step_extension.get('reward'),
                                                          "trails": trails})
        elif step_type == TrainStep.Types.Persist:
            # Model persisted to disk: training finished successfully.
            model_file_size = step_extension['model_file_size']
            self._update_model(s, model_name, step_type, {"model_file_size": model_file_size,
                                                          "status": ModelStatusType.Succeed,
                                                          "finish_datetime": util.get_now_datetime()})
        else:
            # FinalTrain / Searched: only advance the progress marker.
            self._update_model(s, model_name, step_type, {})
def experiment(self, req_dict: dict):
    """Validate an experiment request and launch the training job.

    Args:
        req_dict: experiment configuration; requires 'label_col',
            'partition_strategy', 'dataset_name', 'holdout_percentage' and
            'experiment_engine', plus strategy-specific sub-dicts
            ('cross_validation' or 'train_validation_holdout'). Optional:
            'pos_label', 'train_mode', 'datetime_series_col'.

    Returns:
        Dict with 'no_experiment', 'experiment_conf' and 'train_job_conf'.

    Raises:
        ValueError: on any invalid/inconsistent parameter.
    """
    # 1. read params
    label_col = util.require_in_dict(req_dict, 'label_col', str)
    pos_label = util.get_from_dict(req_dict, 'pos_label', object)
    train_mode = util.get_from_dict(req_dict, 'train_mode', str)
    partition_strategy = util.require_in_dict(req_dict, 'partition_strategy', str)
    dataset_name = util.require_in_dict(req_dict, 'dataset_name', str)
    holdout_percentage = util.require_in_dict(req_dict, 'holdout_percentage', int)
    # todo check datetime_series_col
    datetime_series_col = util.get_from_dict(req_dict, 'datetime_series_col', str)

    experiment_engine = util.require_in_dict(req_dict, 'experiment_engine', str)
    if experiment_engine not in [FrameworkType.GBM, FrameworkType.DeepTables]:
        raise ValueError(f"Unseen experiment_engine {experiment_engine}")

    # 2. check partition_strategy
    cross_validation = None
    train_validation_holdout = None
    if partition_strategy == ExperimentConf.PartitionStrategy.CrossValidation:
        cross_validation_dict = util.require_in_dict(req_dict, 'cross_validation', dict)
        n_folds = util.require_in_dict(cross_validation_dict, 'n_folds', int)
        if 1 < n_folds <= 50:
            cross_validation = CrossValidation(n_folds=n_folds, holdout_percentage=holdout_percentage)
        else:
            raise ValueError(f"1 < n_folds <= 50 but current is: {n_folds}")
    elif partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
        train_validation_holdout_dict = util.require_in_dict(req_dict, 'train_validation_holdout', dict)
        train_percentage = util.require_in_dict(train_validation_holdout_dict, 'train_percentage', int)
        validation_percentage = util.require_in_dict(train_validation_holdout_dict, 'validation_percentage', int)
        if train_percentage + validation_percentage + holdout_percentage != 100:
            raise ValueError("train_percentage plus validation_percentage plus holdout_percentage should equal 100.")
        train_validation_holdout = TrainValidationHoldout(train_percentage=train_percentage,
                                                          validation_percentage=validation_percentage,
                                                          holdout_percentage=holdout_percentage)
    else:
        raise ValueError(f"Unknown partition strategy = {partition_strategy}")

    # 3. retrieve data
    with db.open_session() as s:
        # 3.1. check dataset
        dataset = self.dataset_dao.require_by_name(s, dataset_name)
        if dataset is None:
            raise ValueError(f"Dataset={dataset_name} not exists.")
        dataset_stats = dataset.to_dataset_stats()
        # 3.2. generate new experiment number
        no_experiment = self.model_dao.get_max_experiment(s, dataset_name) + 1

    # 4. ensure dataset label is latest
    # fix: these were two independent `if`s; when label_col was None the
    # second condition (None != label_col) was also true, so
    # _handle_label_col ran twice. `elif` makes the branches exclusive.
    if dataset_stats.label_col is None:
        log.info(f"Dataset {dataset_name} label_col not set now, update to {label_col}")
        self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)
    elif dataset_stats.label_col != label_col:
        log.info(f"Dataset {dataset_name} label_col current is {dataset_stats.label_col}, but this experiment update to {label_col}")
        self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)

    # 5. calc task type
    # 5.1. find label
    label_f = self._find_feature(dataset_stats.features, label_col)
    if label_f is None:
        raise ValueError(f"Label col = {label_col} is not in dataset {dataset_name} .")
    task_type = self._infer_task_type(label_f)

    # 5.2. check pos_label (required and non-empty for binary classification)
    if task_type == TaskType.BinaryClassification:
        if pos_label is None:
            raise ValueError("Pos label can not be None when it's binary-classify")
        else:
            if isinstance(pos_label, str):
                if len(pos_label) < 1:
                    raise ValueError("Pos label can not be empty when it's binary-classify")

    # 6. run experiment
    if not dataset_stats.has_header:
        dataset_default_headers = [f.name for f in dataset_stats.features]
    else:
        dataset_default_headers = None

    conf = ExperimentConf(dataset_name=dataset_name,
                          dataset_has_header=dataset_stats.has_header,
                          dataset_default_headers=dataset_default_headers,
                          train_mode=train_mode,
                          label_col=label_col,
                          pos_label=pos_label,
                          task_type=task_type,
                          partition_strategy=partition_strategy,
                          cross_validation=cross_validation,
                          train_validation_holdout=train_validation_holdout,
                          datetime_series_col=datetime_series_col,
                          file_path=dataset_stats.file_path)

    # Model input features = all dataset features except the label column.
    model_input_features = list(map(lambda _: ModelFeature(name=_.name, type=_.type, data_type=_.data_type).to_dict(),
                                    filter(lambda _: _.name != label_f.name, dataset_stats.features)))

    # experiment_engine is already validated to be GBM or DeepTables, so it
    # can be passed through directly (was a redundant if/else dispatch).
    train_conf = self.run_train_job(experiment_engine, conf, no_experiment, model_input_features, dataset_stats.n_rows)

    return {
        "no_experiment": no_experiment,
        "experiment_conf": conf.to_dict(),
        "train_job_conf": train_conf
    }
def create_temporary_dataset(self, req_dict):
    """Create a temporary dataset from an uploaded or imported file.

    Args:
        req_dict: requires 'upload_took', 'file_path' and 'source_type'
            ('upload' or 'import' variants of DatasetEntity.SourceType);
            optional 'sample_strategy' (default 'random_rows') with its
            strategy-specific parameter ('percentage' or 'n_rows').

    Returns:
        Whatever ``_create_temporary_dataset`` returns for the created
        temporary dataset.

    Raises:
        ValueError: bad sample strategy, bad upload path, or missing file.
        IllegalParamException: unsupported source_type.
    """
    # 1. read params & resolve the sampling configuration
    sample_strategy = util.require_in_dict(req_dict, 'sample_strategy', str, 'random_rows')
    if SampleConf.Strategy.Percentage == sample_strategy:
        percentage = util.get_from_dict(req_dict, 'percentage', int, 30)
        n_rows = None
    elif SampleConf.Strategy.RandomRows == sample_strategy:
        n_rows = util.get_from_dict(req_dict, 'n_rows', int, 1000)
        percentage = None
    elif SampleConf.Strategy.WholeData == sample_strategy:
        n_rows = None
        percentage = None
    else:
        raise ValueError(f"Not support sample strategy: {sample_strategy}")

    upload_took = util.require_in_dict(req_dict, 'upload_took', float)
    file_path = util.require_in_dict(req_dict, 'file_path', str)
    source_type = util.require_in_dict(req_dict, 'source_type', str)
    sample_conf = SampleConf(sample_strategy=sample_strategy, percentage=percentage, n_rows=n_rows)

    # 2. validate params
    if source_type not in [DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import]:
        raise IllegalParamException('source_type', source_type,
                                    f'Should in {",".join([DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import])}')

    if source_type == DatasetEntity.SourceType.Upload:
        # Uploaded files must live under the temporary upload directory.
        upload_file_prefix = P.join(consts.FIELD_TMP, consts.FIELD_UPLOAD)
        if not file_path.startswith(upload_file_prefix):
            # fix: original message was garbled ("For upload file should path should start with ...")
            raise ValueError(f"For upload file, path should start with {upload_file_prefix} but it's {file_path}")
    else:
        # fix relative path and verify the import source exists
        file_path = P.join(consts.DATA_DIR, file_path)
        if not P.exists(file_path):
            raise ValueError(f"File={file_path} not exists")
        if not P.isfile(file_path):
            raise ValueError(f"File={file_path} is not a file")

    util.validate_sample_conf(sample_conf)

    # 3. create
    if source_type == DatasetEntity.SourceType.Upload:
        return self._create_temporary_dataset(source_type, file_path, upload_took, sample_conf)
    elif source_type == DatasetEntity.SourceType.Import:
        # Copy the imported file into the managed upload area, timing the copy
        # so it can be reported like an upload duration.
        t1 = time.time()
        internal_path = util.temporary_upload_file_path(P.basename(file_path))
        os.makedirs(P.dirname(internal_path), exist_ok=True)
        shutil.copy(file_path, internal_path)
        took = time.time() - t1
        logger.info(f"Copy file to {internal_path}")
        return self._create_temporary_dataset(source_type, internal_path, took, sample_conf)
    else:
        # Unreachable after the validation above; kept as a defensive guard.
        raise IllegalParamException('source_type', source_type,
                                    f'should one of {",".join([DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import])}')