def train_model_and_evaluate(self, dataset_name, req_dict):
    os.environ["ASYNC_TEST_TIMEOUT"] = "10"
    portal = self.get_url("")
    consts.SERVER_PORTAL = portal

    # 1. create a train job
    self.assert_response_and_get(
        self.fetch(f'/api/dataset/{dataset_name}/feature-series/default/train-job',
                   method="POST",
                   body=util.dumps(req_dict),
                   headers=self.DEFAULT_HEADER))

    # 2. poll train status
    max_times = 30
    for i in range(max_times):
        time.sleep(1)
        resp_poll = self.assert_response_and_get(
            self.fetch(f'/api/dataset/{dataset_name}/feature-series/default/train-job',
                       method="GET",
                       headers=self.DEFAULT_HEADER))
        print("resp_poll")
        print(util.dumps(resp_poll))

        trainings = resp_poll['trainings']
        assert len(trainings) == 1
        experiment_dict = trainings[0]
        assert experiment_dict['no_experiment'] == 1

        models_dict_list = experiment_dict['models']
        assert len(models_dict_list) > 0
        model_dict = models_dict_list[0]
        assert 'name' in model_dict
        assert 'status' in model_dict
        assert 'escaped' in model_dict
        assert 'model_file_size' in model_dict
        assert 'log_file_path' in model_dict

        status = model_dict.get('status')
        model_name = model_dict.get('name')
        final_status = ["failed", "succeed"]
        if status in final_status:
            assert status == "succeed"
            return model_name

        if i == max_times - 1:
            raise Exception("Train timeout.")
def callback(url, type, status, took, extension, **kwargs):
    req_body_dict = {
        "type": type,
        "status": status,
        "took": took,
        "datetime": util.get_now_long(),
        "extension": extension
    }
    req_body = util.dumps(req_body_dict)
    logger.info(f"Send process event: \n{url}\n{req_body}")
    # Note: the http body must already be bytes, otherwise "requests" encodes str payloads as iso-8859-1
    response = requests.post(url, data=req_body.encode('utf-8'), timeout=TIMEOUT, headers=HEADERS)
    _checkout_response_json(response)
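# Usage sketch for `callback`: posts one process event back to the server.
# The endpoint URL, step type, and metric values below are hypothetical,
# shown only to illustrate the shape of an event:
def _example_send_evaluate_event():
    callback(url="http://localhost:8000/api/train-job/job_abc/step",  # hypothetical endpoint
             type="evaluate",   # hypothetical step type value
             status="succeed",
             took=1.25,
             extension={"performance": {"metrics": {"roc_auc": 0.83}}})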
def create_dataset_from_file(self, file_path):
    # 1. create temporary dataset
    temporary_dataset_name = self.create_temporary_dataset_from_file(file_path)

    # 2. create dataset and validate
    body = {
        "dataset_name": temporary_dataset_name,
        "temporary_dataset_name": temporary_dataset_name
    }
    create_response = self.fetch('/api/dataset',
                                 method="POST",
                                 body=util.dumps(body),
                                 headers=self.DEFAULT_HEADER)
    self.assert_response_and_get(create_response)
    return temporary_dataset_name
def create_dataset(self):
    # 1. create temporary dataset
    temporary_dataset_name = self.create_temporary_dataset()

    # 2. create dataset and validate
    dataset_name = f"diabetes_{util.short_uuid()}"
    body = {
        "dataset_name": dataset_name,
        "temporary_dataset_name": temporary_dataset_name
    }
    create_response = self.fetch('/api/dataset',
                                 method="POST",
                                 body=util.dumps(body),
                                 headers=self.DEFAULT_HEADER)
    self.assert_response_and_get(create_response)
    return dataset_name
def add_predict_process_step(self, model_name: str, job_name: str, step: JobStep):
    step_type = step.type
    with db.open_session() as s:
        # 1. check the model exists
        model = self.model_dao.require_by_name(s, model_name)

        # 2. check event type, one type one record
        messages = s.query(MessageEntity).filter(MessageEntity.author == job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                raise Exception(f"Event type = {step_type} already exists.")

        # 3. create a new message
        content = util.dumps(step.to_dict())
        message = MessageEntity(id=util.short_uuid(),
                                author=job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)
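# A minimal sketch of recording one predict step. The step type string
# "load_model" and the model/job names are hypothetical; the `JobStep`
# keyword arguments follow the constructor usage in `_create_temporary_dataset`:
def _example_add_predict_step(service):
    step = JobStep(type="load_model",  # hypothetical predict step type
                   status=JobStep.Status.Succeed,
                   extension={},
                   took=0.5,
                   datetime=util.get_now_long())
    service.add_predict_process_step(model_name="model_1",            # hypothetical model
                                     job_name="batch_predict_job_1",  # hypothetical job
                                     step=step)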
def to_json(self):
    result = {"code": self.code, "data": self.data}
    return util.dumps(result)
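# Sketch of the wire format `to_json` produces, assuming `self.code` is an
# int status code and `self.data` the payload (as the attribute names suggest).
# `util.dumps` is this project's JSON serializer, so the result is a JSON string:
#
#   {"code": 0, "data": {"name": "model_1"}}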
def add_train_process_step(self, train_job_name, req_dict):
    # [1]. read & check params
    step_type = util.require_in_dict(req_dict, 'type', str)
    step_status = util.require_in_dict(req_dict, 'status', str)
    step_extension = util.get_from_dict(req_dict, 'extension', dict)

    if step_type not in [TrainStep.Types.Load, TrainStep.Types.Optimize, TrainStep.Types.OptimizeStart,
                         TrainStep.Types.Persist, TrainStep.Types.Evaluate, TrainStep.Types.FinalTrain,
                         TrainStep.Types.Searched]:
        raise ValueError(f"Unknown step type = {step_type}")

    if step_status not in [JobStep.Status.Succeed, JobStep.Status.Failed]:
        raise ValueError(f"Unknown status = {step_status}")

    # [2]. save message
    with db.open_session() as s:
        # [2.1]. check the model exists
        model = self.model_dao.find_by_train_job_name(s, train_job_name)
        model_name = model.name

        # [2.2]. check event type, one type one record
        messages = s.query(MessageEntity).filter(MessageEntity.author == train_job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                if step_type not in [TrainStep.Types.OptimizeStart, TrainStep.Types.Optimize]:
                    raise Exception(f"Event type = {step_type} already exists.")

        # [2.3]. create a new message
        content = util.dumps(req_dict)
        message = MessageEntity(id=util.short_uuid(),
                                author=train_job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)

        # [2.4]. handle the train event
        current_progress = model.progress  # todo check in code body
        self._check_progress_change(step_type, current_progress)

        # add failed status
        if step_type == TrainStep.Types.Evaluate:
            if step_status == JobStep.Status.Succeed:
                self._update_model(s, model_name, step_type,
                                   {"performance": step_extension['performance']})
            else:
                self._update_model(s, model_name, step_type,
                                   {"status": ModelStatusType.Failed,
                                    "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.Load:
            if step_status == JobStep.Status.Succeed:
                self._update_model(s, model_name, step_type, {"status": ModelStatusType.Running})
            else:
                self._update_model(s, model_name, step_type,
                                   {"status": ModelStatusType.Failed,
                                    "finish_datetime": util.get_now_datetime()})
        elif step_type == TrainStep.Types.OptimizeStart:
            pass
            # train_trail_no = step_extension.get('trail_no')
            # if train_trail_no is None or not isinstance(train_trail_no, int):
            #     raise ValueError(f"Param trail_no can not be None and should be int but is : {train_trail_no}")
            # # upload trail number
            # self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no})
        elif step_type == TrainStep.Types.Optimize:
            train_trail_no = step_extension.get('trail_no')
            # update trails: load current trails and append the new one
            trails = model.trails
            if model.trails is None:
                trails = []
            trails.append(step_extension)
            self._update_model(s, model_name, step_type,
                               {"train_trail_no": train_trail_no,
                                "score": step_extension.get('reward'),
                                "trails": trails})
        elif step_type == TrainStep.Types.Persist:
            model_file_size = step_extension['model_file_size']
            self._update_model(s, model_name, step_type,
                               {"model_file_size": model_file_size,
                                "status": ModelStatusType.Succeed,
                                "finish_datetime": util.get_now_datetime()})
        else:
            self._update_model(s, model_name, step_type, {})
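# Shape of a `req_dict` this method accepts for an Optimize step, inferred
# from the fields read above ('trail_no' and 'reward' inside 'extension');
# the concrete values are hypothetical:
def _example_optimize_step_dict():
    return {
        "type": TrainStep.Types.Optimize,
        "status": JobStep.Status.Succeed,
        "extension": {
            "trail_no": 3,      # search trail counter, stored as train_trail_no
            "reward": 0.8123,   # becomes the model's score
        }
    }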
def generate_code(self, model_name, model_input_features, n_rows,
                  train_job_conf: TrainJobConf, experiment_conf: ExperimentConf):
    # 1. set earlystopping; values are strings because they are rendered
    # into the jinja2 template as python literals
    if experiment_conf.train_mode == TrainMode.Minimal:
        earlystopping_patience = "[1]"
    else:
        if n_rows <= 1000:
            earlystopping_patience = "[10, 50, 100]"
        elif 1000 < n_rows <= 10000:
            earlystopping_patience = "[5, 10, 15]"
        else:
            earlystopping_patience = "[1, 3, 5]"

    # 2. set default header if has no header
    if not experiment_conf.dataset_has_header:
        if experiment_conf.dataset_default_headers is not None:
            dataset_default_headers_code = util.dumps(experiment_conf.dataset_default_headers, indent=None)
        else:
            raise ValueError("When dataset_has_header is False, param dataset_default_headers can not be None.")
    else:
        dataset_default_headers_code = None

    # 3. make render params
    pos_label = experiment_conf.pos_label
    pos_label_is_str = isinstance(pos_label, str)
    reward_metric = self.get_optimize_metric(experiment_conf.task_type, train_job_conf.framework)
    params_dict = {
        "server_portal": consts.SERVER_PORTAL,
        "train_job_name": train_job_conf.name,
        "data_root": consts.DATA_DIR,
        "model_name": model_name,
        "pos_label": pos_label,
        "earlystopping_patience": earlystopping_patience,
        "pos_label_is_str": pos_label_is_str,
        "train_file_path": experiment_conf.file_path,
        "test_file_path": experiment_conf.test_file_path,
        "task_type": experiment_conf.task_type,
        "gbm_task_type": experiment_conf.task_type,
        "dataset_name": experiment_conf.dataset_name,
        "label_col": experiment_conf.label_col,
        "pos_label_value": experiment_conf.pos_label,
        "train_mode": experiment_conf.train_mode,
        "partition_strategy": experiment_conf.partition_strategy,
        "datetime_series_col": experiment_conf.datetime_series_col,
        "reward_metric": reward_metric,
        "optimize_direction": self.get_direction_source_code(reward_metric),
        "framework": train_job_conf.framework,
        "max_trails": train_job_conf.max_trails,
        "dataset_has_header": experiment_conf.dataset_has_header,
        "dataset_default_headers": dataset_default_headers_code,
        "model_feature_list": util.dumps(model_input_features, indent=None)
    }

    if experiment_conf.partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
        params_dict['holdout_percentage'] = experiment_conf.train_validation_holdout.holdout_percentage
        params_dict["train_percentage"] = experiment_conf.train_validation_holdout.train_percentage
        params_dict["validation_percentage"] = experiment_conf.train_validation_holdout.validation_percentage
    else:
        params_dict['holdout_percentage'] = experiment_conf.cross_validation.holdout_percentage
        params_dict["n_folds"] = experiment_conf.cross_validation.n_folds

    # 4. render raw python
    template_dir = P.join(consts.PATH_INSTALL_HOME, 'cooka', 'core', 'train_template')
    train_template_file = P.join(template_dir, 'target_raw_python.jinja2')
    with open(train_template_file, 'r') as f:
        raw_python_content = Environment(loader=FileSystemLoader(template_dir)).from_string(f.read()).render(params_dict)

    # 5. render notebook
    import copy
    params_dict_notebook = copy.deepcopy(params_dict)
    params_dict_notebook['target_source_type'] = 'notebook'

    def render_file_for_nb(name, comment=None):
        file_content = util.read_text(P.join(template_dir, name))
        c = Environment(loader=FileSystemLoader(template_dir)).from_string(file_content).render(params_dict_notebook)
        cell_code = ExperimentService.generate_notebook_cell(c, 'code')
        if comment is not None:
            cell_comment = ExperimentService.generate_notebook_cell(comment, 'markdown')
        else:
            cell_comment = None
        return cell_code, cell_comment

    task_type_name_dict = {
        TaskType.MultiClassification: "Multi Classification",
        TaskType.BinaryClassification: "Binary Classification",
        TaskType.Regression: "Regression"
    }

    framework_github_dict = {
        FrameworkType.DeepTables: "[DeepTables](https://github.com/DataCanvasIO/DeepTables)",
        FrameworkType.GBM: "[HyperGBM](https://github.com/DataCanvasIO/HyperGBM)",
    }

    example_doc = f"""# Training {experiment_conf.label_col} in {experiment_conf.dataset_name}
{task_type_name_dict[experiment_conf.task_type]} model by {framework_github_dict.get(train_job_conf.framework)}, generated on {util.human_std_datetime()}"""

    cell_example_doc = ExperimentService.generate_notebook_cell(example_doc, 'markdown')

    cell_train_header, cell_train_header_comment = render_file_for_nb('train_header.jinja2')
    cell_train_config, cell_train_config_comment = render_file_for_nb('train_config.jinja2', "## [1]. train config")
    cell_train_data_partition, cell_train_data_partition_comment = render_file_for_nb('train_data_partition.jinja2', "## [2]. data partition")
    cell_train_search, cell_train_search_comment = render_file_for_nb('train_search.jinja2', "## [3]. search best params")
    # cell_train_final_train, cell_train_final_train_comment = render_file_for_nb('train_final_train.jinja2', "## [4]. final train")
    cell_train_evaluate, cell_train_evaluate_comment = render_file_for_nb('train_evaluate.jinja2', "## [4]. evaluate")

    cells = [cell_example_doc, cell_train_header,
             cell_train_config_comment, cell_train_config,
             cell_train_data_partition_comment, cell_train_data_partition,
             cell_train_search_comment, cell_train_search,
             cell_train_evaluate_comment, cell_train_evaluate]

    if experiment_conf.task_type == TaskType.BinaryClassification:
        cell_evaluate_confusion_matrix, cell_evaluate_confusion_matrix_comment = render_file_for_nb('plot_confusion_matrix.jinja2', "## [5]. plot confusion matrix")
        cell_evaluate_plot_roc_curve, cell_evaluate_plot_roc_curve_comment = render_file_for_nb('plot_roc_curve.jinja2', "## [6]. plot roc curve")
        cells.append(cell_evaluate_confusion_matrix_comment)
        cells.append(cell_evaluate_confusion_matrix)
        cells.append(cell_evaluate_plot_roc_curve_comment)
        cells.append(cell_evaluate_plot_roc_curve)

    if train_job_conf.framework == FrameworkType.GBM:
        cell_plot_feature_importance, plot_feature_importance_comment = render_file_for_nb('plot_feature_importance.jinja2', "## Plot feature importance")
        cells.append(plot_feature_importance_comment)
        cells.append(cell_plot_feature_importance)

    if train_job_conf.framework == FrameworkType.DeepTables:
        content_dt_explainer = """dt_explainer = DeepTablesExplainer(estimator, X_test, num_samples=100)
shap_values = dt_explainer.get_shap_values(X_test[:1], nsamples='auto')"""
        cell_dt_explainer = ExperimentService.generate_notebook_cell(content_dt_explainer)
        cells.append(cell_dt_explainer)

        feature_importances = """shap.summary_plot(shap_values, X_test, plot_type="bar")"""
        cells.append(ExperimentService.generate_notebook_cell(feature_importances))

        prediction_explainer = """shap.decision_plot(dt_explainer.explainer.expected_value, shap_values[0], X_test.iloc[0])"""
        cells.append(ExperimentService.generate_notebook_cell(prediction_explainer))

    notebook_dict = ExperimentService.generate_notebook(cells)
    notebook_content = util.dumps(notebook_dict)

    return raw_python_content, notebook_content
def _create_temporary_dataset(self, source_type, file_path, took, sample_conf: SampleConf):
    # 1. prepare names and file info
    now = util.get_now_datetime()
    file_name = P.basename(file_path)
    temporary_dataset_name = self.choose_temporary_dataset_name(file_name)  # use a long name
    analyze_job_name = util.analyze_data_job_name(util.cut_suffix(file_name), now)
    file_size = P.getsize(file_path)

    # 2. create record
    td = DatasetEntity(name=temporary_dataset_name,
                       file_size=file_size,
                       is_temporary=True,
                       status=DatasetEntity.Status.Created,
                       source_type=source_type,
                       file_path=file_path,
                       file_name=file_name,
                       create_datetime=now,
                       last_update_datetime=now)
    with db.open_session() as s:
        s.add(td)

    # 3. send file transfer step
    if source_type == DatasetEntity.SourceType.Upload:
        step = JobStep(type=AnalyzeStep.Types.Upload,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)
    elif source_type == DatasetEntity.SourceType.Import:
        step = JobStep(type=AnalyzeStep.Types.Copy,
                       status=AnalyzeStep.Status.Succeed,
                       extension={"file_size": file_size, "file_path": file_path},
                       took=took,
                       datetime=util.get_now_long())
        self.add_analyze_process_step(temporary_dataset_name, analyze_job_name, step)

    # 4. create analyze config
    conf = AnalyzeJobConf(job_name=analyze_job_name,
                          dataset_name=temporary_dataset_name,
                          sample_conf=sample_conf,
                          path=file_path,
                          temporary_dataset=True,
                          label_col=None)

    # 5. start new process
    analyze_config_string = util.dumps(conf.to_dict())
    logger.info(f"Analyze job conf: {analyze_config_string}")
    python_executable = sys.executable
    temporary_dataset_dir = util.temporary_dataset_dir(temporary_dataset_name)
    os.makedirs(temporary_dataset_dir, exist_ok=True)
    std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")
    command = (f"nohup {python_executable} {util.script_path('analyze_job.py')} "
               f"--file_path={file_path} "
               f"--job_name={analyze_job_name} "
               f"--dataset_name={temporary_dataset_name} "
               f"--sample_strategy={sample_conf.sample_strategy} "
               f"--n_rows={self.replace_None(sample_conf.n_rows)} "
               f"--percentage={self.replace_None(sample_conf.percentage)} "
               f"--server_portal={consts.SERVER_PORTAL} "
               f"1>{std_log} 2>&1 &")

    logger.info(f"Run analyze job command: \n{command}")
    logger.info(f"Log file:\ntail -f {std_log}")
    # JobManager.instance().run_job(job)
    os.system(command)  # fire-and-forget: nohup + '&' detaches the analyze process
    return temporary_dataset_name, analyze_job_name
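# The rendered command looks roughly like the line below (all paths and names
# hypothetical; `replace_None` presumably substitutes a placeholder for unset
# sampling params):
#
#   nohup /usr/bin/python3 .../analyze_job.py --file_path=/tmp/diabetes.csv \
#       --job_name=analyze_diabetes_xxx --dataset_name=diabetes_xxx \
#       --sample_strategy=random_rows --n_rows=1000 --percentage=... \
#       --server_portal=http://localhost:8000 1>.../analyze_diabetes_xxx.log 2>&1 &
#
# The detached analyze process reports its progress back to the server
# through the step callbacks handled by `add_analyze_process_step`.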
def add_analyze_process_step(self, dataset_name, analyze_job_name, step: JobStep):
    step_type = step.type
    with db.open_session() as s:
        # 1.1. check dataset exists
        d = s.query(DatasetEntity).filter(DatasetEntity.name == dataset_name).first()
        if d is None:
            raise EntityNotExistsException(DatasetEntity, dataset_name)

        # 1.2. check event type, one type one record
        messages = s.query(MessageEntity).filter(MessageEntity.author == analyze_job_name).all()
        for m in messages:
            if step_type == util.loads(m.content).get('type'):
                raise Exception(f"Event type = {step_type} already exists.")

    # 2. handle event
    with db.open_session() as s:
        # 2.1. create a new message
        content = util.dumps(step.to_dict())
        message = MessageEntity(id=util.short_uuid(),
                                author=analyze_job_name,
                                content=content,
                                create_datetime=util.get_now_datetime())
        s.add(message)

        # 2.2. handle analyze event
        if step_type == AnalyzeStep.Types.Analyzed:
            # update temporary dataset
            # todo handle failed analyze
            if step.status == JobStep.Status.Succeed:
                hints = step.extension.pop("hints")
                d_stats = DatasetStats.load_dict(step.extension)
                features_str = [f.to_dict() for f in d_stats.features]
                update_fields = {
                    "has_header": d_stats.has_header,
                    "extension": step.extension,
                    "n_cols": d_stats.n_cols,
                    "n_rows": d_stats.n_rows,
                    "features": features_str,
                    "hints": hints,
                    "feature_summary": d_stats.feature_summary.to_dict(),
                    "status": DatasetEntity.Status.Analyzed
                }
            else:
                update_fields = {"status": DatasetEntity.Status.Failed}
            self.dataset_dao.update_by_name(s, dataset_name, update_fields)
        elif step_type == AnalyzeStep.Types.PatchCorrelation:
            # 1. check dataset status, only an analyzed dataset can have correlation calculated
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            if dataset.status != AnalyzeStep.Types.Analyzed:
                raise ValueError(f"Dataset {dataset_name} status is not {AnalyzeStep.Types.Analyzed}.")
            request_label_col = step.extension.get("label_col")
            if request_label_col != dataset.label_col:
                raise ValueError(f"Dataset {dataset_name} label col is {dataset.label_col} "
                                 f"but received result is for {request_label_col}")

            # 2. read extension
            corr_dict = step.extension.get('corr')

            # 3. load & update features
            features = dataset.to_dataset_stats().features
            for f in features:
                correlation = corr_dict.get(f.name)
                f.correlation = FeatureCorrelation(
                    value=correlation,
                    status=FeatureCorrelation.calc_status(correlation, request_label_col == f.name))

            # 4. sort features by absolute correlation, descending
            features = sorted(features, key=lambda f: abs(f.correlation.value), reverse=True)
            feature_dict_list = [f.to_dict() for f in features]

            # 5. push back to database
            self.dataset_dao.update_by_name(s, dataset_name, {"features": feature_dict_list})
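# Shape of a PatchCorrelation step this method accepts, inferred from the
# fields read above ('label_col' and the per-feature 'corr' mapping);
# feature names and correlation values are hypothetical:
def _example_patch_correlation_step():
    return JobStep(type=AnalyzeStep.Types.PatchCorrelation,
                   status=JobStep.Status.Succeed,
                   extension={
                       "label_col": "readmitted",
                       "corr": {"age": 0.21, "race": -0.05, "readmitted": 1.0}
                   },
                   took=3.2,
                   datetime=util.get_now_long())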
def test_binary_classification_model(self):
    # 1. create a dataset
    dataset_name = self.create_dataset()

    # 2. create a train job
    req_dict = {
        "label_col": "readmitted",
        "pos_label": True,
        "train_mode": "minimal",
        "partition_strategy": "cross_validation",
        "holdout_percentage": 20,
        "cross_validation": {"n_folds": 2},
        "datetime_series_col": None
    }
    model_name = self.train_model_and_evaluate(dataset_name, req_dict)

    # 3. check model exists
    model_path = util.model_dir(dataset_name, model_name)
    assert P.exists(model_path)

    # 4. predict file
    req_dict = {
        "upload_took": 10,  # 10 just for test
        "file_path": P.abspath('cooka/test/dataset/diabetes_10k_datetime.csv'),
        "reserved_cols": ['race'],
    }
    resp_batch_predict_job = self.assert_response_and_get(
        self.fetch(f'/api/dataset/{dataset_name}/feature-series/default/model/{model_name}/batch-predict-job',
                   method="POST",
                   body=util.dumps(req_dict),
                   headers=self.DEFAULT_HEADER))
    assert 'batch_predict_job_name' in resp_batch_predict_job
    batch_predict_job_name = resp_batch_predict_job.get('batch_predict_job_name')

    # 5. poll batch job status
    max_poll_times = 30  # try 30 times
    for i in range(max_poll_times):
        time.sleep(1)
        resp_poll_batch_predict_job = self.assert_response_and_get(
            self.fetch(f'/api/dataset/{dataset_name}/feature-series/default/model/{model_name}/batch-predict-job/{batch_predict_job_name}',
                       method="GET",
                       headers=self.DEFAULT_HEADER))
        assert "batch_predict_job_name" in resp_poll_batch_predict_job
        assert "steps" in resp_poll_batch_predict_job
        print(resp_poll_batch_predict_job)
        batch_predict_steps = resp_poll_batch_predict_job.get('steps')
        if self.check_batch_predict_steps_finished(batch_predict_steps):
            break
        # stop when out of retries
        if i == max_poll_times - 1:
            raise Exception(util.dumps(resp_poll_batch_predict_job))

    # 6. fetch model detail
    resp_model = self.assert_response_and_get(
        self.fetch(f'/api/dataset/{dataset_name}/feature-series/default/model/{model_name}',
                   method="GET",
                   headers=self.DEFAULT_HEADER))

    # 7. check response
    assert "name" in resp_model
    assert "status" in resp_model
    assert "escaped" in resp_model
    assert "model_file_size" in resp_model
    assert "task_type" in resp_model
    assert "framework" in resp_model
    assert "create_datetime" in resp_model
    assert "last_update_datetime" in resp_model
    assert "log_file_path" in resp_model
    assert "performance" in resp_model
    # todo assert "trails" in resp_model

    performance = resp_model.get('performance')
    assert "metrics" in performance
    assert "confusion_matrix" in performance
    # todo assert "roc_curve" in performance

    metrics = performance.get('metrics')
    assert 'roc_auc' in metrics
    # todo assert 'f1' in metrics

    confusion_matrix = performance.get('confusion_matrix')
    assert "fn" in confusion_matrix
    assert "fp" in confusion_matrix
    assert "tn" in confusion_matrix
    assert "tp" in confusion_matrix
    assert "label" in confusion_matrix
    label_dict = confusion_matrix.get('label')
def create_temporary_dataset_from_file(self, data_path):
    """This can not be done in setup_class(cls), setUpClass(cls) or setUp(),
    because the http server is not ready there.
    """
    super(WithTemporaryDatasetTestCase, self).setUp()  # must be invoked or the http server is not created
    from cooka.common import consts
    consts.SERVER_PORTAL = self.get_url('')  # use the temporary server

    # 1. upload
    boundary = uuid.uuid4().hex
    headers = {"Content-Type": "multipart/form-data; boundary=%s" % boundary}
    producer = partial(self.multipart_producer, boundary, data_path)
    upload_response = self.fetch(path='/api/resource',
                                 method="POST",
                                 body_producer=producer,
                                 headers=headers)
    upload_response_body = self.assert_response_and_get(upload_response)
    upload_file_path = upload_response_body.get('path')
    upload_took = upload_response_body.get('took')
    assert upload_file_path is not None
    assert upload_took is not None

    # 2. send create request and validate response code
    body = {
        "sample_strategy": "random_rows",
        "percentage": 30,
        "n_rows": 1000,
        "file_path": upload_file_path,
        "upload_took": upload_took,
        "source_type": "upload",
    }
    str_body = util.dumps(body)
    create_response_body = self.assert_response_and_get(
        self.fetch(path='/api/temporary-dataset',
                   method="POST",
                   body=str_body,
                   headers=headers))
    print(f"create response body:\n {create_response_body}")

    # 3. poll dataset messages
    temporary_dataset_name = create_response_body["temporary_dataset_name"]
    analyze_job_name = create_response_body["analyze_job_name"]
    expected_events = [AnalyzeStep.Types.Upload, AnalyzeStep.Types.Load, AnalyzeStep.Types.Analyzed]
    analyze_passed = False
    poll_job_response_body = None
    for i in range(10):  # poll up to 10 times, once per second
        poll_job_response = self.fetch(f'/api/dataset/{temporary_dataset_name}/analyze-job/{analyze_job_name}',
                                       method="GET")
        poll_job_response_body = self.assert_response_and_get(poll_job_response)
        events = poll_job_response_body['steps']
        events.sort(key=lambda x: x['datetime'])
        events_type = [event["type"] for event in events]
        if expected_events == events_type:  # has all expected types, ordered by datetime
            analyze_passed = True
            break
        if AnalyzeStep.Types.End in events_type:
            break
        time.sleep(1)
    assert analyze_passed, f"{poll_job_response_body}"

    # 4. retrieve dataset and check detail
    with db.open_session() as s:
        temporary_dataset = s.query(DatasetEntity).filter(
            DatasetEntity.name == temporary_dataset_name).first()
        assert temporary_dataset is not None, f'Temporary dataset = {temporary_dataset_name} create failed'
        assert len(temporary_dataset.extension) > 0
        assert temporary_dataset.status == DatasetEntity.Status.Analyzed

    return temporary_dataset_name