Exemple #1
0
    def train_model_and_evaluate(self, dataset_name, req_dict):
        """Create a train job for *dataset_name* and poll until its model
        reaches a final status; return the model name on success.

        Raises:
            AssertionError: when the model finishes with status "failed"
                or a poll response is malformed.
            Exception: when training does not finish within ~30 seconds.
        """
        os.environ["ASYNC_TEST_TIMEOUT"] = "10"
        consts.SERVER_PORTAL = self.get_url("")

        train_job_url = f'/api/dataset/{dataset_name}/feature-series/default/train-job'

        # 1. create a train job
        self.assert_response_and_get(
            self.fetch(train_job_url,
                       method="POST",
                       body=util.dumps(req_dict),
                       headers=self.DEFAULT_HEADER))

        # 2. poll train status once per second
        max_times = 30
        for attempt in range(max_times):
            time.sleep(1)
            resp_poll = self.assert_response_and_get(
                self.fetch(train_job_url,
                           method="GET",
                           headers=self.DEFAULT_HEADER))
            print("resp_poll")
            print(util.dumps(resp_poll))

            trainings = resp_poll['trainings']
            assert len(trainings) == 1
            experiment_dict = trainings[0]
            assert experiment_dict['no_experiment'] == 1

            models_dict_list = experiment_dict['models']
            assert len(models_dict_list) > 0
            model_dict = models_dict_list[0]

            # response shape sanity checks
            for key in ('name', 'status', 'escaped',
                        'model_file_size', 'log_file_path'):
                assert key in model_dict

            status = model_dict.get('status')
            if status in ("failed", "succeed"):
                assert status == "succeed"
                return model_dict.get('name')

            if attempt == max_times - 1:
                raise Exception("Train timeout.")
Exemple #2
0
def callback(url, type, status, took, extension, **kwargs):
    """POST a process-event JSON payload to *url* and validate the reply.

    Extra keyword arguments are accepted and ignored so callers may pass a
    superset of fields.
    """
    payload = {
        "type": type,
        "status": status,
        "took": took,
        "datetime": util.get_now_long(),
        "extension": extension,
    }
    req_body = util.dumps(payload)
    logger.info(f"Send process event: \n{url}\n{req_body}")
    # Note: http body should be a bytes or will be encode by "requests" and using iso-8859-1
    response = requests.post(url,
                             headers=HEADERS,
                             timeout=TIMEOUT,
                             data=req_body.encode('utf-8'))
    _checkout_response_json(response)
Exemple #3
0
    def create_dataset_from_file(self, file_path):
        """Create a temporary dataset from *file_path*, then promote it to a
        permanent dataset reusing the temporary name; return that name.
        """
        # 1. create temporary dataset
        temporary_dataset_name = self.create_temporary_dataset_from_file(
            file_path)

        # 2. create dataset and validate
        request_body = {
            "dataset_name": temporary_dataset_name,
            "temporary_dataset_name": temporary_dataset_name,
        }
        self.assert_response_and_get(
            self.fetch('/api/dataset',
                       method="POST",
                       body=util.dumps(request_body),
                       headers=self.DEFAULT_HEADER))
        return temporary_dataset_name
Exemple #4
0
    def create_dataset(self):
        """Create a temporary dataset and promote it to a permanent dataset
        under a fresh unique name; return the new dataset name.
        """
        # 1. create temporary dataset
        temporary_dataset_name = self.create_temporary_dataset()

        # 2. create dataset and validate
        dataset_name = f"diabetes_{util.short_uuid()}"
        request_body = {
            "dataset_name": dataset_name,
            "temporary_dataset_name": temporary_dataset_name,
        }
        self.assert_response_and_get(
            self.fetch('/api/dataset',
                       method="POST",
                       body=util.dumps(request_body),
                       headers=self.DEFAULT_HEADER))
        return dataset_name
Exemple #5
0
    def add_predict_process_step(self, model_name: str, job_name: str,
                                 step: JobStep):
        """Persist one predict-job step as a message.

        Raises when a message of the same step type already exists for
        *job_name* (one type, one record) or the model does not exist.
        """
        step_type = step.type
        with db.open_session() as s:
            # 1. check temporary model exists (raises if missing)
            self.model_dao.require_by_name(s, model_name)

            # 2. check event type, one type one record
            existing = s.query(MessageEntity).filter(
                MessageEntity.author == job_name).all()
            if any(util.loads(m.content).get('type') == step_type
                   for m in existing):
                raise Exception(
                    f"Event type = {step_type} already exists .")

            # 3. create a new message
            s.add(MessageEntity(id=util.short_uuid(),
                                author=job_name,
                                content=util.dumps(step.to_dict()),
                                create_datetime=util.get_now_datetime()))
Exemple #6
0
 def to_json(self):
     """Serialize this response as a JSON string of its code and data."""
     payload = {"code": self.code, "data": self.data}
     return util.dumps(payload)
    def add_train_process_step(self, train_job_name, req_dict):
        """Record one training-step event and update the model row.

        req_dict must contain 'type' (str) and 'status' (str); it may carry
        an 'extension' dict with step-specific payload (performance,
        model_file_size, trail info, ...).

        Raises:
            ValueError: unknown step type or status.
            Exception: a non-repeatable step type was already recorded for
                this train job.
        """
        # [1]. read & check params
        step_type = util.require_in_dict(req_dict, 'type', str)
        step_status = util.require_in_dict(req_dict, 'status', str)
        step_extension = util.get_from_dict(req_dict, 'extension', dict)

        if step_type not in [TrainStep.Types.Load, TrainStep.Types.Optimize, TrainStep.Types.OptimizeStart, TrainStep.Types.Persist, TrainStep.Types.Evaluate, TrainStep.Types.FinalTrain, TrainStep.Types.Searched]:
            raise ValueError(f"Unknown step type = {step_type}")

        if step_status not in [JobStep.Status.Succeed, JobStep.Status.Failed]:
            raise ValueError(f"Unknown status = {step_status}")

        # [2]. save message
        with db.open_session() as s:
            # [2.1].  check temporary model exists
            model = self.model_dao.find_by_train_job_name(s, train_job_name)
            model_name = model.name
            # [2.2]. check event type, one type one record
            # (Optimize / OptimizeStart may legitimately repeat, once per trail)
            messages = s.query(MessageEntity).filter(MessageEntity.author == train_job_name).all()
            for m in messages:
                if step_type == util.loads(m.content).get('type'):
                    if step_type not in [TrainStep.Types.OptimizeStart, TrainStep.Types.Optimize]:
                        raise Exception(f"Event type = {step_type} already exists .")

            # [2.3]. create a new message
            content = util.dumps(req_dict)
            message = MessageEntity(id=util.short_uuid(), author=train_job_name, content=content, create_datetime=util.get_now_datetime())
            s.add(message)

            # [2.4]. handle analyze event
            current_progress = model.progress
            # todo check in code body self._check_progress_change(step_type, current_progress)  # add failed status
            if step_type == TrainStep.Types.Evaluate:
                # evaluation done: store performance, or mark failed
                if step_status == JobStep.Status.Succeed:
                    self._update_model(s, model_name, step_type, {"performance": step_extension['performance']})
                else:
                    self._update_model(s, model_name, step_type, {"status": ModelStatusType.Failed, "finish_datetime": util.get_now_datetime()})

            elif step_type == TrainStep.Types.Load:
                # data loaded: training is now running, or failed early
                if step_status == JobStep.Status.Succeed:
                    self._update_model(s, model_name, step_type, {"status": ModelStatusType.Running})
                else:
                    self._update_model(s, model_name, step_type, {"status": ModelStatusType.Failed, "finish_datetime": util.get_now_datetime()})

            elif step_type == TrainStep.Types.OptimizeStart:
                pass
                # train_trail_no = step_extension.get('trail_no')
                # if train_trail_no is None or not isinstance(train_trail_no, int):
                #     raise ValueError(f"Param trail_no can not be None and should be int but is : {train_trail_no}")
                # # upload trail number
                # self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no})

            elif step_type == TrainStep.Types.Optimize:
                train_trail_no = step_extension.get('trail_no')
                # update trails
                # load current trail list and append the new one
                trails = model.trails
                if model.trails is None:
                    trails = []
                trails.append(step_extension)
                self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no, "score": step_extension.get('reward'), "trails": trails})

            elif step_type == TrainStep.Types.Persist:
                # model persisted to disk: record size and mark succeed
                model_file_size = step_extension['model_file_size']
                self._update_model(s, model_name, step_type, {"model_file_size": model_file_size,
                                                              "status": ModelStatusType.Succeed,
                                                              "finish_datetime": util.get_now_datetime()})
            else:
                # FinalTrain / Searched: only touch progress bookkeeping
                self._update_model(s, model_name, step_type, {})
    def generate_code(self, model_name, model_input_features, n_rows, train_job_conf: TrainJobConf, experiment_conf: ExperimentConf):
        # 1. set earlystopping
        if experiment_conf.train_mode == TrainMode.Minimal:
            earlystopping_patience = "[1]"
        else:
            if n_rows <= 1000:
                earlystopping_patience = "[10, 50, 100]"
            elif 1000 < n_rows <= 10000 :
                earlystopping_patience = "[5, 10, 15]"
            else:
                earlystopping_patience = "[1, 3, 5]"

        # 2. set default header if has no header
        if not experiment_conf.dataset_has_header:
            if experiment_conf.dataset_default_headers is not None:
                dataset_default_headers_code = util.dumps(experiment_conf.dataset_default_headers, indent=None)
            else:
                raise ValueError("When dataset_has_header is False then param dataset_default_headers can be None.")
        else:
            dataset_default_headers_code = None

        # 3. make reader params
        pos_label = experiment_conf.pos_label
        pos_label_is_str = isinstance(pos_label, str)
        reward_metric = self.get_optimize_metric(experiment_conf.task_type, train_job_conf.framework)
        params_dict = {
            "server_portal": consts.SERVER_PORTAL,
            "train_job_name": train_job_conf.name,
            "data_root": consts.DATA_DIR,
            "model_name": model_name,
            "pos_label": pos_label,
            "earlystopping_patience": earlystopping_patience,
            "pos_label_is_str": pos_label_is_str,
            "train_file_path": experiment_conf.file_path,
            "test_file_path": experiment_conf.test_file_path,
            "task_type": experiment_conf.task_type,
            "gbm_task_type": experiment_conf.task_type,
            "dataset_name": experiment_conf.dataset_name,
            "label_col": experiment_conf.label_col,
            "pos_label_value": experiment_conf.pos_label,
            "train_mode": experiment_conf.train_mode,
            "partition_strategy": experiment_conf.partition_strategy,
            "datetime_series_col": experiment_conf.datetime_series_col,
            "reward_metric": reward_metric,
            "optimize_direction": self.get_direction_source_code(reward_metric),
            "framework": train_job_conf.framework,
            "max_trails": train_job_conf.max_trails,
            "dataset_has_header": experiment_conf.dataset_has_header,
            "dataset_default_headers": dataset_default_headers_code,
            "model_feature_list": util.dumps(model_input_features, indent=None)
        }

        if experiment_conf.partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
            params_dict['holdout_percentage'] = experiment_conf.train_validation_holdout.holdout_percentage
            params_dict["train_percentage"] = experiment_conf.train_validation_holdout.train_percentage
            params_dict["validation_percentage"] = experiment_conf.train_validation_holdout.validation_percentage
        else:
            params_dict['holdout_percentage'] = experiment_conf.cross_validation.holdout_percentage
            params_dict["n_folds"] = experiment_conf.cross_validation.n_folds

        # 4. render raw python
        template_dir = P.join(consts.PATH_INSTALL_HOME, 'cooka', 'core', 'train_template')
        train_template_file = P.join(template_dir, 'target_raw_python.jinja2')
        with open(train_template_file, 'r') as f:
            raw_python_content = Environment(loader=FileSystemLoader(template_dir)).from_string(f.read()).render(params_dict)

        # 5. render notebook
        import copy
        params_dict_notebook = copy.deepcopy(params_dict)
        params_dict_notebook['target_source_type'] = 'notebook'

        def render_file_for_nb(name, comment=None):
            file_content = util.read_text(P.join(template_dir, name))
            c = Environment(loader=FileSystemLoader(template_dir)).from_string(file_content).render(params_dict_notebook)
            cell_code = ExperimentService.generate_notebook_cell(c, 'code')
            if comment is not None:
                cell_comment = ExperimentService.generate_notebook_cell(comment, 'markdown')
            else:
                cell_comment = None
            return cell_code, cell_comment


        task_type_name_dict = {
            TaskType.MultiClassification: "Multi Classification",
            TaskType.BinaryClassification: "Binary Classification",
            TaskType.Regression: "Regression"
        }
        framework_github_dict = {
            FrameworkType.DeepTables: "[DeepTables](https://github.com/DataCanvasIO/DeepTables)",
            FrameworkType.GBM: "[HyperGBM](https://github.com/DataCanvasIO/HyperGBM)" ,
        }

        example_doc = f"""# Training {experiment_conf.label_col} in {experiment_conf.dataset_name}
{task_type_name_dict[experiment_conf.task_type]} model by {framework_github_dict.get(train_job_conf.framework)}, Generated on {util.human_std_datetime()}"""

        cell_example_doc = ExperimentService.generate_notebook_cell(example_doc, 'markdown')
        cell_train_header, cell_train_header_comment = render_file_for_nb('train_header.jinja2')
        cell_train_config, cell_train_config_comment = render_file_for_nb('train_config.jinja2', "## [1]. train config")
        cell_train_data_partition, cell_train_data_partition_comment = render_file_for_nb('train_data_partition.jinja2', "## [2]. data partition")
        cell_train_search, cell_train_search_comment = render_file_for_nb('train_search.jinja2', "## [3]. search best params")
        # cell_train_final_train, cell_train_final_train_comment = render_file_for_nb('train_final_train.jinja2', "## [4]. final train")
        cell_train_evaluate, cell_train_evaluate_comment = render_file_for_nb('train_evaluate.jinja2', "## [4]. evaluate")

        cells = [cell_example_doc, cell_train_header, cell_train_config_comment, cell_train_config, cell_train_data_partition_comment, cell_train_data_partition, cell_train_search_comment, cell_train_search, cell_train_evaluate_comment, cell_train_evaluate]

        if experiment_conf.task_type == TaskType.BinaryClassification:
            cell_evaluate_confusion_matrix, cell_evaluate_confusion_matrix_comment = render_file_for_nb('plot_confusion_matrix.jinja2', "## 5. Plot confusion matrix")
            cell_evaluate_plot_roc_curve, cell_evaluate_plot_roc_curve_comment = render_file_for_nb('plot_roc_curve.jinja2', "## 6. Plot roc curve")
            cells.append(cell_evaluate_confusion_matrix_comment)
            cells.append(cell_evaluate_confusion_matrix)
            cells.append(cell_evaluate_plot_roc_curve_comment)
            cells.append(cell_evaluate_plot_roc_curve)

        if train_job_conf.framework == FrameworkType.GBM:
            cell_plot_feature_importance, plot_feature_importance_comment = render_file_for_nb('plot_feature_importance.jinja2', "## Plot feature importance")
            cells.append(plot_feature_importance_comment)
            cells.append(cell_plot_feature_importance)

        if train_job_conf.framework == FrameworkType.DeepTables:
            content_dt_explainer = \
            """dt_explainer = DeepTablesExplainer(estimator, X_test, num_samples=100)
shap_values = dt_explainer.get_shap_values(X_test[:1], nsamples='auto')"""
            cell_dt_explaine = ExperimentService.generate_notebook_cell(content_dt_explainer)
            cells.append(cell_dt_explaine)

            feature_importances = """shap.summary_plot(shap_values,X_test, plot_type="bar")"""
            cells.append(ExperimentService.generate_notebook_cell(feature_importances))

            prediction_explainer = """shap.decision_plot(dt_explainer.explainer.expected_value, shap_values[0], X_test.iloc[0])"""
            cells.append(ExperimentService.generate_notebook_cell(prediction_explainer))

        notebook_dict = ExperimentService.generate_notebook(cells)
        notebook_content = util.dumps(notebook_dict)

        return raw_python_content, notebook_content
Exemple #9
0
    def _create_temporary_dataset(self, source_type, file_path, took,
                                  sample_conf: SampleConf):
        """Register a temporary dataset for *file_path* and launch its
        analyze job as a detached background process.

        Args:
            source_type: DatasetEntity.SourceType.Upload or .Import.
            file_path: path of the already-transferred data file.
            took: seconds the upload/copy took (recorded in the first step).
            sample_conf: sampling strategy for analysis.

        Returns:
            (temporary_dataset_name, analyze_job_name)
        """
        # 1. derive names and file metadata
        now = util.get_now_datetime()
        file_name = P.basename(file_path)
        temporary_dataset_name = self.choose_temporary_dataset_name(
            file_name)  # use a long name
        analyze_job_name = util.analyze_data_job_name(
            util.cut_suffix(file_name), now)
        file_size = P.getsize(file_path)

        # 2. create record
        td = DatasetEntity(name=temporary_dataset_name,
                           file_size=file_size,
                           is_temporary=True,
                           status=DatasetEntity.Status.Created,
                           source_type=source_type,
                           file_path=file_path,
                           file_name=file_name,
                           create_datetime=now,
                           last_update_datetime=now)
        with db.open_session() as s:
            s.add(td)

        # 3. send file transfer step: Upload vs Copy depending on origin
        if source_type == DatasetEntity.SourceType.Upload:
            step = JobStep(type=AnalyzeStep.Types.Upload,
                           status=AnalyzeStep.Status.Succeed,
                           extension={
                               "file_size": file_size,
                               "file_path": file_path
                           },
                           took=took,
                           datetime=util.get_now_long())
            self.add_analyze_process_step(temporary_dataset_name,
                                          analyze_job_name, step)
        elif source_type == DatasetEntity.SourceType.Import:
            step = JobStep(type=AnalyzeStep.Types.Copy,
                           status=AnalyzeStep.Status.Succeed,
                           extension={
                               "file_size": file_size,
                               "file_path": file_path
                           },
                           took=took,
                           datetime=util.get_now_long())
            self.add_analyze_process_step(temporary_dataset_name,
                                          analyze_job_name, step)

        # 4. create analyze config
        conf = AnalyzeJobConf(job_name=analyze_job_name,
                              dataset_name=temporary_dataset_name,
                              sample_conf=sample_conf,
                              path=file_path,
                              temporary_dataset=True,
                              label_col=None)

        # 5. start new process
        analyze_config_string = util.dumps(conf.to_dict())
        logger.info(f"Analyze job conf: {analyze_config_string}")

        # run the analyze job with the same interpreter as the server
        python_executable = sys.executable

        temporary_dataset_dir = util.temporary_dataset_dir(
            temporary_dataset_name)

        os.makedirs(temporary_dataset_dir, exist_ok=True)

        std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")

        # NOTE(review): arguments are interpolated into a shell string; a
        # file_path containing spaces or shell metacharacters would break the
        # command or allow injection — consider subprocess with an argument
        # list. TODO confirm file_path is always server-controlled.
        command = f"nohup {python_executable} {util.script_path('analyze_job.py')} --file_path={file_path} --job_name={analyze_job_name} --dataset_name={temporary_dataset_name} --sample_strategy={sample_conf.sample_strategy} --n_rows={self.replace_None(sample_conf.n_rows)} --percentage={self.replace_None(sample_conf.percentage)} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1 &"

        logger.info(f"Run analyze job command: \n{command}")
        logger.info(f"Log file:\ntail -f {std_log}")

        # JobManager.instance().run_job(job)
        os.system(command)  # fire-and-forget: exit status is not checked

        return temporary_dataset_name, analyze_job_name
Exemple #10
0
    def add_analyze_process_step(self, dataset_name, analyze_job_name,
                                 step: JobStep):
        """Record one analyze-job step for *dataset_name* and apply its
        effects to the dataset row.

        Raises:
            EntityNotExistsException: the dataset does not exist.
            Exception: a step of this type was already recorded for the job.
            ValueError: a PatchCorrelation step arrives for a dataset that is
                not analyzed yet or targets the wrong label column.
        """
        step_type = step.type
        with db.open_session() as s:
            # 1.1.  check dataset exists
            d = s.query(DatasetEntity).filter(
                DatasetEntity.name == dataset_name).first()
            if d is None:
                raise EntityNotExistsException(DatasetEntity, dataset_name)

            # 1.2. check event type, one type one record
            messages = s.query(MessageEntity).filter(
                MessageEntity.author == analyze_job_name).all()
            for m in messages:
                if step_type == util.loads(m.content).get('type'):
                    raise Exception(
                        f"Event type = {step_type} already exists .")

        # 2. handle event
        with db.open_session() as s:
            # 2.1. create a new message
            content = util.dumps(step.to_dict())
            message = MessageEntity(id=util.short_uuid(),
                                    author=analyze_job_name,
                                    content=content,
                                    create_datetime=util.get_now_datetime())
            s.add(message)

            # 2.2. handle analyze event
            if step_type == AnalyzeStep.Types.Analyzed:
                # update temporary dataset
                # todo handle failed analyze
                if step.status == JobStep.Status.Succeed:
                    # note: pop() mutates step.extension before it is stored
                    hints = step.extension.pop("hints")
                    d_stats = DatasetStats.load_dict(step.extension)

                    features_str = [f.to_dict() for f in d_stats.features]
                    update_fields = \
                        {
                            "has_header": d_stats.has_header,
                            "extension": step.extension,
                            "n_cols": d_stats.n_cols,
                            "n_rows": d_stats.n_rows,
                            "features": features_str,
                            "hints": hints,
                            "feature_summary": d_stats.feature_summary.to_dict(),
                            "status": DatasetEntity.Status.Analyzed
                         }
                else:
                    update_fields = {"status": DatasetEntity.Status.Failed}
                self.dataset_dao.update_by_name(s, dataset_name, update_fields)

            elif step_type == AnalyzeStep.Types.PatchCorrelation:
                # 1. check dataset status, only analyzed can calc relativity
                dataset = self.dataset_dao.require_by_name(s, dataset_name)
                if dataset.status != AnalyzeStep.Types.Analyzed:
                    raise ValueError(
                        f"Dataset {dataset_name} status is not {AnalyzeStep.Types.Analyzed} ."
                    )

                request_label_col = step.extension.get("label_col")
                if request_label_col != dataset.label_col:
                    raise ValueError(
                        f"Dataset {dataset_name} label col is {dataset.label_col} but received result is for {request_label_col}"
                    )

                # 2. read extension
                corr_dict = step.extension.get('corr')

                # 3. load & update features with their correlation values
                features = dataset.to_dataset_stats().features
                for f in features:
                    correlation = corr_dict.get(f.name)
                    f.correlation = FeatureCorrelation(
                        value=correlation,
                        status=FeatureCorrelation.calc_status(
                            correlation, request_label_col == f.name))

                # 4. sort features by  abs correlation
                features = sorted(features,
                                  key=lambda f: abs(f.correlation.value),
                                  reverse=True)

                feature_dict_list = []
                for f in features:
                    feature_dict_list.append(f.to_dict())

                # 5. push back database
                self.dataset_dao.update_by_name(
                    s, dataset_name, {"features": feature_dict_list})
Exemple #11
0
    def test_binary_classification_model(self):
        """End-to-end: train a binary classification model, run a batch
        predict job against it and verify the model detail response."""
        # 1. create a dataset
        dataset_name = self.create_dataset()

        # 2. create a train job
        train_req = {
            "label_col": "readmitted",
            "pos_label": True,
            "train_mode": "minimal",
            "partition_strategy": "cross_validation",
            "holdout_percentage": 20,
            "cross_validation": {
                "n_folds": 2
            },
            "datetime_series_col": None
        }
        model_name = self.train_model_and_evaluate(dataset_name, train_req)

        # 1. check model exists
        assert P.exists(util.model_dir(dataset_name, model_name))

        # 2. predict file
        predict_req = {
            "upload_took": 10,  # 10 just for test
            "file_path": P.abspath('cooka/test/dataset/diabetes_10k_datetime.csv'),
            "reserved_cols": ['race'],
        }
        resp_batch_predict_job = self.assert_response_and_get(
            self.fetch(
                f'/api/dataset/{dataset_name}/feature-series/default/model/{model_name}/batch-predict-job',
                method="POST",
                body=util.dumps(predict_req),
                headers=self.DEFAULT_HEADER))
        assert 'batch_predict_job_name' in resp_batch_predict_job
        batch_predict_job_name = resp_batch_predict_job.get(
            'batch_predict_job_name')

        # 3. poll batch job status
        max_poll_times = 31  # try 30 times
        for attempt in range(max_poll_times):
            time.sleep(1)
            resp_poll_batch_predict_job = self.assert_response_and_get(
                self.fetch(
                    f'/api/dataset/{dataset_name}/feature-series/default/model/{model_name}/batch-predict-job/{batch_predict_job_name}',
                    method="GET",
                    headers=self.DEFAULT_HEADER))

            # stop: give up with the last response as the error detail
            if attempt == max_poll_times - 1:
                raise Exception(util.dumps(resp_poll_batch_predict_job))

            for key in ("batch_predict_job_name", "steps"):
                assert key in resp_poll_batch_predict_job

            print(resp_poll_batch_predict_job)

            if self.check_batch_predict_steps_finished(
                    resp_poll_batch_predict_job.get('steps')):
                break

        # 4. check model path correct
        resp_model = self.assert_response_and_get(
            self.fetch(
                f'/api/dataset/{dataset_name}/feature-series/default/model/{model_name}',
                method="GET",
                headers=self.DEFAULT_HEADER))

        # 5. check response
        for key in ("name", "status", "escaped", "model_file_size",
                    "task_type", "framework", "create_datetime",
                    "last_update_datetime", "log_file_path", "performance"):
            assert key in resp_model
        # todo assert "trails" in resp_model

        performance = resp_model.get('performance')

        assert "metrics" in performance
        assert "confusion_matrix" in performance
        # todo assert "roc_curve" in performance

        metrics = performance.get('metrics')
        assert 'roc_auc' in metrics
        # todo assert 'f1' in metrics

        confusion_matrix = performance.get('confusion_matrix')
        for key in ("fn", "fp", "tn", "tp", "label"):
            assert key in confusion_matrix

        label_dict = confusion_matrix.get('label')
Exemple #12
0
    def create_temporary_dataset_from_file(self, data_path):
        """Upload *data_path*, create a temporary dataset from it and wait
        for analysis to finish; return the temporary dataset name.

        This can not in setup_class(cls) nor setUpClass(cls), setUp, because
        http server not ready.
        """
        super(WithTemporaryDatasetTestCase,
              self).setUp()  # must invoke or not create http server
        from cooka.common import consts
        consts.SERVER_PORTAL = self.get_url('')  # use temporary server

        # 1. upload the file as multipart/form-data
        boundary = uuid.uuid4().hex
        headers = {
            "Content-Type": "multipart/form-data; boundary=%s" % boundary
        }
        producer = partial(self.multipart_producer, boundary, data_path)

        upload_response = self.fetch(path='/api/resource',
                                     method="POST",
                                     body_producer=producer,
                                     headers=headers)

        upload_response_body = self.assert_response_and_get(upload_response)
        upload_file_path = upload_response_body.get('path')
        upload_took = upload_response_body.get('took')

        assert upload_file_path is not None
        assert upload_took is not None

        # 2. send the temporary-dataset creation request
        body = {
            "sample_strategy": "random_rows",
            "percentage": 30,
            "n_rows": 1000,
            "file_path": upload_file_path,
            "upload_took": upload_took,
            "source_type": "upload",
        }
        str_body = util.dumps(body)

        # 3. validate code
        create_response_body = self.assert_response_and_get(
            self.fetch(path='/api/temporary-dataset',
                       method="POST",
                       body=str_body,
                       headers=headers))
        print(f"create response body:\n {create_response_body}")

        # 4. poll dataset message until the analyze pipeline completes
        temporary_dataset_name = create_response_body["temporary_dataset_name"]
        analyze_job_name = create_response_body["analyze_job_name"]

        # expected step types, in datetime order, for a successful analysis
        excepted_event = [
            AnalyzeStep.Types.Upload, AnalyzeStep.Types.Load,
            AnalyzeStep.Types.Analyzed
        ]

        analyze_passed = False
        poll_job_response_body = None
        for i in range(10):  # poll up to 10 times, one second apart
            poll_job_response = self.fetch(
                f'/api/dataset/{temporary_dataset_name}/analyze-job/{analyze_job_name}',
                method="GET")
            poll_job_response_body = self.assert_response_and_get(
                poll_job_response)
            events = poll_job_response_body['steps']
            events.sort(key=lambda x: x['datetime'])
            events_type = [event["type"] for event in events]
            if excepted_event == events_type:  # has all excepted type order by datetime
                # validate every
                analyze_passed = True
                break

            # an End step means the job terminated (possibly failed) — stop polling
            if AnalyzeStep.Types.End in events_type:
                break

            time.sleep(1)

        assert analyze_passed, f"{poll_job_response_body}"

        # 5. retrieve dataset and check detail
        with db.open_session() as s:
            temporary_dataset = s.query(DatasetEntity).filter(
                DatasetEntity.name == temporary_dataset_name).first()
            assert temporary_dataset is not None, f'Temporary dataset = {temporary_dataset_name} create failed'

            assert len(temporary_dataset.extension) > 0
            assert temporary_dataset.status == DatasetEntity.Status.Analyzed

        return temporary_dataset_name