Ejemplo n.º 1
0
    def run_train_job(self, framework, conf: ExperimentConf, no_experiment: int, model_input_features: list, n_rows: int):
        """Create a train job, persist it, generate training code and launch the train process.

        Args:
            framework: framework type to train with (e.g. FrameworkType.DeepTables / GBM).
            conf: experiment configuration of the dataset being trained.
            no_experiment: experiment sequence number, used to build the model name.
            model_input_features: features fed into the model.
            n_rows: row count of the dataset, forwarded to code generation.

        Returns:
            dict representation of the created TrainJobConf.
        """
        # 1. create train conf
        job_name = f"train_job_{conf.dataset_name}_{framework}_{util.human_datetime()}"

        model_name = util.model_name(conf.dataset_name, no_experiment)

        model_dir = util.model_dir(conf.dataset_name, model_name)
        os.makedirs(model_dir)
        train_source_code_path = P.join(model_dir, 'train.py')
        train_log = P.join(model_dir, "train.log")

        train_job_conf = TrainJobConf(framework=framework,
                                      name=job_name,
                                      model_name=model_name,
                                      searcher=TrainJobConf.Searcher.MCTSSearcher,
                                      max_trails=consts.TRAIN_MODE_MAX_TRAILS_MAPPING[conf.train_mode],
                                      search_space=TrainJobConf.SearchSpace.Minimal)
        # 2. insert to db
        with db.open_session() as s:
            self.create_temporary_model(s, model_name, no_experiment, model_input_features, conf, train_job_conf)

        # 3. generate train source code and companion notebook
        train_source_code, notebook_content = self.generate_code(model_name, model_input_features, n_rows, train_job_conf, conf)

        with open(train_source_code_path, 'w', encoding='utf-8') as f:
            f.write(train_source_code)

        notebook_file_path = P.join(model_dir, 'train.ipynb')
        with open(notebook_file_path, 'w', encoding='utf-8') as f:
            f.write(notebook_content)

        # 4. run train process
        # Note: if plus & at end of command, the process id will be plus 1 cause a bug
        command = f"nohup {sys.executable} {train_source_code_path} 1>{train_log} 2>&1"

        log.info(f"Run train job command: \n{command}")
        log.info(f"Log file:\ntail -f  {train_log}")
        log.info(f"Train source code:\n {train_source_code_path}")

        # stdout is already redirected to the log file inside the shell command;
        # DEVNULL (instead of an unread PIPE) avoids leaking the pipe fd.
        train_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.DEVNULL)

        with db.open_session() as s:
            self.model_dao.update_model_by_name(s, model_name, {"pid": train_process.pid})

        return train_job_conf.to_dict()
Ejemplo n.º 2
0
    def delete(self, dataset_name):
        """Delete a dataset record; for non-temporary datasets also remove its directory."""
        with db.open_session() as s:
            # 1. delete the DB record first, remembering whether it was temporary
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            is_temporary = dataset.is_temporary
            self.dataset_dao.delete(s, dataset_name)

            # 2. temporary datasets have no directory of their own — nothing more to do
            if is_temporary is not False:
                return

            # guard clauses protect against deleting outside the dataset root
            if "/" in dataset_name or len(dataset_name) <= 1:
                raise ValueError(
                    "dataset name contains '/' or length too short.")

            if len(consts.PATH_DATASET) <= 1:
                raise ValueError(
                    "Data dir too short, can not delete. ")

            dataset_dir = P.join(consts.PATH_DATASET, dataset_name)
            if not (P.exists(dataset_dir) and P.isdir(dataset_dir)):
                raise ValueError(
                    f"dataset dir {dataset_dir} is not dir or not exists, may be a bug here."
                )

            logger.info(f"Remove file at: {dataset_dir}")
            shutil.rmtree(dataset_dir)
Ejemplo n.º 3
0
    def retrieve(self, dataset_name, n_top_value):
        """Fetch a dataset by name and return it as a dict.

        The stored ``extension`` blob is trimmed down to only its ``sample_conf``
        entry and ``file_path`` is rewritten relative to the data dir.

        Args:
            dataset_name: name of the dataset to look up.
            n_top_value: currently unused; kept for interface compatibility
                (historically used to truncate per-feature value counts).

        Returns:
            dict representation of the dataset row.
        """
        with db.open_session() as s:
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            dict_value = util.sqlalchemy_obj_to_dict(dataset)
            dict_value['file_path'] = util.relative_path(dataset.file_path)

            # keep only the sampling configuration from the raw extension blob
            extension = dict_value.pop('extension')
            dict_value['extension'] = {"sample_conf": extension['sample_conf']}
            return dict_value
Ejemplo n.º 4
0
    def brevity_dataset_pagination(self, req_dict):
        """Paginated listing of datasets with heavyweight fields stripped.

        Args:
            req_dict: request dict carrying page_num, page_size, query,
                order_by and order.

        Returns:
            (datasets, total): list of brief dataset dicts and the total count.

        Raises:
            ValueError: on an unknown order field/strategy or invalid paging params.
        """
        # 1. read param
        page_num = util.require_in_dict(req_dict, 'page_num', int, default=1)
        page_size = util.require_in_dict(req_dict,
                                         'page_size',
                                         int,
                                         default=10)
        query = util.get_from_dict(req_dict, 'query', str)
        order_by = util.require_in_dict(req_dict,
                                        'order_by',
                                        str,
                                        default="create_datetime")
        order = util.require_in_dict(req_dict, 'order', str, default="desc")

        allow_order_by_fields = [
            "create_datetime", "n_experiments", "size", "n_rows", "n_cols"
        ]
        if order_by not in allow_order_by_fields:
            raise ValueError(
                f"Order by field should in {','.join(allow_order_by_fields)}, but input is: {order_by}"
            )

        allow_order_strategies = ["desc", "asc"]

        if order not in allow_order_strategies:
            raise ValueError(
                f"order strategy should in {','.join(allow_order_strategies)}, but input is: {order}"
            )

        if page_num < 1:
            raise ValueError("Param page_num should >= 1 .")

        # was `page_size < 0`, which wrongly accepted page_size == 0
        if page_size < 1:
            raise ValueError("Param page_size should > 0 .")

        def _handle(model_dao, session, dataset: DatasetEntity):
            # brief view of one dataset row: drop the large per-feature columns
            d = util.sqlalchemy_obj_to_dict(dataset)
            d['file_path'] = util.relative_path(dataset.file_path)
            d['create_datetime'] = util.to_timestamp(dataset.create_datetime)
            d['n_experiments'] = model_dao.query_n_experiment(
                session, dataset.name)

            del d['features']
            del d['feature_summary']
            del d['extension']

            return d

        # 2. query
        with db.open_session() as s:
            datasets, total = self.dataset_dao.pagination(
                s, page_num, page_size, query, order_by, order)
            datasets = [_handle(self.model_dao, s, d) for d in datasets]
            return datasets, total
Ejemplo n.º 5
0
    def retrieve_model(self, model_name):
        """Load a model by name and return it as a dict with its trails tabulated.

        Trail params are put into a DataFrame so that params which are None for
        every trail can be dropped in one pass; NaN values (which pandas may
        introduce for missing entries) are mapped back to None for JSON safety.

        Args:
            model_name: name of the model to load.

        Returns:
            dict representation of the model, with a "trails" entry of the form
            {"param_names": [...], "data": [...]} or {} when there are no
            usable trails.
        """

        def _nan_to_none(v):
            # NaN is not JSON-serializable; keep None as-is, map NaN to None.
            if v is not None and math.isnan(v):
                return None
            return v

        with db.open_session() as s:
            model = self.require_model(s, model_name)

        # handle trails (fixed: the original also built an unused trail_index list)
        trails_dict = {}
        if model.trails is not None and len(model.trails) > 0:
            param_names = list(model.trails[0].params)

            rows = [[_nan_to_none(t.params.get(n)) for n in param_names]
                    for t in model.trails]
            df_train_params = pd.DataFrame(data=rows, columns=param_names)
            # remove params that are None for every trail
            df_train_params.dropna(axis=1, how='all', inplace=True)

            trail_data_dict_list = [
                {
                    "reward": t.reward,
                    "params": [_nan_to_none(v) for v in df_train_params.iloc[i].tolist()],
                    "elapsed": t.elapsed,
                }
                for i, t in enumerate(model.trails)
            ]

            if len(df_train_params.columns.values) > 0:  # ensure not all params is None
                trails_dict = {
                    "param_names": df_train_params.columns.tolist(),
                    "data": trail_data_dict_list
                }

        model_dict = model.to_dict()
        # update trails and path-like fields for the API response
        model_dict['trails'] = trails_dict
        model_dict['model_path'] = util.relative_path(model_dict['model_path'])
        model_dict['escaped'] = model.escaped_time_by_seconds()
        model_dict['log_file_path'] = model.log_file_path()
        model_dict['train_source_code_path'] = model.train_source_code_path()
        model_dict['train_notebook_uri'] = model.train_notebook_uri()

        return model_dict
Ejemplo n.º 6
0
    def create_dataset(self, dataset_name, temporary_dataset_name):
        """Promote an analyzed temporary dataset to a permanent one.

        Copies the temporary dataset's data file into a new dataset directory,
        then renames the existing DB record in place and marks it permanent.

        Args:
            dataset_name: name of the permanent dataset to create.
            temporary_dataset_name: name of the analyzed temporary dataset.

        Raises:
            EntityNotExistsException: if the temporary dataset does not exist.
            IllegalParamException: if the temporary dataset is not analyzed yet
                or the target dataset directory already exists.
        """
        with db.open_session() as s:
            # 1. check temporary dataset
            temporary_dataset = s.query(DatasetEntity).filter(
                DatasetEntity.name == temporary_dataset_name).first()
            if temporary_dataset is None:
                raise EntityNotExistsException(DatasetEntity,
                                               temporary_dataset_name)
            if temporary_dataset.status != DatasetEntity.Status.Analyzed:
                raise IllegalParamException(
                    'dataset_name', temporary_dataset_name,
                    f'Dataset is not ready, status is {temporary_dataset.status}'
                )

            # 2. check dataset name (directory must not exist yet)
            new_dataset_dir = P.join(consts.PATH_DATASET, dataset_name)
            if P.exists(new_dataset_dir):
                raise IllegalParamException(
                    'dataset_name', dataset_name,
                    f'File path {new_dataset_dir} of dataset already exits')

            # 3. make dataset dir; can not rollback, but steps below should be robust enough
            os.makedirs(new_dataset_dir, exist_ok=False)

            # 4. copy the data file into the new dir, keeping the original suffix
            file_path = temporary_dataset.get_abs_file_path()
            new_dataset_file_path = P.join(
                new_dataset_dir, f'data{util.get_file_suffix(file_path)}')
            shutil.copy(file_path, new_dataset_file_path)

            properties = {
                "is_temporary": False,
                "name": dataset_name,
                "file_path": new_dataset_file_path
            }

            # 5. rename the temporary record and mark it permanent
            # NOTE: `== True` is required — SQLAlchemy builds SQL from the expression.
            affect_rows = s.query(DatasetEntity).filter(
                DatasetEntity.name == temporary_dataset_name,
                DatasetEntity.is_temporary == True).update(properties)
            if affect_rows != 1:
                raise Exception("Update dataset failed.")
Ejemplo n.º 7
0
    def predict(self, dataset_name, model_name, req_dict: dict):
        """Start an asynchronous batch-predict job for a trained model.

        Args:
            dataset_name: dataset the model belongs to.
            model_name: name of the trained model.
            req_dict: request dict carrying `file_path` (relative to the data
                dir), optional `reserved_cols` and `upload_took`.

        Returns:
            The generated predict job name.

        Raises:
            ValueError: if the input file does not exist or is not a regular file.
        """
        # 1. read params
        file_path = util.require_in_dict(req_dict, 'file_path', str)
        reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
        upload_took = util.require_in_dict(req_dict, 'upload_took', float)

        if reserved_cols is None or len(reserved_cols) < 1:
            reserved_cols_str = ""
        else:
            reserved_cols_str = ",".join(reserved_cols)

        # 2.  check params
        with db.open_session() as s:
            self.model_dao.require_by_name(s, model_name).to_model_bean()
            dataset_stats = self.dataset_dao.require_by_name(
                s, dataset_name).to_dataset_stats()
        predict_job_name = util.predict_job_name(dataset_name)
        abs_file_path = P.join(consts.DATA_DIR, file_path)
        if not P.exists(abs_file_path):
            raise ValueError(f"Input file not exists: {abs_file_path}")
        if not P.isfile(abs_file_path):
            raise ValueError(f"Input file is not file: {abs_file_path}")

        # 3. add upload step
        upload_extension = {"file_size": P.getsize(abs_file_path)}
        upload_step = JobStep(type=PredictStepType.Upload,
                              status=JobStep.Status.Succeed,
                              took=upload_took,
                              datetime=util.get_now_long(),
                              extension=upload_extension)
        self.add_predict_process_step(model_name, predict_job_name,
                                      upload_step)

        # 4.  execute command
        model_dir = util.model_dir(dataset_name, model_name)
        predict_log_path = P.join(model_dir, f"{predict_job_name}.log")
        if not dataset_stats.has_header:
            default_headers = ",".join(
                [f.name for f in dataset_stats.features])
        else:
            default_headers = None

        # SECURITY NOTE: file_path comes from the request and is interpolated
        # into a shell command line; a malicious value could inject shell syntax.
        # Consider an argv-list subprocess call instead of a shell string.
        command = f"nohup {sys.executable} {consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py --input_file_path={abs_file_path} --reserved_cols={reserved_cols_str} --model_name={model_name} --dataset_name={dataset_name} --job_name={predict_job_name} --has_header={dataset_stats.has_header} --default_headers={default_headers}  --server_portal={consts.SERVER_PORTAL} 1>{predict_log_path} 2>&1 &"

        # fixed: this was logged as "analyze job", which is misleading here
        logger.info(f"Run predict job command: \n{command}")
        logger.info(f"Log file:\ntail -f {predict_log_path}")
        os.system(command)  # detached via nohup + trailing '&'

        return predict_job_name
Ejemplo n.º 8
0
 def train_process_terminated(self, model_name):
     """Mark a model Failed when its train process exits while still Running."""
     with db.open_session() as s:
         # Status is polled about once a second, so the process may already
         # have been finalized by handle_models; only a Running model changes.
         model = self.model_dao.find_by_name(s, model_name)
         if model.status != ModelStatusType.Running:
             log.warning(f"Train process is already finished, model = {model_name}")
             return
         _now = util.get_now_datetime()
         self.model_dao.update_model_by_name(s, model_name, {
             "status": ModelStatusType.Failed,
             "finish_datetime": _now,
             "last_update_datetime": _now
         })
Ejemplo n.º 9
0
    def get(self, dataset_name, model_name, batch_predict_job_name, *args, **kwargs):
        """Respond with all recorded steps of a batch predict job, oldest first."""
        # 1. query all the message of request_id  todo move to service
        with db.open_session() as s:
            query = s.query(MessageEntity) \
                .filter(MessageEntity.author == batch_predict_job_name) \
                .order_by(MessageEntity.create_datetime.asc())
            messages_dict_list = [util.loads(m.content) for m in query.all()]

        # 2. response
        self.response_json({
            "batch_predict_job_name": batch_predict_job_name,
            "steps": messages_dict_list
        })
Ejemplo n.º 10
0
    def _handle_label_col(self, dataset_name, label_col, file_path):
        """Persist the chosen label column and launch correlation calculation.

        The label col is written to the DB first so an HTTP request arriving
        before the async job finishes still sees the up-to-date value.

        Args:
            dataset_name: dataset whose label column changed.
            label_col: the newly selected label column.
            file_path: data file path, used to derive the job name.
        """
        # 1. update label col
        with db.open_session() as s:
            self.dataset_dao.update_by_name(s, dataset_name, {"label_col": label_col})

        # 2. start a correlation-calculation process
        analyze_pearson_job_name = util.analyze_data_job_name(P.basename(file_path))
        std_log = P.join(util.dataset_dir(dataset_name), f"{analyze_pearson_job_name}.log")

        command = f"nohup {sys.executable} {util.script_path('analyze_correlation_job.py')} --dataset_name={dataset_name} --label_col={label_col} --job_name={analyze_pearson_job_name} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1"
        # stdout is already redirected to the log file inside the shell command;
        # DEVNULL (instead of an unread PIPE) avoids leaking the pipe fd.
        calc_correlation_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.DEVNULL)

        log.info(f"Run calculate pearson command: \n{command}")
        log.info(f"Log file:\ntail -f  {std_log}")
        log.info(f"Process id is {calc_correlation_process.pid}")
Ejemplo n.º 11
0
    def post(self, dataset_name, *args, **kwargs):
        """Validate that a dataset name is free both in the DB and on disk."""
        # reject names that already have a DB record
        with db.open_session() as s:
            if self.dataset_service.dataset_dao.find_by_name(s, dataset_name) is not None:
                raise ValueError(f"Dataset {dataset_name} already exists ")

        # reject names whose directory already exists on the filesystem
        dataset_dir = util.dataset_dir(dataset_name)
        if P.exists(dataset_dir):
            raise ValueError(
                f"Path {dataset_dir} already exists even dataset {dataset_name} not exists "
            )

        self.response_json({})
Ejemplo n.º 12
0
    def get_experiments(self, dataset_name, page_num, page_size):
        """Paginated listing of a dataset's experiments (models) as dicts.

        Args:
            dataset_name: dataset whose experiments are listed.
            page_num: 1-based page number.
            page_size: page size, must be >= 1.

        Returns:
            (experiments, total): list of experiment dicts and the total count.

        Raises:
            ValueError: on invalid paging params.
        """
        # 1. validation params
        if page_num < 1:
            raise ValueError("Param page_num should >= 1'")

        if page_size < 1:
            raise ValueError("Param page_size should >= 1'")

        def f(model: Model):
            # fixed: this helper previously read the enclosing loop variable `m`
            # instead of its own `model` parameter (worked only by closure accident).
            experiment_conf = model.extension['experiment_conf']  # extension must has experiment_conf and train_mode and not None
            train_mode = experiment_conf["train_mode"]
            max_trails = consts.TRAIN_MODE_MAX_TRAILS_MAPPING[train_mode]  # must has {train_mode}
            d = \
                {
                    "name": model.name,
                    "no_experiment": model.no_experiment,
                    "train_mode": train_mode,
                    "target_col": experiment_conf['label_col'],
                    "metric_name": consts.TASK_TYPE_OPTIMIZE_METRIC_MAPPING[model.task_type],
                    "status": model.status,
                    "score": model.score,
                    "engine": model.extension['train_job_conf']['framework'],
                    "escaped": model.escaped_time_by_seconds(),
                    "log_file_path": model.log_file_path(),
                    "train_source_code_path": model.train_source_code_path(),
                    "train_notebook_uri": model.train_notebook_uri(),
                    "train_trail_no": 0 if model.train_trail_no is None else model.train_trail_no,
                    "max_train_trail_no": max_trails,
                    "estimated_remaining_time": ExperimentService.calc_remain_time(model),
                    "model_file_size": model.model_file_size
                }
            return d

        with db.open_session() as s:
            # check dataset exists (raises if not)
            self.dataset_dao.require_by_name(s, dataset_name)

            models, total = self.model_dao.find_by_dataset_name(s, dataset_name, page_num, page_size)
            experiments = [f(m) for m in models]
            return experiments, total
Ejemplo n.º 13
0
    def get(self, dataset_name, analyze_job_name, *args, **kwargs):
        """Respond with all recorded steps of an analyze job, oldest first."""
        # 1. validate param
        if analyze_job_name is None:
            raise IllegalParamException("analyze_job_name", None, "not empty")

        # 2. query all the message of request_id  todo move to service
        with db.open_session() as s:
            query = s.query(MessageEntity) \
                .filter(MessageEntity.author == analyze_job_name) \
                .order_by(MessageEntity.create_datetime.asc())
            messages_dict_list = [util.loads(m.content) for m in query.all()]

        # 3. response
        self.response_json({
            "analyze_job_name": analyze_job_name,
            "steps": messages_dict_list
        })
Ejemplo n.º 14
0
    def recommended_train_configuration(self, dataset_name, req_dict):
        """Recommend training params for a dataset, reusing the last experiment when possible."""
        datetime_series_col = None  # todo support datetime_series
        target_col = req_dict.get('target_col')

        # 1. read last train-job
        with db.open_session() as s:
            # 1.1. check dataset name
            dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()
            # 1.2. query models
            last_model = self.model_dao.checkout_one(self.model_dao.find_by_dataset_name(s, dataset_name, 1, 1)[0])

        # 2. infer conf: reuse history unless the target changed (or none exists)
        experiment_conf = None
        if last_model is not None:
            history_conf: ExperimentConf = self.get_recommended_conf_from_history(last_model)
            if target_col is None or history_conf.label_col == target_col:
                experiment_conf = history_conf  # use history
            else:
                log.info(f"Change label from {history_conf.label_col} to {target_col}, " +
                         f"and recommended params using new target.")
        if experiment_conf is None:
            experiment_conf = self.get_recommended_conf_as_new(dataset_name, dataset_stats, target_col)

        train_validation_holdout = experiment_conf.train_validation_holdout
        cross_validation = experiment_conf.cross_validation

        return {
            "label_col": experiment_conf.label_col,
            "task_type": experiment_conf.task_type,
            "pos_label": experiment_conf.pos_label,
            "train_mode": experiment_conf.train_mode,
            "partition_strategy": ExperimentConf.PartitionStrategy.TrainValidationHoldout,
            "train_validation_holdout": None if train_validation_holdout is None else train_validation_holdout.to_dict(),
            "cross_validation": None if cross_validation is None else cross_validation.to_dict(),
            "datetime_series_col": datetime_series_col
        }
Ejemplo n.º 15
0
    def add_predict_process_step(self, model_name: str, job_name: str,
                                 step: JobStep):
        """Append one step message to a predict job; each step type may occur only once."""
        step_type = step.type
        with db.open_session() as s:
            # 1. the model must exist
            self.model_dao.require_by_name(s, model_name)

            # 2. reject a duplicated step type for this job
            existing = s.query(MessageEntity).filter(
                MessageEntity.author == job_name).all()
            if any(util.loads(m.content).get('type') == step_type for m in existing):
                raise Exception(
                    f"Event type = {step_type} already exists .")

            # 3. persist the new step as a message
            s.add(MessageEntity(id=util.short_uuid(),
                                author=job_name,
                                content=util.dumps(step.to_dict()),
                                create_datetime=util.get_now_datetime()))
Ejemplo n.º 16
0
    def post(self, dataset_name, *args, **kwargs):
        """Infer the task type implied by choosing a feature as the target column."""
        req_dict = self.get_request_as_dict_if_json()

        feature_name = util.require_in_dict(req_dict, 'feature_name', str)

        with db.open_session() as s:
            dataset = self.dataset_service.dataset_dao.require_by_name(
                s, dataset_name)
            features = Feature.load_dict_list(dataset.features)

            # locate the requested feature among the dataset's features
            target_f = next((f for f in features if f.name == feature_name), None)
            if target_f is None:
                raise ValueError(f"Feature name = {feature_name} not found. ")

            task_type = self.experiment_service._infer_task_type(target_f)

        self.response_json({"task_type": task_type, "feature_name": feature_name})
Ejemplo n.º 17
0
    def create_temporary_dataset_from_file(self, data_path):
        """Upload *data_path*, create a temporary dataset from it and poll until analyzed.

        Can not run in setup_class(cls) nor setUpClass(cls) nor setUp, because the
        http server is not ready there.

        Returns the name of the created temporary dataset.
        """
        super(WithTemporaryDatasetTestCase,
              self).setUp()  # must invoke or not create http server
        from cooka.common import consts
        consts.SERVER_PORTAL = self.get_url('')  # use temporary server

        # 1. upload the raw file via the multipart resource endpoint
        boundary = uuid.uuid4().hex
        headers = {
            "Content-Type": "multipart/form-data; boundary=%s" % boundary
        }
        producer = partial(self.multipart_producer, boundary, data_path)

        upload_response = self.fetch(path='/api/resource',
                                     method="POST",
                                     body_producer=producer,
                                     headers=headers)

        upload_response_body = self.assert_response_and_get(upload_response)
        upload_file_path = upload_response_body.get('path')
        upload_took = upload_response_body.get('took')

        assert upload_file_path is not None
        assert upload_took is not None

        # 2. request creation of a temporary dataset from the uploaded file
        body = {
            "sample_strategy": "random_rows",
            "percentage": 30,
            "n_rows": 1000,
            "file_path": upload_file_path,
            "upload_took": upload_took,
            "source_type": "upload",
        }
        str_body = util.dumps(body)

        # 3. validate the response code and payload
        create_response_body = self.assert_response_and_get(
            self.fetch(path='/api/temporary-dataset',
                       method="POST",
                       body=str_body,
                       headers=headers))
        print(f"create response body:\n {create_response_body}")

        # 4. poll analyze-job messages until all expected steps have happened
        temporary_dataset_name = create_response_body["temporary_dataset_name"]
        analyze_job_name = create_response_body["analyze_job_name"]

        # NOTE(review): "excepted" is a typo for "expected" (kept to avoid a rename)
        excepted_event = [
            AnalyzeStep.Types.Upload, AnalyzeStep.Types.Load,
            AnalyzeStep.Types.Analyzed
        ]

        analyze_passed = False
        poll_job_response_body = None
        for i in range(10):  # poll up to 10 times, one second apart
            poll_job_response = self.fetch(
                f'/api/dataset/{temporary_dataset_name}/analyze-job/{analyze_job_name}',
                method="GET")
            poll_job_response_body = self.assert_response_and_get(
                poll_job_response)
            events = poll_job_response_body['steps']
            events.sort(key=lambda x: x['datetime'])
            events_type = [event["type"] for event in events]
            if excepted_event == events_type:  # has all expected steps, ordered by datetime
                analyze_passed = True
                break

            # an End event before all expected steps means the job finished abnormally
            if AnalyzeStep.Types.End in events_type:
                break

            time.sleep(1)

        assert analyze_passed, f"{poll_job_response_body}"

        # 5. retrieve the dataset from the DB and check it was analyzed
        with db.open_session() as s:
            temporary_dataset = s.query(DatasetEntity).filter(
                DatasetEntity.name == temporary_dataset_name).first()
            assert temporary_dataset is not None, f'Temporary dataset = {temporary_dataset_name} create failed'

            assert len(temporary_dataset.extension) > 0
            assert temporary_dataset.status == DatasetEntity.Status.Analyzed

        return temporary_dataset_name
Ejemplo n.º 18
0
 def find_running_model(self):
     """Return the currently running model, delegating to the model DAO."""
     with db.open_session() as session:
         return self.model_dao.find_running_model(session)
Ejemplo n.º 19
0
    def add_analyze_process_step(self, dataset_name, analyze_job_name,
                                 step: JobStep):
        """Record one analyze-job step and apply its side effects to the dataset.

        Each step type may occur at most once per job. An `Analyzed` step writes
        the analysis results onto the dataset record; a `PatchCorrelation` step
        writes per-feature correlations and re-sorts the features.

        Raises:
            EntityNotExistsException: if the dataset does not exist.
            Exception: if a message of the same step type already exists.
            ValueError: on a PatchCorrelation step for a non-analyzed dataset
                or a mismatched label column.
        """
        step_type = step.type
        with db.open_session() as s:
            # 1.1.  check dataset exists
            d = s.query(DatasetEntity).filter(
                DatasetEntity.name == dataset_name).first()
            if d is None:
                raise EntityNotExistsException(DatasetEntity, dataset_name)

            # 1.2. check event type, one type one record
            messages = s.query(MessageEntity).filter(
                MessageEntity.author == analyze_job_name).all()
            for m in messages:
                if step_type == util.loads(m.content).get('type'):
                    raise Exception(
                        f"Event type = {step_type} already exists .")

        # 2. handle event (in a fresh session, after validation succeeded)
        with db.open_session() as s:
            # 2.1. create a new message recording this step
            content = util.dumps(step.to_dict())
            message = MessageEntity(id=util.short_uuid(),
                                    author=analyze_job_name,
                                    content=content,
                                    create_datetime=util.get_now_datetime())
            s.add(message)

            # 2.2. handle analyze event
            if step_type == AnalyzeStep.Types.Analyzed:
                # update temporary dataset
                # todo handle failed analyze
                if step.status == JobStep.Status.Succeed:
                    # `hints` is removed before the remaining extension is parsed as stats
                    hints = step.extension.pop("hints")
                    d_stats = DatasetStats.load_dict(step.extension)

                    features_str = [f.to_dict() for f in d_stats.features]
                    update_fields = \
                        {
                            "has_header": d_stats.has_header,
                            "extension": step.extension,
                            "n_cols": d_stats.n_cols,
                            "n_rows": d_stats.n_rows,
                            "features": features_str,
                            "hints": hints,
                            "feature_summary": d_stats.feature_summary.to_dict(),
                            "status": DatasetEntity.Status.Analyzed
                         }
                else:
                    update_fields = {"status": DatasetEntity.Status.Failed}
                self.dataset_dao.update_by_name(s, dataset_name, update_fields)

            elif step_type == AnalyzeStep.Types.PatchCorrelation:
                # 1. check dataset status, only analyzed can calc relativity
                dataset = self.dataset_dao.require_by_name(s, dataset_name)
                if dataset.status != AnalyzeStep.Types.Analyzed:
                    raise ValueError(
                        f"Dataset {dataset_name} status is not {AnalyzeStep.Types.Analyzed} ."
                    )

                # the result must match the label col currently stored on the dataset
                request_label_col = step.extension.get("label_col")
                if request_label_col != dataset.label_col:
                    raise ValueError(
                        f"Dataset {dataset_name} label col is {dataset.label_col} but received result is for {request_label_col}"
                    )

                # 2. read extension
                corr_dict = step.extension.get('corr')

                # 3. load & update features
                features = dataset.to_dataset_stats().features
                for f in features:
                    correlation = corr_dict.get(f.name)
                    f.correlation = FeatureCorrelation(
                        value=correlation,
                        status=FeatureCorrelation.calc_status(
                            correlation, request_label_col == f.name))

                # 4. sort features by  abs correlation
                features = sorted(features,
                                  key=lambda f: abs(f.correlation.value),
                                  reverse=True)

                feature_dict_list = []
                for f in features:
                    feature_dict_list.append(f.to_dict())

                # 5. push back database
                self.dataset_dao.update_by_name(
                    s, dataset_name, {"features": feature_dict_list})
Ejemplo n.º 20
0
    def add_train_process_step(self, train_job_name: str, req_dict: dict):
        """Record one step event of a running train job and update the model row.

        Args:
            train_job_name: name of the train job the event belongs to; used to
                look up the temporary model and as the message author.
            req_dict: event payload. Must contain 'type' (a TrainStep.Types value)
                and 'status' (JobStep.Status.Succeed/Failed); may contain
                'extension' (dict) with step-specific data such as 'performance',
                'trail_no', 'reward' or 'model_file_size'.

        Raises:
            ValueError: if 'type' or 'status' is not one of the known values.
            Exception: if a non-repeatable step type was already recorded for
                this train job (only OptimizeStart/Optimize may repeat).
        """
        # [1]. read & check params
        step_type = util.require_in_dict(req_dict, 'type', str)
        step_status = util.require_in_dict(req_dict, 'status', str)
        # extension is optional; NOTE(review): branches below index into it
        # (e.g. step_extension['performance']) and would raise TypeError if the
        # caller omitted it — presumably callers always send it; verify upstream.
        step_extension = util.get_from_dict(req_dict, 'extension', dict)

        if step_type not in [TrainStep.Types.Load, TrainStep.Types.Optimize, TrainStep.Types.OptimizeStart, TrainStep.Types.Persist, TrainStep.Types.Evaluate, TrainStep.Types.FinalTrain, TrainStep.Types.Searched]:
            raise ValueError(f"Unknown step type = {step_type}")

        if step_status not in [JobStep.Status.Succeed, JobStep.Status.Failed]:
            raise ValueError(f"Unknown status = {step_status}")

        # [2]. save message
        with db.open_session() as s:
            # [2.1].  check temporary model exists
            model = self.model_dao.find_by_train_job_name(s, train_job_name)
            model_name = model.name
            # [2.2]. check event type, one type one record
            # (OptimizeStart/Optimize fire once per trial, so they may repeat)
            messages = s.query(MessageEntity).filter(MessageEntity.author == train_job_name).all()
            for m in messages:
                if step_type == util.loads(m.content).get('type'):
                    if step_type not in [TrainStep.Types.OptimizeStart, TrainStep.Types.Optimize]:
                        raise Exception(f"Event type = {step_type} already exists .")

            # [2.3]. create a new message (raw payload is persisted for audit/replay)
            content = util.dumps(req_dict)
            message = MessageEntity(id=util.short_uuid(), author=train_job_name, content=content, create_datetime=util.get_now_datetime())
            s.add(message)

            # [2.4]. handle analyze event: map the step type to model-row updates
            current_progress = model.progress
            # todo check in code body self._check_progress_change(step_type, current_progress)  # add failed status
            if step_type == TrainStep.Types.Evaluate:
                # success stores the evaluation metrics; failure terminates the model
                if step_status == JobStep.Status.Succeed:
                    self._update_model(s, model_name, step_type, {"performance": step_extension['performance']})
                else:
                    self._update_model(s, model_name, step_type, {"status": ModelStatusType.Failed, "finish_datetime": util.get_now_datetime()})

            elif step_type == TrainStep.Types.Load:
                # data loaded: the job is now considered running
                if step_status == JobStep.Status.Succeed:
                    self._update_model(s, model_name, step_type, {"status": ModelStatusType.Running})
                else:
                    self._update_model(s, model_name, step_type, {"status": ModelStatusType.Failed, "finish_datetime": util.get_now_datetime()})

            elif step_type == TrainStep.Types.OptimizeStart:
                pass
                # train_trail_no = step_extension.get('trail_no')
                # if train_trail_no is None or not isinstance(train_trail_no, int):
                #     raise ValueError(f"Param trail_no can not be None and should be int but is : {train_trail_no}")
                # # upload trail number
                # self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no})

            elif step_type == TrainStep.Types.Optimize:
                # one search trial finished: append it to the model's trail history
                train_trail_no = step_extension.get('trail_no')
                # update trails
                # load current trail and append new
                trails = model.trails
                if model.trails is None:
                    trails = []
                trails.append(step_extension)
                self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no, "score": step_extension.get('reward'), "trails": trails})

            elif step_type == TrainStep.Types.Persist:
                # model artifact written to disk: training finished successfully
                model_file_size = step_extension['model_file_size']
                self._update_model(s, model_name, step_type, {"model_file_size": model_file_size,
                                                              "status": ModelStatusType.Succeed,
                                                              "finish_datetime": util.get_now_datetime()})
            else:
                # FinalTrain / Searched: no field changes, but _update_model is
                # still called — presumably to advance progress; confirm in _update_model
                self._update_model(s, model_name, step_type, {})
Ejemplo n.º 21
0
 def is_dataset_exists(self, dataset_name):
     """Return True when the dataset exists in the DB or its directory exists on disk."""
     dataset_path = P.join(consts.PATH_DATABASE, dataset_name)
     with db.open_session() as session:
         record = self.dataset_dao.find_by_name(session, dataset_name)
     # Known to either the database or the filesystem counts as "exists".
     return record is not None or P.exists(dataset_path)
Ejemplo n.º 22
0
    def _create_temporary_dataset(self, source_type, file_path, took,
                                  sample_conf: SampleConf):
        """Register a temporary dataset and spawn a background analyze job.

        Args:
            source_type: DatasetEntity.SourceType value (Upload or Import).
            file_path: path of the data file to analyze.
            took: seconds the upload/copy transfer took; recorded on the step.
            sample_conf: sampling strategy for the analyze job.

        Returns:
            Tuple of (temporary_dataset_name, analyze_job_name).
        """
        now = util.get_now_datetime()
        file_name = P.basename(file_path)
        temporary_dataset_name = self.choose_temporary_dataset_name(
            file_name)  # use a long name
        analyze_job_name = util.analyze_data_job_name(
            util.cut_suffix(file_name), now)
        file_size = P.getsize(file_path)

        # 2. create record
        td = DatasetEntity(name=temporary_dataset_name,
                           file_size=file_size,
                           is_temporary=True,
                           status=DatasetEntity.Status.Created,
                           source_type=source_type,
                           file_path=file_path,
                           file_name=file_name,
                           create_datetime=now,
                           last_update_datetime=now)
        with db.open_session() as s:
            s.add(td)

        # 3. send file transfer step.
        # Upload and Import produce identical steps except for the step type,
        # so build the step once from a mapping instead of duplicating the code.
        transfer_step_types = {
            DatasetEntity.SourceType.Upload: AnalyzeStep.Types.Upload,
            DatasetEntity.SourceType.Import: AnalyzeStep.Types.Copy,
        }
        transfer_step_type = transfer_step_types.get(source_type)
        if transfer_step_type is not None:
            step = JobStep(type=transfer_step_type,
                           status=AnalyzeStep.Status.Succeed,
                           extension={
                               "file_size": file_size,
                               "file_path": file_path
                           },
                           took=took,
                           datetime=util.get_now_long())
            self.add_analyze_process_step(temporary_dataset_name,
                                          analyze_job_name, step)

        # 4. create analyze config
        conf = AnalyzeJobConf(job_name=analyze_job_name,
                              dataset_name=temporary_dataset_name,
                              sample_conf=sample_conf,
                              path=file_path,
                              temporary_dataset=True,
                              label_col=None)

        # 5. start new process
        analyze_config_string = util.dumps(conf.to_dict())
        logger.info(f"Analyze job conf: {analyze_config_string}")

        python_executable = sys.executable

        temporary_dataset_dir = util.temporary_dataset_dir(
            temporary_dataset_name)

        os.makedirs(temporary_dataset_dir, exist_ok=True)

        std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")

        command = f"nohup {python_executable} {util.script_path('analyze_job.py')} --file_path={file_path} --job_name={analyze_job_name} --dataset_name={temporary_dataset_name} --sample_strategy={sample_conf.sample_strategy} --n_rows={self.replace_None(sample_conf.n_rows)} --percentage={self.replace_None(sample_conf.percentage)} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1 &"

        logger.info(f"Run analyze job command: \n{command}")
        logger.info(f"Log file:\ntail -f {std_log}")

        # JobManager.instance().run_job(job)
        # SECURITY NOTE(review): the command interpolates file_path (potentially
        # user-controlled) into a shell string; consider subprocess.Popen with an
        # argument list (shell=False) to avoid shell injection.
        os.system(command)

        return temporary_dataset_name, analyze_job_name
Ejemplo n.º 23
0
    def preview(self, dataset_name: str, page_num: int,
                page_size: int) -> RespPreviewDataset:
        """Return one page of raw rows from the dataset file.

        Args:
            dataset_name: name of a dataset already registered in the DB.
            page_num: page number, starting from 1.
            page_size: rows per page, must be >= 1.

        Returns:
            RespPreviewDataset with headers, the page's rows (or None when
            page_num is past the end of the file), total row count and the
            dataset's relative file path.

        Raises:
            ValueError: if page_num or page_size is < 1.
            FileNotFoundError: if the dataset file is missing on disk.
        """
        # 1. validation params
        if page_num < 1:
            raise ValueError("Param page_num should >= 1'")

        if page_size < 1:
            raise ValueError("Param page_size should >= 1'")

        # 2. retrieve dataset
        with db.open_session() as s:
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            file_path = dataset.file_path
            if not P.exists(file_path):
                raise FileNotFoundError(file_path)
            dataset_stats = dataset.to_dataset_stats()

        relative_file_path = util.relative_path(dataset_stats.file_path)

        # 3. read data lazily: one chunk per page, so only the pages up to the
        # requested one are ever parsed.
        dataset_headers = [f.name for f in dataset_stats.features]
        dataset_headers.insert(0, "No. ")
        if dataset_stats.has_header:
            iterator_df = pd.read_csv(file_path, chunksize=page_size)
        else:
            iterator_df = pd.read_csv(file_path,
                                      chunksize=page_size,
                                      header=None)

        def _empty_page():
            # Requested page is beyond the end of the file: headers only.
            return RespPreviewDataset(headers=dataset_headers,
                                      rows=None,
                                      count=dataset_stats.n_rows,
                                      file_path=relative_file_path)

        # 4. seek pages, page num start from 1
        # e.g. if page_num = 1 the loop body runs 0 times and the next
        # next(iterator_df) below yields the first page.
        current_page = 1
        while current_page < page_num:
            try:
                next(iterator_df)  # no reference kept; chunk is garbage-collected
                current_page = current_page + 1
            except StopIteration:
                return _empty_page()

        # 5. hit data
        try:
            page_df: pd.DataFrame = next(iterator_df)
        except StopIteration:
            return _empty_page()

        # 5.1. re-index so row numbers are global (1-based) across pages.
        # (Original code assigned the index twice; once is enough.)
        start_line_no = (current_page - 1) * page_size + 1  # start from 1
        page_df.index = pd.RangeIndex(start_line_no,
                                      start_line_no + page_df.shape[0])
        values = page_df.to_records(index=True).tolist()
        return RespPreviewDataset(headers=dataset_headers,
                                  rows=values,
                                  count=dataset_stats.n_rows,
                                  file_path=relative_file_path)
Ejemplo n.º 24
0
    def experiment(self, req_dict: dict):
        """Validate an experiment request, prepare its config and launch training.

        Args:
            req_dict: request payload with at least 'label_col',
                'partition_strategy', 'dataset_name', 'holdout_percentage' and
                'experiment_engine'; optionally 'pos_label', 'train_mode',
                'datetime_series_col' and the strategy-specific sub-dict
                ('cross_validation' or 'train_validation_holdout').

        Returns:
            Dict with the experiment number, the experiment conf and the
            train job conf.

        Raises:
            ValueError: on any invalid or inconsistent parameter.
        """
        # 1. read params
        label_col = util.require_in_dict(req_dict, 'label_col', str)
        pos_label = util.get_from_dict(req_dict, 'pos_label', object)
        train_mode = util.get_from_dict(req_dict, 'train_mode', str)

        partition_strategy = util.require_in_dict(req_dict, 'partition_strategy', str)
        dataset_name = util.require_in_dict(req_dict, 'dataset_name', str)

        holdout_percentage = util.require_in_dict(req_dict, 'holdout_percentage', int)

        # todo check datetime_series_col
        datetime_series_col = util.get_from_dict(req_dict, 'datetime_series_col', str)

        experiment_engine = util.require_in_dict(req_dict, 'experiment_engine', str)
        if experiment_engine not in [FrameworkType.GBM, FrameworkType.DeepTables]:
            raise ValueError(f"Unseen experiment_engine {experiment_engine}")

        # 2. check partition_strategy
        cross_validation = None
        train_validation_holdout = None
        if partition_strategy == ExperimentConf.PartitionStrategy.CrossValidation:
            cross_validation_dict = util.require_in_dict(req_dict, 'cross_validation', dict)
            n_folds = util.require_in_dict(cross_validation_dict, 'n_folds', int)
            if 1 < n_folds <= 50:
                cross_validation = CrossValidation(n_folds=n_folds, holdout_percentage=holdout_percentage)
            else:
                raise ValueError(f"1 < n_folds <= 50 but current is: {n_folds}")
        elif partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
            train_validation_holdout_dict = util.require_in_dict(req_dict, 'train_validation_holdout', dict)
            train_percentage = util.require_in_dict(train_validation_holdout_dict, 'train_percentage', int)
            validation_percentage = util.require_in_dict(train_validation_holdout_dict, 'validation_percentage', int)
            if train_percentage + validation_percentage + holdout_percentage != 100:
                raise ValueError("train_percentage plus validation_percentage plus holdout_percentage should equal 100.")
            train_validation_holdout = TrainValidationHoldout(train_percentage=train_percentage, validation_percentage=validation_percentage, holdout_percentage=holdout_percentage)
        else:
            raise ValueError(f"Unknown partition strategy = {partition_strategy}")

        # 2. Retrieve data
        with db.open_session() as s:
            # 2.1. check dataset (require_by_name presumably raises when missing;
            # the None-check is kept as a defensive guard — confirm and drop later)
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            if dataset is None:
                raise ValueError(f"Dataset={dataset_name} not exists.")
            dataset_stats = dataset.to_dataset_stats()

            # 2.2. generate new experiment name
            no_experiment = self.model_dao.get_max_experiment(s, dataset_name) + 1

        # 3. ensure dataset label is latest.
        # BUGFIX: previously both branches were plain `if`s, so when label_col
        # was unset (None) the second condition (None != label_col) was also
        # true and _handle_label_col ran twice; `elif` updates at most once.
        if dataset_stats.label_col is None:
            log.info(f"Dataset {dataset_name} label_col not set now, update to {label_col}")
            self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)
        elif dataset_stats.label_col != label_col:
            log.info(f"Dataset {dataset_name} label_col current is {dataset_stats.label_col}, but this experiment update to {label_col}")
            self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)

        # 4. calc task type
        # 4.1. find label
        label_f = self._find_feature(dataset_stats.features, label_col)
        if label_f is None:
            raise ValueError(f"Label col = {label_col} is not in dataset {dataset_name} .")

        task_type = self._infer_task_type(label_f)

        # 4.2. check pos_label (required and non-empty for binary classification)
        if task_type == TaskType.BinaryClassification:
            if pos_label is None:
                raise ValueError("Pos label can not be None when it's binary-classify")
            else:
                if isinstance(pos_label, str):
                    if len(pos_label) < 1:
                        raise ValueError("Pos label can not be empty when it's binary-classify")

        # 5. run experiment
        if not dataset_stats.has_header:
            dataset_default_headers = [f.name for f in dataset_stats.features]
        else:
            dataset_default_headers = None

        conf = ExperimentConf(dataset_name=dataset_name,
                              dataset_has_header=dataset_stats.has_header,
                              dataset_default_headers=dataset_default_headers,
                              train_mode=train_mode,
                              label_col=label_col,
                              pos_label=pos_label,
                              task_type=task_type,
                              partition_strategy=partition_strategy,
                              cross_validation=cross_validation,
                              train_validation_holdout=train_validation_holdout,
                              datetime_series_col=datetime_series_col,
                              file_path=dataset_stats.file_path)

        # training inputs are every feature except the label column
        model_input_features = list(map(lambda _: ModelFeature(name=_.name, type=_.type, data_type=_.data_type).to_dict(), filter(lambda _: _.name != label_f.name, dataset_stats.features)))

        # experiment_engine was validated above to be exactly GBM or DeepTables,
        # so it can be passed straight through instead of branching on it.
        train_conf = self.run_train_job(experiment_engine, conf, no_experiment,
                                        model_input_features, dataset_stats.n_rows)

        return {
            "no_experiment": no_experiment,
            "experiment_conf": conf.to_dict(),
            "train_job_conf": train_conf
        }