コード例 #1
0
    def brevity_dataset_pagination(self, req_dict):
        """Return one page of datasets as trimmed-down dicts.

        Args:
            req_dict: Request parameters. Keys read here:
                ``page_num`` (int, default 1), ``page_size`` (int, default
                10), ``query`` (optional str), ``order_by`` (str, default
                ``"create_datetime"``) and ``order`` (str, default
                ``"desc"``).

        Returns:
            A ``(datasets, total)`` tuple. ``datasets`` is a list with one
            dict per dataset returned by ``self.dataset_dao.pagination``;
            ``total`` is the second value that call returns (presumably the
            overall number of matches - confirm against the DAO).

        Raises:
            ValueError: If ``order_by`` or ``order`` is not an allowed value,
                if ``page_num`` is below 1, or if ``page_size`` is below 1.
        """
        # 1. read param
        page_num = util.require_in_dict(req_dict, 'page_num', int, default=1)
        page_size = util.require_in_dict(req_dict,
                                         'page_size',
                                         int,
                                         default=10)
        query = util.get_from_dict(req_dict, 'query', str)
        order_by = util.require_in_dict(req_dict,
                                        'order_by',
                                        str,
                                        default="create_datetime")
        order = util.require_in_dict(req_dict, 'order', str, default="desc")

        # Whitelist the sort column and direction before handing them to the DAO.
        allow_order_by_fields = [
            "create_datetime", "n_experiments", "size", "n_rows", "n_cols"
        ]
        if order_by not in allow_order_by_fields:
            raise ValueError(
                f"Order by field should in {','.join(allow_order_by_fields)}, but input is: {order_by}"
            )

        allow_order_strategies = ["desc", "asc"]
        if order not in allow_order_strategies:
            raise ValueError(
                f"order strategy should in {','.join(allow_order_strategies)}, but input is: {order}"
            )

        # Pages are 1-based; the message now matches the check (1 is valid).
        if page_num < 1:
            raise ValueError("Param page_num should >= 1 .")

        # A page must hold at least one row. The previous `< 0` test accepted
        # page_size == 0 even though the message demanded "> 0".
        if page_size < 1:
            raise ValueError("Param page_size should > 0 .")

        def _handle(model_dao, session, dataset: DatasetEntity):
            """Convert one dataset entity into the brief dict sent to callers."""
            d = util.sqlalchemy_obj_to_dict(dataset)
            d['file_path'] = util.relative_path(dataset.file_path)
            d['create_datetime'] = util.to_timestamp(dataset.create_datetime)
            d['n_experiments'] = model_dao.query_n_experiment(
                session, dataset.name)

            # Drop the bulky fields; this is the "brevity" listing.
            for dropped_key in ('features', 'feature_summary', 'extension'):
                del d[dropped_key]

            return d

        # 2. query
        with db.open_session() as s:
            datasets, total = self.dataset_dao.pagination(
                s, page_num, page_size, query, order_by, order)
            # Convert inside the session: _handle issues a query per dataset.
            datasets = [_handle(self.model_dao, s, d) for d in datasets]
            return datasets, total
コード例 #2
0
    def predict(self, dataset_name, model_name, req_dict: dict):
        """Launch a background batch-predict job and return its job name.

        Args:
            dataset_name: Name of the dataset the model belongs to.
            model_name: Name of the model to predict with.
            req_dict: Request parameters. Keys read here: ``file_path`` (str,
                required, joined onto ``consts.DATA_DIR``), ``reserved_cols``
                (optional list, joined with commas) and ``upload_took``
                (float, required).

        Returns:
            The predict job name produced by ``util.predict_job_name``.

        Raises:
            ValueError: If the input file does not exist or is not a regular
                file. The ``util`` and DAO helpers may raise as well.
        """
        # Local import: only needed here to quote the shell command safely.
        import shlex

        # 1. read params
        file_path = util.require_in_dict(req_dict, 'file_path', str)
        reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
        upload_took = util.require_in_dict(req_dict, 'upload_took', float)

        # None and an empty list both mean "no reserved columns".
        reserved_cols_str = ",".join(reserved_cols) if reserved_cols else ""

        # 2.  check params
        with db.open_session() as s:
            # Called for its existence check; the returned bean is not used.
            self.model_dao.require_by_name(s, model_name).to_model_bean()
            dataset_stats = self.dataset_dao.require_by_name(
                s, dataset_name).to_dataset_stats()
        predict_job_name = util.predict_job_name(dataset_name)
        # NOTE(review): an absolute file_path would make the join ignore
        # DATA_DIR - confirm whether callers can send one.
        abs_file_path = P.join(consts.DATA_DIR, file_path)
        if not P.exists(abs_file_path):
            raise ValueError(f"Input file not exists: {abs_file_path}")
        if not P.isfile(abs_file_path):
            raise ValueError(f"Input file is not file: {abs_file_path}")

        # 3. add upload step
        upload_extension = {"file_size": P.getsize(abs_file_path)}
        upload_step = JobStep(type=PredictStepType.Upload,
                              status=JobStep.Status.Succeed,
                              took=upload_took,
                              datetime=util.get_now_long(),
                              extension=upload_extension)
        self.add_predict_process_step(model_name, predict_job_name,
                                      upload_step)

        # 4.  execute command
        model_dir = util.model_dir(dataset_name, model_name)
        predict_log_path = P.join(model_dir, f"{predict_job_name}.log")
        if not dataset_stats.has_header:
            default_headers = ",".join(
                [f.name for f in dataset_stats.features])
        else:
            default_headers = None

        # Every token is shell-quoted because several of them come from the
        # request (file path, column names, dataset/model names). Without
        # quoting, a value containing `;`, spaces or `$(...)` would be
        # interpreted by the shell.
        argv = [
            sys.executable,
            f"{consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py",
            f"--input_file_path={abs_file_path}",
            f"--reserved_cols={reserved_cols_str}",
            f"--model_name={model_name}",
            f"--dataset_name={dataset_name}",
            f"--job_name={predict_job_name}",
            f"--has_header={dataset_stats.has_header}",
            f"--default_headers={default_headers}",
            f"--server_portal={consts.SERVER_PORTAL}",
        ]
        quoted_argv = " ".join(shlex.quote(str(arg)) for arg in argv)
        # nohup + trailing `&` detach the job; stdout and stderr go to the log.
        command = f"nohup {quoted_argv} 1>{shlex.quote(predict_log_path)} 2>&1 &"

        logger.info(f"Run predict job command: \n{command}")
        logger.info(f"Log file:\ntail -f {predict_log_path}")
        os.system(command)

        return predict_job_name
コード例 #3
0
    def add_train_process_step(self, train_job_name, req_dict):
        """Record one training-step event and apply its effect to the model.

        The event is stored as a ``MessageEntity`` authored by
        ``train_job_name``, then the model found for that job is updated
        according to the step type and status.

        Args:
            train_job_name: Name of the train job the event belongs to.
            req_dict: Event payload. Keys read here: ``type`` (str, required),
                ``status`` (str, required) and ``extension`` (dict, optional
                in general but required for a succeeded Evaluate step and for
                Optimize and Persist steps).

        Raises:
            ValueError: If the type or status is unknown, if a step that needs
                ``extension`` arrives without one, or if no model is found for
                the train job.
            Exception: If an event of the same type was already recorded for
                this job (OptimizeStart and Optimize events may repeat).
        """
        # [1]. read & check params
        step_type = util.require_in_dict(req_dict, 'type', str)
        step_status = util.require_in_dict(req_dict, 'status', str)
        step_extension = util.get_from_dict(req_dict, 'extension', dict)

        known_step_types = [
            TrainStep.Types.Load,
            TrainStep.Types.Optimize,
            TrainStep.Types.OptimizeStart,
            TrainStep.Types.Persist,
            TrainStep.Types.Evaluate,
            TrainStep.Types.FinalTrain,
            TrainStep.Types.Searched,
        ]
        if step_type not in known_step_types:
            raise ValueError(f"Unknown step type = {step_type}")

        if step_status not in [JobStep.Status.Succeed, JobStep.Status.Failed]:
            raise ValueError(f"Unknown status = {step_status}")

        succeeded = step_status == JobStep.Status.Succeed

        # `extension` is read with get_from_dict and may be None, yet the
        # branches below index into it. Fail early with a clear message
        # instead of a TypeError/AttributeError deep inside the session.
        extension_required = (
            step_type in [TrainStep.Types.Optimize, TrainStep.Types.Persist]
            or (step_type == TrainStep.Types.Evaluate and succeeded)
        )
        if extension_required and step_extension is None:
            raise ValueError(f"Param extension can not be None for step type = {step_type}")

        def _failed_fields():
            """Model fields written when a step reports failure."""
            return {"status": ModelStatusType.Failed, "finish_datetime": util.get_now_datetime()}

        # [2]. save message
        with db.open_session() as s:
            # [2.1].  check temporary model exists
            model = self.model_dao.find_by_train_job_name(s, train_job_name)
            if model is None:
                # Guard in case the DAO returns None rather than raising.
                raise ValueError(f"Model of train job = {train_job_name} not exists .")
            model_name = model.name

            # [2.2]. check event type, one type one record
            # Optimize events may repeat, so the stored messages are only
            # scanned for the step types that must be unique.
            repeatable_types = [TrainStep.Types.OptimizeStart, TrainStep.Types.Optimize]
            if step_type not in repeatable_types:
                messages = s.query(MessageEntity).filter(MessageEntity.author == train_job_name).all()
                if any(step_type == util.loads(m.content).get('type') for m in messages):
                    raise Exception(f"Event type = {step_type} already exists .")

            # [2.3]. create a new message
            content = util.dumps(req_dict)
            message = MessageEntity(id=util.short_uuid(), author=train_job_name, content=content, create_datetime=util.get_now_datetime())
            s.add(message)

            # [2.4]. handle analyze event
            # todo check in code body self._check_progress_change(step_type, current_progress)  # add failed status
            if step_type == TrainStep.Types.Evaluate:
                if succeeded:
                    self._update_model(s, model_name, step_type, {"performance": step_extension['performance']})
                else:
                    self._update_model(s, model_name, step_type, _failed_fields())

            elif step_type == TrainStep.Types.Load:
                if succeeded:
                    self._update_model(s, model_name, step_type, {"status": ModelStatusType.Running})
                else:
                    self._update_model(s, model_name, step_type, _failed_fields())

            elif step_type == TrainStep.Types.OptimizeStart:
                # Nothing to update on the model; the stored message is the
                # only record of this event.
                pass

            elif step_type == TrainStep.Types.Optimize:
                train_trail_no = step_extension.get('trail_no')
                # Append to a copy so the list held by the loaded entity is
                # not mutated in place before _update_model writes it.
                trails = list(model.trails) if model.trails is not None else []
                trails.append(step_extension)
                self._update_model(s, model_name, step_type, {"train_trail_no": train_trail_no, "score": step_extension.get('reward'), "trails": trails})

            elif step_type == TrainStep.Types.Persist:
                model_file_size = step_extension['model_file_size']
                self._update_model(s, model_name, step_type, {"model_file_size": model_file_size,
                                                              "status": ModelStatusType.Succeed,
                                                              "finish_datetime": util.get_now_datetime()})
            else:
                # FinalTrain / Searched: no extra fields to write.
                self._update_model(s, model_name, step_type, {})
コード例 #4
0
    def experiment(self, req_dict: dict):
        """Validate an experiment request and start its training job.

        Args:
            req_dict: Request parameters. Required keys: ``label_col`` (str),
                ``partition_strategy`` (str), ``dataset_name`` (str),
                ``holdout_percentage`` (int) and ``experiment_engine`` (str).
                Optional keys: ``pos_label``, ``train_mode`` (str) and
                ``datetime_series_col`` (str). Depending on the partition
                strategy, ``cross_validation`` (dict with ``n_folds``) or
                ``train_validation_holdout`` (dict with ``train_percentage``
                and ``validation_percentage``) is also required.

        Returns:
            A dict with ``no_experiment`` (the new experiment number),
            ``experiment_conf`` (``ExperimentConf.to_dict()``) and
            ``train_job_conf`` (whatever ``self.run_train_job`` returns).

        Raises:
            ValueError: For an unsupported engine or partition strategy,
                ``n_folds`` outside ``(1, 50]``, percentages that do not sum
                to 100, a missing dataset or label column, or a missing/empty
                ``pos_label`` on a binary-classification task.
        """
        # 1. read params
        label_col = util.require_in_dict(req_dict, 'label_col', str)
        pos_label = util.get_from_dict(req_dict, 'pos_label', object)
        train_mode = util.get_from_dict(req_dict, 'train_mode', str)

        partition_strategy = util.require_in_dict(req_dict, 'partition_strategy', str)
        dataset_name = util.require_in_dict(req_dict, 'dataset_name', str)

        holdout_percentage = util.require_in_dict(req_dict, 'holdout_percentage', int)

        # todo check datetime_series_col
        datetime_series_col = util.get_from_dict(req_dict, 'datetime_series_col', str)

        experiment_engine = util.require_in_dict(req_dict, 'experiment_engine', str)
        if experiment_engine not in [FrameworkType.GBM, FrameworkType.DeepTables]:
            raise ValueError(f"Unseen experiment_engine {experiment_engine}")

        # 2. check partition_strategy
        # Exactly one of these two is populated, matching the chosen strategy.
        cross_validation = None
        train_validation_holdout = None
        if partition_strategy == ExperimentConf.PartitionStrategy.CrossValidation:
            cross_validation_dict = util.require_in_dict(req_dict, 'cross_validation', dict)
            n_folds = util.require_in_dict(cross_validation_dict, 'n_folds', int)
            if not 1 < n_folds <= 50:
                raise ValueError(f"1 < n_folds <= 50 but current is: {n_folds}")
            cross_validation = CrossValidation(n_folds=n_folds, holdout_percentage=holdout_percentage)
        elif partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
            train_validation_holdout_dict = util.require_in_dict(req_dict, 'train_validation_holdout', dict)
            train_percentage = util.require_in_dict(train_validation_holdout_dict, 'train_percentage', int)
            validation_percentage = util.require_in_dict(train_validation_holdout_dict, 'validation_percentage', int)
            if train_percentage + validation_percentage + holdout_percentage != 100:
                raise ValueError("train_percentage plus validation_percentage plus holdout_percentage should equal 100.")
            train_validation_holdout = TrainValidationHoldout(train_percentage=train_percentage, validation_percentage=validation_percentage, holdout_percentage=holdout_percentage)
        else:
            raise ValueError(f"Unknown partition strategy = {partition_strategy}")

        # 3. Retrieve data
        with db.open_session() as s:
            # 3.1. check dataset
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            if dataset is None:
                raise ValueError(f"Dataset={dataset_name} not exists.")
            dataset_stats = dataset.to_dataset_stats()

            # 3.2. generate new experiment name
            no_experiment = self.model_dao.get_max_experiment(s, dataset_name) + 1

        # 4. ensure dataset label is latest
        # One update covers both "never set" and "set to something else".
        # Previously two independent `if`s both fired when the stored label
        # was None, so _handle_label_col ran twice.
        if dataset_stats.label_col != label_col:
            if dataset_stats.label_col is None:
                log.info(f"Dataset {dataset_name} label_col not set now, update to {label_col}")
            else:
                log.info(f"Dataset {dataset_name} label_col current is {dataset_stats.label_col}, but this experiment update to {label_col}")
            self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)

        # 5. calc task type
        # 5.1. find label
        label_f = self._find_feature(dataset_stats.features, label_col)
        if label_f is None:
            raise ValueError(f"Label col = {label_col} is not in dataset {dataset_name} .")

        task_type = self._infer_task_type(label_f)

        # 5.2. check pos_label
        if task_type == TaskType.BinaryClassification:
            if pos_label is None:
                raise ValueError("Pos label can not be None when it's binary-classify")
            # Only string labels can be "empty"; other types are accepted as-is.
            if isinstance(pos_label, str) and len(pos_label) < 1:
                raise ValueError("Pos label can not be empty when it's binary-classify")

        # 6. run experiment
        # A header-less file gets its column names from the analysed features.
        if not dataset_stats.has_header:
            dataset_default_headers = [f.name for f in dataset_stats.features]
        else:
            dataset_default_headers = None

        conf = ExperimentConf(dataset_name=dataset_name,
                              dataset_has_header=dataset_stats.has_header,
                              dataset_default_headers=dataset_default_headers,
                              train_mode=train_mode,
                              label_col=label_col,
                              pos_label=pos_label,
                              task_type=task_type,
                              partition_strategy=partition_strategy,
                              cross_validation=cross_validation,
                              train_validation_holdout=train_validation_holdout,
                              datetime_series_col=datetime_series_col,
                              file_path=dataset_stats.file_path)

        # Every feature except the label becomes a model input.
        model_input_features = [
            ModelFeature(name=f.name, type=f.type, data_type=f.data_type).to_dict()
            for f in dataset_stats.features
            if f.name != label_f.name
        ]

        # experiment_engine was validated above, so anything that is not GBM
        # is DeepTables.
        if experiment_engine == FrameworkType.GBM:
            framework = FrameworkType.GBM
        else:
            framework = FrameworkType.DeepTables
        train_conf = self.run_train_job(framework, conf, no_experiment,
                                        model_input_features, dataset_stats.n_rows)

        return {
            "no_experiment": no_experiment,
            "experiment_conf": conf.to_dict(),
            "train_job_conf": train_conf
        }
コード例 #5
0
    def create_temporary_dataset(self, req_dict):
        """Create a temporary dataset from an uploaded or imported file.

        Args:
            req_dict: Request parameters. Keys read here: ``sample_strategy``
                (str, fallback ``'random_rows'``), ``percentage`` (int,
                fallback 30, percentage strategy only), ``n_rows`` (int,
                fallback 1000, random-rows strategy only), ``upload_took``
                (float, required), ``file_path`` (str, required) and
                ``source_type`` (str, required).

        Returns:
            Whatever ``self._create_temporary_dataset`` returns.

        Raises:
            ValueError: For an unsupported sample strategy, an upload path
                that is outside the upload directory, or a path that does not
                exist or is not a regular file.
            IllegalParamException: If ``source_type`` is neither Upload nor
                Import.
        """
        sample_strategy = util.require_in_dict(req_dict, 'sample_strategy',
                                               str, 'random_rows')
        # Only the parameter matching the strategy is read; the other stays None.
        if SampleConf.Strategy.Percentage == sample_strategy:
            percentage = util.get_from_dict(req_dict, 'percentage', int, 30)
            n_rows = None
        elif SampleConf.Strategy.RandomRows == sample_strategy:
            n_rows = util.get_from_dict(req_dict, 'n_rows', int, 1000)
            percentage = None
        elif SampleConf.Strategy.WholeData == sample_strategy:
            n_rows = None
            percentage = None
        else:
            raise ValueError(f"Not support sample strategy: {sample_strategy}")

        upload_took = util.require_in_dict(req_dict, 'upload_took', float)

        file_path = util.require_in_dict(req_dict, 'file_path', str)
        source_type = util.require_in_dict(req_dict, 'source_type', str)

        sample_conf = SampleConf(sample_strategy=sample_strategy,
                                 percentage=percentage,
                                 n_rows=n_rows)

        # 1. validate param
        allowed_source_types = [
            DatasetEntity.SourceType.Upload,
            DatasetEntity.SourceType.Import
        ]
        if source_type not in allowed_source_types:
            raise IllegalParamException(
                'source_type', source_type,
                f'Should in {",".join(allowed_source_types)}'
            )

        if source_type == DatasetEntity.SourceType.Upload:
            upload_file_prefix = P.join(consts.FIELD_TMP, consts.FIELD_UPLOAD)
            if not file_path.startswith(upload_file_prefix):
                raise ValueError(
                    f"For upload file should path should start with {upload_file_prefix} but it's {file_path}"
                )
            # fix relative path
            file_path = P.join(consts.DATA_DIR, file_path)

            # The raw prefix test alone accepts "<prefix>/../../x". Re-check
            # after collapsing ".." so the path cannot leave the upload
            # directory.
            upload_root = P.normpath(P.join(consts.DATA_DIR, upload_file_prefix))
            if not P.normpath(file_path).startswith(upload_root):
                raise ValueError(
                    f"Upload file path should stay inside {upload_root} but it's {file_path}"
                )

        if not P.exists(file_path):
            raise ValueError(f"File={file_path} not exists")

        if not P.isfile(file_path):
            raise ValueError(f"File={file_path} is not a file")

        util.validate_sample_conf(sample_conf)

        # 2. create
        if source_type == DatasetEntity.SourceType.Upload:
            return self._create_temporary_dataset(source_type, file_path,
                                                  upload_took, sample_conf)

        # source_type was validated above, so only Import remains. Copy the
        # file to the path given by util.temporary_upload_file_path and report
        # the copy time in place of the upload time.
        t1 = time.time()
        internal_path = util.temporary_upload_file_path(
            P.basename(file_path))
        os.makedirs(P.dirname(internal_path), exist_ok=True)
        shutil.copy(file_path, internal_path)
        took = time.time() - t1
        logger.info(f"Copy file to {internal_path}")
        return self._create_temporary_dataset(source_type, internal_path,
                                              took, sample_conf)