Example #1
    def handle_models(self, models: list):
        for m in models:
            m: Model = m
            pid = m.pid
            if pid is None:
                pass
                # logger.warning(f"Model {m.name}, training process pid is None.")
            else:
                try:
                    status = psutil.Process(pid).status()
                    if pid not in self.process_status_mapping:
                        self.process_status_mapping[pid] = status
                        logger.info(
                            f"Model {m.name}, pid is {pid}, process status is {status}"
                        )
                    else:
                        if self.process_status_mapping[pid] != status:
                            logger.info(
                                f"Model {m.name}, pid is {pid}, process status changed from {self.process_status_mapping[pid]} to {status}"
                            )
                            self.process_status_mapping[pid] = status
                except Exception:  # usually psutil.NoSuchProcess
                    # the process is gone, mark the training job as terminated
                    logger.warning(
                        f"Model {m.name}, training process pid={pid} does not exist."
                    )
                    self.experiment_service.train_process_terminated(m.name)
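
The broad except clause above is usually triggered by psutil.NoSuchProcess once a training process exits; a minimal standalone sketch of that behaviour (illustrative names, not Cooka's actual code):

import psutil

def probe_status(pid: int):
    """Return the process status string, or None if the process has exited."""
    try:
        return psutil.Process(pid).status()  # e.g. "running", "sleeping", "zombie"
    except psutil.NoSuchProcess:
        return None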
Example #2
    def delete(self, dataset_name):
        with db.open_session() as s:
            # 1. delete record
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            is_temporary = dataset.is_temporary
            self.dataset_dao.delete(s, dataset_name)

            # 2. delete files only if the dataset is not temporary
            if is_temporary is False:
                if "/" not in dataset_name and len(dataset_name) > 1:
                    if len(consts.PATH_DATASET) > 1:
                        dataset_dir = P.join(consts.PATH_DATASET, dataset_name)
                        if P.exists(dataset_dir) and P.isdir(dataset_dir):
                            logger.info(f"Remove file at: {dataset_dir}")
                            shutil.rmtree(dataset_dir)
                        else:
                            raise ValueError(
                                f"Dataset dir {dataset_dir} does not exist or is not a directory; this may be a bug."
                            )
                    else:
                        raise ValueError(
                            "Dataset root path is too short, refusing to delete.")
                else:
                    raise ValueError(
                        "Dataset name contains '/' or is too short.")
Example #3
    def run(self) -> None:
        logger.info("[MonitorThread] loop running...")
        while True:
            time.sleep(1)
            # 1. select all running models
            models = self.experiment_service.find_running_model()

            # 2. check process of running model
            self.handle_models(models)
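
Example #3's run loop and Example #5's pm.start() suggest that ProcessMonitor subclasses threading.Thread; a minimal skeleton under that assumption (hypothetical, the real class also wires up experiment_service and the status mapping):

import threading
import time

class ProcessMonitor(threading.Thread):
    def __init__(self):
        super().__init__(daemon=True)  # assumption: don't block interpreter shutdown
        self.process_status_mapping = {}

    def run(self) -> None:
        while True:
            time.sleep(1)
            ...  # poll running models here, as in Example #1 and Example #3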
Example #4
    def prepare(self):
        # open temporary file
        self.temporary_file_path = util.temporary_upload_file_path('upload_chunk')
        if not P.exists(P.dirname(self.temporary_file_path)):
            os.makedirs(P.dirname(self.temporary_file_path))

        self.temporary_file = open(self.temporary_file_path, 'wb')  # todo limit max length of file
        logger.info(f"Open path {self.temporary_file_path} to store upload file.")
        self.start_time = time.time()
        self.writed_size = 0
Example #5
def start_server():
    # 1. create web app
    application = CookaWebApplication(consts.PATH_DATABASE)
    application.listen(consts.SERVER_PORT)

    # 2. start thread
    pm = ProcessMonitor()
    pm.start()

    # 3. start io loop
    logger.info(f"Cooka running at: http://0.0.0.0:{consts.SERVER_PORT}")
    tornado.ioloop.IOLoop.instance().start()
Example #6
    def predict(self, dataset_name, model_name, req_dict: dict):
        # 1. read params
        file_path = util.require_in_dict(req_dict, 'file_path', str)
        reserved_cols = util.get_from_dict(req_dict, 'reserved_cols', list)
        upload_took = util.require_in_dict(req_dict, 'upload_took', float)

        if reserved_cols is None or len(reserved_cols) < 1:
            reserved_cols_str = ""
        else:
            reserved_cols_str = ",".join(reserved_cols)

        # 2.  check params
        with db.open_session() as s:
            self.model_dao.require_by_name(s, model_name).to_model_bean()
            dataset_stats = self.dataset_dao.require_by_name(
                s, dataset_name).to_dataset_stats()
        predict_job_name = util.predict_job_name(dataset_name)
        abs_file_path = P.join(consts.DATA_DIR, file_path)
        if not P.exists(abs_file_path):
            raise ValueError(f"Input file does not exist: {abs_file_path}")
        if not P.isfile(abs_file_path):
            raise ValueError(f"Input path is not a file: {abs_file_path}")

        # 3. add upload step
        upload_extension = {"file_size": P.getsize(abs_file_path)}
        upload_step = JobStep(type=PredictStepType.Upload,
                              status=JobStep.Status.Succeed,
                              took=upload_took,
                              datetime=util.get_now_long(),
                              extension=upload_extension)
        self.add_predict_process_step(model_name, predict_job_name,
                                      upload_step)

        # 4.  execute command
        model_dir = util.model_dir(dataset_name, model_name)
        predict_log_path = P.join(model_dir, f"{predict_job_name}.log")
        if not dataset_stats.has_header:
            default_headers = ",".join(
                [f.name for f in dataset_stats.features])
        else:
            default_headers = None

        command = f"nohup {sys.executable} {consts.PATH_INSTALL_HOME}/cooka/core/batch_predict_job.py --input_file_path={abs_file_path} --reserved_cols={reserved_cols_str} --model_name={model_name} --dataset_name={dataset_name} --job_name={predict_job_name} --has_header={dataset_stats.has_header} --default_headers={default_headers}  --server_portal={consts.SERVER_PORTAL} 1>{predict_log_path} 2>&1 &"

        logger.info(f"Run analyze job command: \n{command}")
        logger.info(f"Log file:\ntail -f {predict_log_path}")
        os.system(command)  # ha ha ha

        return predict_job_name
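
A hedged alternative sketch: run_train_job later in this section launches its command through subprocess.Popen so the child PID can be recorded, and the same pattern could replace os.system here (variable names mirror the surrounding code):

import subprocess

# note: the trailing "&" in `command` would need to be dropped, as the PID note in run_train_job explains
predict_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE)
logger.info(f"Batch predict process id is {predict_process.pid}")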
Example #7
    async def get(self, path, **kwargs):

        if path in self.MissingResource:
            raise tornado.web.HTTPError(404, f"File {path} is missing")

        if path in ['', '/']:
            resource_path = "index.html"
        else:
            absolute_path = self.get_absolute_path(self.root, self.parse_url_path(path))
            if not P.exists(absolute_path):
                logger.info(f"URI {path} not found, use index.html instead ")
                resource_path = "index.html"  # handle 404
            else:
                resource_path = path
        await super(AssetsHandler, self).get(resource_path)
Example #8
    def __init__(self, database_path):
        # 1. init handlers
        handlers = self.init_handlers()

        # 2. check database
        if not P.exists(database_path):
            database_dir = P.dirname(database_path)
            if not P.exists(database_dir):
                os.makedirs(database_dir, exist_ok=True)

            initialize_database()
            logger.info(f"Initialize database file {database_path}")

        static_path = P.join(P.dirname(P.abspath(__file__)), 'assets')
        # super(CookaApp, self).__init__(handlers, debug=True, static_path=static_path, static_url_prefix='/')
        super(CookaWebApplication, self).__init__(handlers, debug=False)
Example #9
def handle_tornado_upload_file(http_handler, tornado_http_files,
                               upload_start_time):
    # 1. check and read param
    tornado_http_file_list = tornado_http_files.get("file")
    if not tornado_http_file_list:  # check before indexing: .get() returns None when the field is missing
        raise MissingParamException("file")
    tornado_http_file = tornado_http_file_list[0]

    file_name = tornado_http_file['filename']
    file_body = tornado_http_file['body']
    file_size = util.human_data_size(len(file_body))
    file_suffix = util.get_file_suffix(file_name)

    assert file_suffix in [
        '.csv', '.tsv'
    ], 'Please check that your file suffix is one of [.csv, .tsv]; current is: %s' % file_suffix

    origin_file_name = util.make_dataset_name(util.cut_suffix(
        file_name)) + file_suffix  # used in URLs and disk paths, so keep it readable

    # 2. open temporary file and  write to local file
    temporary_file_path = util.temporary_upload_file_path(origin_file_name)

    if not P.exists(P.dirname(temporary_file_path)):
        os.makedirs(P.dirname(temporary_file_path))

    logger.info(f"Open path {temporary_file_path} to store upload file.")

    with open(temporary_file_path, 'wb') as f:
        f.write(file_body)
    logger.info(
        f"Upload finished at {temporary_file_path}, file size {file_size}."
    )

    upload_took = util.time_diff(time.time(), upload_start_time)

    # 3. response
    # relative_path = temporary_file_path[len(consts.PATH_DATA_ROOT)+1:]  # relative path not start with /
    response = \
        {
            "path": util.relative_path(P.abspath(temporary_file_path)),
            "size": file_size,
            "took": upload_took
        }
    http_handler.response_json(response)
Example #10
def callback(url, type, status, took, extension, **kwargs):
    req_body_dict = \
        {
            "type": type,
            "status": status,
            "took": took,
            "datetime": util.get_now_long(),
            "extension": extension
        }

    req_body = util.dumps(req_body_dict)
    logger.info(f"Send process event: \n{url}\n{req_body}")
    # Note: the HTTP body must be bytes, otherwise "requests" will encode it using ISO-8859-1
    response = requests.post(url,
                             data=req_body.encode('utf-8'),
                             timeout=TIMEOUT,
                             headers=HEADERS)
    _checkout_response_json(response)
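
A hypothetical usage sketch of callback (the URL and field values below are purely illustrative, not Cooka's actual endpoints or enum strings):

callback(
    url="http://localhost:8000/api/job/step",  # hypothetical endpoint
    type="Analyzed",                           # illustrative step type
    status="succeed",                          # illustrative status string
    took=1.23,
    extension={"n_rows": 1000},
)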
Example #11
    def post(self, *args, **kwargs):
        # 1. close file
        file_size = util.human_data_size(self.writed_size)
        logger.info(f"Uploaded file finished at {self.temporary_file_path}, file size {file_size} .")
        if self.temporary_file is not None:
            self.temporary_file.flush()
            self.temporary_file.close()

        content_type = self.request.headers['Content-Type']
        with open(self.temporary_file_path, 'rb') as f:
            fields = content_type.split(";")
            for field in fields:
                k, sep, v = field.strip().partition("=")
                if k == "boundary" and v:
                    from tornado.escape import utf8
                    files = {}
                    httputil.parse_multipart_form_data(utf8(v), f.read(), {}, files)
                    handle_tornado_upload_file(self, files, self.start_time)
                    return
        raise Exception("Handle upload failed: no multipart boundary found in Content-Type header.")
Example #12
    def recommended_train_configuration(self, dataset_name, req_dict):
        datetime_series_col = None  # todo support datetime_series
        target_col = req_dict.get('target_col')

        # 1. read last train-job
        with db.open_session() as s:
            # 1.1. check dataset name
            dataset_stats = self.dataset_dao.require_by_name(s, dataset_name).to_dataset_stats()
            # 1.2. query models
            last_model = self.model_dao.checkout_one(self.model_dao.find_by_dataset_name(s, dataset_name, 1, 1)[0])

        # 2. infer conf
        if last_model is None:
            experiment_conf = self.get_recommended_conf_as_new(dataset_name, dataset_stats, target_col)
        else:
            _experiment_conf: ExperimentConf = self.get_recommended_conf_from_history(last_model)
            if target_col is not None and _experiment_conf.label_col != target_col:
                log.info(f"Change label from {_experiment_conf.label_col} to {target_col}, " +
                         f"and recommended params using new target.")
                experiment_conf = self.get_recommended_conf_as_new(dataset_name, dataset_stats, target_col)
            else:
                # use history
                experiment_conf = _experiment_conf

        train_validation_holdout = experiment_conf.train_validation_holdout
        cross_validation = experiment_conf.cross_validation

        conf = \
            {
                "label_col": experiment_conf.label_col,
                "task_type": experiment_conf.task_type,
                "pos_label": experiment_conf.pos_label,
                "train_mode": experiment_conf.train_mode,
                "partition_strategy": ExperimentConf.PartitionStrategy.TrainValidationHoldout,
                "train_validation_holdout": train_validation_holdout.to_dict() if train_validation_holdout is not None else None,
                "cross_validation": cross_validation.to_dict() if cross_validation is not None else None,
                "datetime_series_col": datetime_series_col
            }
        return conf
Example #13
    def run_train_job(self, framework, conf: ExperimentConf, no_experiment: int, model_input_features:list, n_rows: int):

        # 1. create train conf
        job_name = f"train_job_{conf.dataset_name}_{framework}_{util.human_datetime()}"

        brevity_framework_dict = {FrameworkType.DeepTables: "dt", FrameworkType.GBM: "gbm"}

        model_name = util.model_name(conf.dataset_name, no_experiment)  #f"{conf.dataset_name}_{no_experiment}"

        model_dir = util.model_dir(conf.dataset_name, model_name)
        os.makedirs(model_dir)
        train_source_code_path = P.join(model_dir, 'train.py')
        train_log = P.join(model_dir, f"train.log")

        train_job_conf = TrainJobConf(framework=framework,
                                      name=job_name,
                                      model_name=model_name,
                                      searcher=TrainJobConf.Searcher.MCTSSearcher,
                                      max_trails=consts.TRAIN_MODE_MAX_TRAILS_MAPPING[conf.train_mode],
                                      search_space=TrainJobConf.SearchSpace.Minimal)
        # 2. insert to db
        with db.open_session() as s:
            self.create_temporary_model(s, model_name, no_experiment, model_input_features, conf,  train_job_conf)

        # 3. generate train source code
        train_source_code, notebook_content = self.generate_code(model_name, model_input_features, n_rows, train_job_conf, conf)

        with open(train_source_code_path, 'w', encoding='utf-8') as f:
            f.write(train_source_code)

        notebook_file_path = P.join(model_dir, 'train.ipynb')
        with open(notebook_file_path, 'w', encoding='utf-8') as f:
            f.write(notebook_content)

        # 4. run train process
        # Note: if "&" is appended to the command, the recorded process id will be off by one and cause a bug
        command = f"nohup {sys.executable} {train_source_code_path} 1>{train_log} 2>&1"

        log.info(f"Run train job command: \n{command}")
        log.info(f"Log file:\ntail -f  {train_log}")
        log.info(f"Train source code:\n {train_source_code_path}")

        train_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE)

        with db.open_session() as s:
            self.model_dao.update_model_by_name(s, model_name, {"pid": train_process.pid})

        return train_job_conf.to_dict()
Example #14
    def _handle_label_col(self, dataset_name, label_col, file_path):
        # calc correlation
        # 1. update the label col first, so the job's HTTP callback cannot arrive before label_col is updated
        with db.open_session() as s:
            self.dataset_dao.update_by_name(s, dataset_name, {"label_col": label_col})

        # 2. start a process
        analyze_pearson_job_name = util.analyze_data_job_name(P.basename(file_path))
        std_log = P.join(util.dataset_dir(dataset_name), f"{analyze_pearson_job_name}.log")

        command = f"nohup {sys.executable} {util.script_path('analyze_correlation_job.py')} --dataset_name={dataset_name} --label_col={label_col} --job_name={analyze_pearson_job_name} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1"
        calc_correlation_process = subprocess.Popen(["bash", "-c", command], stdout=subprocess.PIPE)

        log.info(f"Run calculate pearson command: \n{command}")
        log.info(f"Log file:\ntail -f  {std_log}")
        log.info(f"Process id is {calc_correlation_process.pid}")
    def _infer_task_type(self, f: Feature):
        n_unique = f.unique.value
        if n_unique == 2:
            log.info('2 classes detected, so inferred as a [binary classification] task')
            return TaskType.BinaryClassification
        elif 'float' in f.data_type:
            log.info('Target column type is float, so inferred as a [regression] task.')
            return TaskType.Regression
        elif n_unique > 1000:
            if 'int' in f.data_type:
                log.info(
                    'The number of classes exceeds 1000 and the column type is int, so inferred as a [regression] task')
                return TaskType.Regression
            else:
                raise ValueError(
                    'The number of classes exceeds 1000, please confirm whether your prediction target is correct')
        else:
            log.info(f'{n_unique} classes detected, so inferred as a [multiclass classification] task')
            return TaskType.MultiClassification
Example #16
df = util.read_csv(util.abs_path(dataset_stats.file_path),
                   dataset_stats.has_header, dataset_stats.features_names)
df = dataset_util.cast_df(df, dataset_detail['features'], True)
y = df[label_col]

# 4. encode y if is categorical # Do not calculate if categorical
# for f in dataset_stats.features:
#     if f.name == label_col:
#         if f.type == FeatureType.Categorical:
#             logger.info(f"Encode label column {label_col} because type is {f.type}. ")
#             y = pd.Series(LabelEncoder().fit_transform(y), name=label_col)

# 5. encode categorical features
pearson_corr_dict = {}
for f in dataset_stats.features:
    if f.type == FeatureType.Categorical:
        logger.info(f"Skip categorical feature {f.name} ")
        # lb = LabelEncoder()
        # encoded_series = pd.Series(lb.fit_transform(df[f.name]), name=f.name)
        # pearson_corr_dict[f.name] = y.corr(encoded_series, method='pearson')
        pearson_corr_dict[f.name] = None

    elif f.type in [FeatureType.Continuous, FeatureType.Datetime]:
        pearson_corr_dict[f.name] = y.corr(df[f.name], method='pearson')
    else:
        logger.info(
            f"Feature {f.name} type is {f.type}, skipped corr calculation.")
        pearson_corr_dict[f.name] = None  # text features are not supported

extension = {"corr": pearson_corr_dict, "label_col": label_col}

# 6. send back calc result
Example #17
# 2. retrieve dataset info
dataset_detail = client.retrieve_dataset(server_portal, dataset_name)
dataset_stats = DatasetStats.load_dict(dataset_detail)

# 3. read df

df = util.read_csv(util.abs_path(dataset_stats.file_path),
                   dataset_stats.has_header, dataset_stats.features_names)
df = dataset_util.cast_df(df, dataset_detail['features'], True)
y = df[label_col]

# 4. encode y if is categorical
for f in dataset_stats.features:
    if f.name == label_col:
        if f.type == FeatureType.Categorical:
            logger.info(
                f"Encode label column {label_col} because type is {f.type}. ")
            y = pd.Series(LabelEncoder().fit_transform(y), name=label_col)

# 5. encode categorical features
pearson_corr_dict = {}
for f in dataset_stats.features:
    if f.type == FeatureType.Categorical:
        lb = LabelEncoder()
        logger.info(f"Encode categorical feature {f.name} ")
        encoded_series = pd.Series(lb.fit_transform(df[f.name]), name=f.name)
        pearson_corr_dict[f.name] = y.corr(encoded_series, method='pearson')
    elif f.type in [FeatureType.Continuous, FeatureType.Datetime]:
        pearson_corr_dict[f.name] = y.corr(df[f.name], method='pearson')
    else:
        logger.info(
            f"Feature {f.name} type is {f.type}, skipped corr calculation.")
    def experiment(self, req_dict: dict):
        # 1. read params
        label_col = util.require_in_dict(req_dict, 'label_col', str)
        pos_label = util.get_from_dict(req_dict, 'pos_label', object)
        train_mode = util.get_from_dict(req_dict, 'train_mode', str)

        partition_strategy = util.require_in_dict(req_dict, 'partition_strategy', str)
        dataset_name = util.require_in_dict(req_dict, 'dataset_name', str)

        holdout_percentage = util.require_in_dict(req_dict, 'holdout_percentage', int)

        # todo check datetime_series_col
        datetime_series_col = util.get_from_dict(req_dict, 'datetime_series_col', str)

        experiment_engine = util.require_in_dict(req_dict, 'experiment_engine', str)
        if experiment_engine not in [FrameworkType.GBM, FrameworkType.DeepTables]:
            raise ValueError(f"Unseen experiment_engine {experiment_engine}")

        # 2. check partition_strategy
        cross_validation = None
        train_validation_holdout = None
        if partition_strategy == ExperimentConf.PartitionStrategy.CrossValidation:
            cross_validation_dict = util.require_in_dict(req_dict, 'cross_validation', dict)
            n_folds = util.require_in_dict(cross_validation_dict, 'n_folds', int)
            if 1 < n_folds <= 50:
                cross_validation = CrossValidation(n_folds=n_folds, holdout_percentage=holdout_percentage)
            else:
                raise ValueError(f"1 < n_folds <= 50 but current is: {n_folds}")
        elif partition_strategy == ExperimentConf.PartitionStrategy.TrainValidationHoldout:
            train_validation_holdout_dict = util.require_in_dict(req_dict, 'train_validation_holdout', dict)
            train_percentage = util.require_in_dict(train_validation_holdout_dict, 'train_percentage', int)
            validation_percentage = util.require_in_dict(train_validation_holdout_dict, 'validation_percentage', int)
            if train_percentage + validation_percentage + holdout_percentage != 100:
                raise ValueError("train_percentage plus validation_percentage plus holdout_percentage should equal 100.")
            train_validation_holdout = TrainValidationHoldout(train_percentage=train_percentage, validation_percentage=validation_percentage, holdout_percentage=holdout_percentage)
        else:
            raise ValueError(f"Unknown partition strategy = {partition_strategy}")

        # 3. retrieve data
        with db.open_session() as s:
            # 2.1. check dataset
            dataset = self.dataset_dao.require_by_name(s, dataset_name)
            if dataset is None:
                raise ValueError(f"Dataset={dataset_name} not exists.")
            dataset_stats = dataset.to_dataset_stats()

            # 2.2. generate new experiment name
            no_experiment = self.model_dao.get_max_experiment(s, dataset_name) + 1

        # 4. ensure dataset label is latest
        if dataset_stats.label_col is None:
            log.info(f"Dataset {dataset_name} label_col is not set yet, updating to {label_col}")
            self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)
        elif dataset_stats.label_col != label_col:
            log.info(f"Dataset {dataset_name} label_col is currently {dataset_stats.label_col}, but this experiment updates it to {label_col}")
            self._handle_label_col(dataset_name, label_col, dataset_stats.file_path)

        # 5. calc task type
        # 5.1. find label
        label_f = self._find_feature(dataset_stats.features, label_col)
        if label_f is None:
            raise ValueError(f"Label col = {label_col} is not in dataset {dataset_name} .")

        task_type = self._infer_task_type(label_f)

        # 5.2. check pos_label
        if task_type == TaskType.BinaryClassification:
            if pos_label is None:
                raise ValueError("pos_label can not be None for a binary classification task")
            elif isinstance(pos_label, str) and len(pos_label) < 1:
                raise ValueError("pos_label can not be empty for a binary classification task")

        # 6. run experiment
        if not dataset_stats.has_header:
            dataset_default_headers = [f.name for f in dataset_stats.features]
        else:
            dataset_default_headers = None

        conf = ExperimentConf(dataset_name=dataset_name,
                              dataset_has_header=dataset_stats.has_header,
                              dataset_default_headers=dataset_default_headers,
                              train_mode=train_mode,
                              label_col=label_col,
                              pos_label=pos_label,
                              task_type=task_type,
                              partition_strategy=partition_strategy,
                              cross_validation=cross_validation,
                              train_validation_holdout=train_validation_holdout,
                              datetime_series_col=datetime_series_col,
                              file_path=dataset_stats.file_path)

        model_input_features = [
            ModelFeature(name=f.name, type=f.type, data_type=f.data_type).to_dict()
            for f in dataset_stats.features if f.name != label_f.name
        ]

        if experiment_engine == FrameworkType.GBM:
            train_conf = self.run_train_job(FrameworkType.GBM, conf,
                                            no_experiment, model_input_features,  dataset_stats.n_rows)
        else:
            train_conf = self.run_train_job(FrameworkType.DeepTables, conf, no_experiment, model_input_features,
                                               dataset_stats.n_rows)

        return {
            "no_experiment": no_experiment,
            "experiment_conf": conf.to_dict(),
            "train_job_conf": train_conf
        }
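
For reference, a hypothetical request dictionary that would satisfy the keys read by experiment() above; all concrete values, including the strategy, mode and engine strings, are assumptions for illustration only:

req_dict = {
    "label_col": "target",
    "pos_label": 1,
    "train_mode": "quick",                             # assumed mode name
    "partition_strategy": "train_validation_holdout",  # assumed enum string
    "dataset_name": "my_dataset",
    "holdout_percentage": 20,
    "experiment_engine": "GBM",                        # assumed FrameworkType string
    "train_validation_holdout": {
        "train_percentage": 60,
        "validation_percentage": 20,                   # 60 + 20 + 20 == 100
    },
    "datetime_series_col": None,
}
experiment_service.experiment(req_dict)                # hypothetical service instance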
Example #19
    def create_temporary_dataset(self, req_dict):
        sample_strategy = util.require_in_dict(req_dict, 'sample_strategy',
                                               str, 'random_rows')
        if SampleConf.Strategy.Percentage == sample_strategy:
            percentage = util.get_from_dict(req_dict, 'percentage', int, 30)
            n_rows = None
        elif SampleConf.Strategy.RandomRows == sample_strategy:
            n_rows = util.get_from_dict(req_dict, 'n_rows', int, 1000)
            percentage = None
        elif SampleConf.Strategy.WholeData == sample_strategy:
            n_rows = None
            percentage = None
        else:
            raise ValueError(f"Not support sample strategy: {sample_strategy}")

        upload_took = util.require_in_dict(req_dict, 'upload_took', float)

        file_path = util.require_in_dict(req_dict, 'file_path', str)
        source_type = util.require_in_dict(req_dict, 'source_type', str)

        sample_conf = SampleConf(sample_strategy=sample_strategy,
                                 percentage=percentage,
                                 n_rows=n_rows)

        # 1. validate param
        if source_type not in [
                DatasetEntity.SourceType.Upload,
                DatasetEntity.SourceType.Import
        ]:
            raise IllegalParamException(
                'source_type', source_type,
                f'Should be one of {",".join([DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import])}'
            )

        if source_type == DatasetEntity.SourceType.Upload:
            upload_file_prefix = P.join(consts.FIELD_TMP, consts.FIELD_UPLOAD)
            if not file_path.startswith(upload_file_prefix):
                raise ValueError(
                    f"For uploaded files the path should start with {upload_file_prefix}, but it's {file_path}"
                )
            else:
                # fix relative path
                file_path = P.join(consts.DATA_DIR, file_path)

        if not P.exists(file_path):
            raise ValueError(f"File={file_path} not exists")

        if not P.isfile(file_path):
            raise ValueError(f"File={file_path} is not a file")

        util.validate_sample_conf(sample_conf)

        # 2. create
        if source_type == DatasetEntity.SourceType.Upload:
            return self._create_temporary_dataset(source_type, file_path,
                                                  upload_took, sample_conf)
        elif source_type == DatasetEntity.SourceType.Import:
            t1 = time.time()
            internal_path = util.temporary_upload_file_path(
                P.basename(file_path))
            os.makedirs(P.dirname(internal_path), exist_ok=True)
            shutil.copy(file_path, internal_path)
            took = time.time() - t1
            logger.info(f"Copy file to {internal_path}")
            return self._create_temporary_dataset(source_type, internal_path,
                                                  took, sample_conf)
        else:
            raise IllegalParamException(
                'source_type', source_type,
                f'should be one of {",".join([DatasetEntity.SourceType.Upload, DatasetEntity.SourceType.Import])}'
            )
Example #20
    def _create_temporary_dataset(self, source_type, file_path, took,
                                  sample_conf: SampleConf):
        now = util.get_now_datetime()
        file_name = P.basename(file_path)
        temporary_dataset_name = self.choose_temporary_dataset_name(
            file_name)  # use a long name
        analyze_job_name = util.analyze_data_job_name(
            util.cut_suffix(file_name), now)
        file_size = P.getsize(file_path)

        # 2. create record
        td = DatasetEntity(name=temporary_dataset_name,
                           file_size=file_size,
                           is_temporary=True,
                           status=DatasetEntity.Status.Created,
                           source_type=source_type,
                           file_path=file_path,
                           file_name=file_name,
                           create_datetime=now,
                           last_update_datetime=now)
        with db.open_session() as s:
            s.add(td)

        # 3. send  file transfer step
        if source_type == DatasetEntity.SourceType.Upload:
            step = JobStep(type=AnalyzeStep.Types.Upload,
                           status=AnalyzeStep.Status.Succeed,
                           extension={
                               "file_size": file_size,
                               "file_path": file_path
                           },
                           took=took,
                           datetime=util.get_now_long())
            self.add_analyze_process_step(temporary_dataset_name,
                                          analyze_job_name, step)
        elif source_type == DatasetEntity.SourceType.Import:
            step = JobStep(type=AnalyzeStep.Types.Copy,
                           status=AnalyzeStep.Status.Succeed,
                           extension={
                               "file_size": file_size,
                               "file_path": file_path
                           },
                           took=took,
                           datetime=util.get_now_long())
            self.add_analyze_process_step(temporary_dataset_name,
                                          analyze_job_name, step)

        # 4. create analyze config
        conf = AnalyzeJobConf(job_name=analyze_job_name,
                              dataset_name=temporary_dataset_name,
                              sample_conf=sample_conf,
                              path=file_path,
                              temporary_dataset=True,
                              label_col=None)

        # 5. start new process
        analyze_config_string = util.dumps(conf.to_dict())
        logger.info(f"Analyze job conf: {analyze_config_string}")

        python_executable = sys.executable

        temporary_dataset_dir = util.temporary_dataset_dir(
            temporary_dataset_name)

        os.makedirs(temporary_dataset_dir, exist_ok=True)

        std_log = P.join(temporary_dataset_dir, f"{analyze_job_name}.log")

        command = f"nohup {python_executable} {util.script_path('analyze_job.py')} --file_path={file_path} --job_name={analyze_job_name} --dataset_name={temporary_dataset_name} --sample_strategy={sample_conf.sample_strategy} --n_rows={self.replace_None(sample_conf.n_rows)} --percentage={self.replace_None(sample_conf.percentage)} --server_portal={consts.SERVER_PORTAL} 1>{std_log} 2>&1 &"

        logger.info(f"Run analyze job command: \n{command}")
        logger.info(f"Log file:\ntail -f {std_log}")

        # JobManager.instance().run_job(job)
        os.system(command)  # fire-and-forget; the analyze job reports progress back via server_portal

        return temporary_dataset_name, analyze_job_name
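
A hedged hardening sketch: the paths interpolated into the shell commands above are built by the application, but quoting them would still protect against spaces or shell metacharacters; shlex is in the standard library and the variable names mirror the surrounding code:

import shlex

safe_log = shlex.quote(std_log)
safe_file_path = shlex.quote(file_path)
# the quoted values would then be interpolated into `command` in place of the raw strings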