Example #1
 def get_hash(self):
     assert self.column_descriptions is not None
     m = hashlib.md5()
     get_hash_of_str(self.dataset_type, m)
     get_hash_of_str(self.dataset_source, m)
     get_hash_of_str(str(list(self.columns)), m)
     get_hash_of_dict(self.column_descriptions, m)
     return get_hash_of_dataframe(self.data, m)
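
The helper `get_hash_of_dataframe` is not defined anywhere in this listing. A minimal sketch of what it might look like, assuming `self.data` is a pandas DataFrame and following Example #1's convention that helpers update the shared MD5 object `m` and return the hex digest:

import hashlib
import pandas as pd

def get_hash_of_dataframe(df, m=None):
    # hypothetical sketch: fold pandas' stable per-row hashes into the running digest
    if m is None:
        m = hashlib.md5()
    row_hashes = pd.util.hash_pandas_object(df, index=True)
    m.update(row_hashes.values.tobytes())
    return m.hexdigest()
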
Example #2
 def test_get_hash_of_dict(self):
     o1 = {
         "A": [3, 4, 2],
         "C": {
             "D": [3, 2, 1, "s", False],
             "B": "9"
         },
         "B": []
     }
     o2 = {
         "A": [4, 3, 2],
         "B": [],
         "C": {
             "B": "9",
             "D": [False, 3, 2, 1, "s"],
         },
     }
     o3 = {
         "A": [4, 3, 1],
         "B": [],
         "C": {
             "B": "9",
             "D": [False, 3, 2, 1, "s"],
         },
     }
     self.assertEqual(get_hash_of_dict(o1), get_hash_of_dict(o2))
     self.assertEqual(get_hash_of_dict(o1), get_hash_of_dict(o1))
     self.assertNotEqual(get_hash_of_dict(o1), get_hash_of_dict(o3))
     self.assertNotEqual(get_hash_of_dict(o2), get_hash_of_dict(o3))
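
Neither `get_hash_of_str` nor `get_hash_of_dict` is shown in this listing. A minimal sketch consistent with the assertions above (sorted keys, order-insensitive lists, and an optional shared MD5 object as in Example #1) could be:

import hashlib

def get_hash_of_str(s, m=None):
    # hypothetical sketch: feed the string into an (optionally shared) MD5 object
    if m is None:
        m = hashlib.md5()
    m.update(str(s).encode("utf-8"))
    return m.hexdigest()

def get_hash_of_dict(o, m=None):
    # hypothetical sketch: sort dict keys and list elements before hashing,
    # so o1 and o2 above hash equal while o3 does not
    if m is None:
        m = hashlib.md5()

    def normalize(x):
        if isinstance(x, dict):
            return [(k, normalize(x[k])) for k in sorted(x)]
        if isinstance(x, list):
            # sort by repr so mixed types (ints, strs, bools) compare safely
            return sorted((normalize(e) for e in x), key=repr)
        return x

    m.update(str(normalize(o)).encode("utf-8"))
    return m.hexdigest()
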
Example #3
 def insert_to_hdls_table(self, hdl):
     self.init_hdls_table()
     hdl_hash = get_hash_of_dict(hdl)
     hdl_id = hdl_hash
     records = self.HDLsModel.select().where(
         self.HDLsModel.hdl_id == hdl_id)
     if len(records) == 0:
         self.HDLsModel.create(hdl_id=hdl_id, hdl=hdl)
     self.hdl_id = hdl_id
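
`self.HDLsModel` is a peewee model created elsewhere. A hypothetical schema matching the fields used in Examples #3 and #4 (the database path and the choice to store the HDL as serialized text are assumptions):

import peewee

db = peewee.SqliteDatabase("hdls.db")  # hypothetical backing database

class HDLsModel(peewee.Model):
    # hypothetical schema inferred from the fields used above
    hdl_id = peewee.CharField(primary_key=True)  # MD5 digest of the HDL dict
    hdl = peewee.TextField()                     # the HDL itself, e.g. JSON-serialized
    meta_data = peewee.TextField(null=True)      # optional metadata (see Example #4)

    class Meta:
        database = db
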
Example #4
 def insert_to_hdls_table(self, hdl, hdl_metadata):
     self.init_hdls_table()
     hdl_hash = get_hash_of_dict(hdl)
     hdl_id = hdl_hash
     records = self.HDLsModel.select().where(
         self.HDLsModel.hdl_id == hdl_id)
     if len(records) == 0:
         self.HDLsModel.create(hdl_id=hdl_id,
                               hdl=hdl,
                               meta_data=hdl_metadata)
     else:
         old_meta_data = records[0].meta_data
         meta_data = update_data_structure(old_meta_data, hdl_metadata)
         self.HDLsModel(hdl_id=hdl_id, hdl=hdl, meta_data=meta_data).save()
     self.hdl_id = hdl_id
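
`update_data_structure` is also external to this listing. One plausible reading, sketched here as a recursive merge where the new metadata wins on conflicts:

def update_data_structure(old, new):
    # hypothetical sketch: recursive dict merge where `new` overrides `old`
    # and nested dicts are merged instead of replaced wholesale
    merged = dict(old)
    for key, value in new.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = update_data_structure(merged[key], value)
        else:
            merged[key] = value
    return merged
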
Example #5
 def _fit(self,
          estimator,
          X_train,
          y_train=None,
          X_valid=None,
          y_valid=None,
          X_test=None,
          y_test=None,
          feature_groups=None,
          columns_metadata=None):
     # keep parameters for the other datasets to make future model extensions easier
     X = self.prepare_X_to_fit(X_train, X_valid, X_test)
     if self.store_intermediate:
         if self.resource_manager is None:
             print(
                 "warn: no resource_manager when store_intermediate is True"
             )
             fitted_estimator = self.core_fit(estimator, X, y_train,
                                              X_valid, y_valid, X_test,
                                              y_test, feature_groups,
                                              columns_metadata)
         else:
             # get hash value from X, y, hyperparameters
             Xy_hash = get_hash_of_Xy(X, y_train)
             hp_hash = get_hash_of_dict(self.processed_params)
             hash_value = Xy_hash + "-" + hp_hash
             result = self.resource_manager.redis_get(hash_value)
             if result is None:
                 fitted_estimator = estimator.fit(X, y_train)
                 self.resource_manager.redis_set(
                     hash_value, pickle.dumps(fitted_estimator))
             else:
                 fitted_estimator = pickle.loads(result)
     else:
         fitted_estimator = self.core_fit(estimator, X, y_train, X_valid,
                                          y_valid, X_test, y_test,
                                          feature_groups, columns_metadata)
     self.resource_manager = None  # avoid "cannot pickle" errors when this estimator is serialized
     return fitted_estimator
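
`get_hash_of_Xy` is not shown either. A sketch under the assumption that X and y are pandas-compatible, so the Redis key above changes whenever features or labels change:

import hashlib
import pandas as pd

def get_hash_of_Xy(X, y=None, m=None):
    # hypothetical sketch: one digest covering both features and labels
    if m is None:
        m = hashlib.md5()
    m.update(pd.util.hash_pandas_object(pd.DataFrame(X), index=True).values.tobytes())
    if y is not None:
        m.update(pd.util.hash_pandas_object(pd.Series(y), index=True).values.tobytes())
    return m.hexdigest()
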
Example #6
    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_test=None, y_test=None, fit_final_estimator=True,
            resource_manager=None):
        # set a default `self.last_data` to prevent exceptions in classifier-only cases
        self.last_data = {
            "X_train": X_train,
            "y_train": y_train,
            "X_test": X_test,
            "X_valid": X_valid,
        }
        time_cost_list = []
        # `with_final` is False only when both `fit_final_estimator` and `is_estimator` are True
        for (step_idx, step_name, transformer) in self._iter(
                with_final=(not (fit_final_estimator and self.is_estimator)),
                filter_passthrough=False):
            # todo: store intermediate results
            cache_intermediate = False
            hyperparams = {}
            hit_cache = False
            dataset_id = None
            cache_key = None
            start_time = time()
            if getattr(transformer, "cache_intermediate", False):
                if self.resource_manager is None:
                    self.logger.warning(
                        f"In ML Workflow step '{step_name}', 'cache_intermediate' is set to True, but resource_manager is None.")
                else:
                    hyperparams = getattr(transformer, "hyperparams")
                    if not isinstance(hyperparams, dict):
                        self.logger.warning(f"In ML Workflow step '{step_name}', transformer's 'hyperparams' is not a dict.")
                    else:
                        cache_intermediate = True
            if cache_intermediate:
                if hasattr(transformer, "prepare_X_to_fit"):
                    def stack_X_before_fit(X_train, X_valid, X_test, **kwargs):
                        t = transformer
                        X_train_f = t.filter_feature_groups(X_train)
                        X_stack_ = t.prepare_X_to_fit(
                            X_train_f,
                            t.filter_feature_groups(X_valid),
                            t.filter_feature_groups(X_test),
                        )
                        X_stack_pre = X_train_f.copy()
                        X_stack_pre.data = X_stack_
                        return X_stack_pre

                else:
                    def stack_X_before_fit(X_train, X_valid, X_test, **kwargs):
                        self.logger.warning(
                            f"In ML Workflow step '{step_name}', transformer doesn't have a 'prepare_X_to_fit' attribute.")
                        return X_train

                X_stack_pre = stack_X_before_fit(X_train, X_valid, X_test)
                dataset_id = X_stack_pre.get_hash()
                component_name = transformer.__class__.__name__
                m = hashlib.md5()
                get_hash_of_str(component_name, m)
                get_hash_of_str(str(transformer.in_feature_groups), m)
                get_hash_of_str(str(transformer.out_feature_groups), m)
                component_hash = get_hash_of_dict(hyperparams, m)
                cache_key = f"workflow-{component_hash}-{dataset_id}"
                cache_results = self.resource_manager.cache.get(cache_key)
                if cache_results is not None and isinstance(cache_results, dict) \
                        and "X_trans" in cache_results and "component" in cache_results:
                    self.logger.debug(f"workflow cache hit, component_name = {component_name},"
                                      f" dataset_id = {dataset_id}, cache_key = '{cache_key}'")
                    X_trans = cache_results["X_trans"]
                    fitted_transformer = cache_results["component"]  # assigned into self.steps at the end of this iteration
                    X_stack = fitted_transformer.get_X_stack(X_train, X_valid, X_test)
                    result = fitted_transformer.assemble_all_result(
                        X_stack, X_trans, X_train, X_valid, X_test, y_train)
                    hit_cache = True
                else:
                    self.logger.debug(f"workflow cache miss, component_name = {component_name},"
                                      f" dataset_id = {dataset_id}, cache_key = '{cache_key}'")
                    fitted_transformer = transformer.fit(X_train, y_train, X_valid, y_valid, X_test, y_test)
                    X_stack, X_trans = transformer.transform(X_train, X_valid, X_test, y_train, return_stack_trans=True)
                    result = transformer.assemble_all_result(X_stack, X_trans, X_train, X_valid, X_test, y_train)
                    self.resource_manager.cache.set(
                        cache_key, {
                            "X_trans": X_trans,
                            "component": fitted_transformer
                        }
                    )
                    # todo: add some metadata
            else:
                fitted_transformer = transformer.fit(X_train, y_train, X_valid, y_valid, X_test, y_test)
                result = transformer.transform(X_train, X_valid, X_test, y_train)
            if self.resource_manager is not None and self.resource_manager.should_record_workflow_step:
                self.resource_manager.insert_workflow_step_record(
                    config_id=self.config_id,
                    experiment_id=self.resource_manager.experiment_id,
                    config=self.config,
                    step_idx=step_idx,
                    step_name=step_name,
                    component_name=transformer.__class__.__name__,
                    hyperparams=hyperparams,
                    dataset_id=dataset_id,
                    hit_cache=hit_cache,
                    cache_key=cache_key
                )
            X_train = result["X_train"]
            X_valid = result.get("X_valid")
            X_test = result.get("X_test")
            y_train = result.get("y_train")
            if self.should_store_intermediate_result:
                current_dict = {}
                self.update_data_container_to_dataset_id(step_name, "X_train", X_train, current_dict)
                self.update_data_container_to_dataset_id(step_name, "X_valid", X_valid, current_dict)
                self.update_data_container_to_dataset_id(step_name, "X_test", X_test, current_dict)
                self.update_data_container_to_dataset_id(step_name, "y_train", y_train, current_dict)
                self.intermediate_result.update({step_name: current_dict})
            self.last_data = result
            self.steps[step_idx] = (step_name, fitted_transformer)
            cost_time = time() - start_time
            time_cost_list.append([
                step_idx,
                step_name,
                transformer.__class__.__name__,
                hit_cache,
                cost_time,
            ])

        if (fit_final_estimator and self.is_estimator):
            # self._final_estimator.resource_manager = self.resource_manager
            start_time = time()
            self._final_estimator.fit(X_train, y_train, X_valid, y_valid, X_test, y_test)
            cost_time = time() - start_time
            time_cost_list.append([
                len(self.steps),
                self.steps[-1][0],
                self._final_estimator.__class__.__name__,
                False,
                cost_time
            ])
            # self._final_estimator.resource_manager = None
        self.fitted = True
        self.time_cost_list = time_cost_list
        return self
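
For readability, the cache-key construction inlined in the loop above can be restated as a standalone helper. `make_workflow_cache_key` is a hypothetical name, and it reuses the `get_hash_of_str`/`get_hash_of_dict` sketches from Example #2:

import hashlib

def make_workflow_cache_key(component_name, in_feature_groups, out_feature_groups,
                            hyperparams, dataset_id):
    # restatement of the inline logic in Example #6: the component identity and
    # its hyperparameters share one MD5, then the dataset hash is appended
    m = hashlib.md5()
    get_hash_of_str(component_name, m)
    get_hash_of_str(str(in_feature_groups), m)
    get_hash_of_str(str(out_feature_groups), m)
    component_hash = get_hash_of_dict(hyperparams, m)
    return f"workflow-{component_hash}-{dataset_id}"
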