def test_creator(self):
    """End-to-end check of the job engine: build an experiment from option
    dicts, run it, and verify the Excel report is written to the job dir."""
    blood_test = dsutils.load_blood()
    blood_eval = dsutils.load_blood()
    options = {
        'test_data': blood_test,
        'eval_data': blood_eval,
        'task': const.TASK_BINARY,
        'target': "Class",
        'feature_selection': True,
        'feature_selection_strategy': "threshold",
        'evaluation_metrics': "auto",
        'evaluation_persist_prediction': True,
        'report_render': 'excel',
        'search_space': PlainSearchSpace(),
    }

    # Each test run gets its own temp working directory.
    work_dir = common_util.get_temp_dir_path(prefix="hyn_job_creator_test_")
    engine = BloodDatasetJobEngine()
    experiment = engine.create_experiment_with_params(options, work_dir)

    assert experiment
    assert isinstance(experiment, CompeteExperiment)

    experiment.run(**{"max_trials": 2})

    # The 'excel' report renderer should have produced report.xlsx.
    assert (Path(work_dir) / "report.xlsx").exists()
def foo():
    """Drive fn_foo with a spread of argument shapes: scalars, containers,
    dataframes/arrays, callables, and mixed positional/keyword calls."""
    fn_foo(1, 2, k1='lalala')
    fn_foo('dict', {'a': 'aaa', 'b': 345})
    fn_foo('list', list(range(5)))
    fn_foo('big-list', list(range(100)))
    fn_foo('big-range', range(100))
    fn_foo('df', dsutils.load_blood())
    fn_foo('ndarray', dsutils.load_blood().values)
    # Pass this very function and a lambda to exercise callable arguments.
    fn_foo('fn', foo)
    fn_foo('lambda', lambda: print('lambda'))
    fn_foo(['aaa', 3, 4, ['aaa', 'bbb']], 2, k2='lalala')
def test_experiment_with_blood_full_features():
    """Run an experiment with (nearly) every optional step enabled and verify
    the pipeline reduces to data-clean + collinearity-detection + estimator."""
    target = 'Class'
    data = dsutils.load_blood()
    df_train, df_test = train_test_split(data, train_size=0.8, random_state=335)
    # Test data must not carry the target column.
    df_test.pop(target)

    experiment = make_experiment(
        PlainModel, data,
        target=target,
        search_space=PlainSearchSpace(),
        test_data=df_test,
        feature_generation=True,
        collinearity_detection=True,
        drift_detection=True,
        feature_selection=True,
        down_sample_search=True,
        down_sample_search_size=0.2,
        feature_reselection=True,
        pseudo_labeling=True,
        random_state=335,
        early_stopping_time_limit=1200,
    )
    estimator = experiment.run(max_trials=3)
    print(estimator)

    assert estimator is not None
    fitted_steps = [name for name, _ in estimator.steps]
    assert fitted_steps == [
        StepNames.DATA_CLEAN,
        StepNames.MULITICOLLINEARITY_DETECTION,
        'estimator',
    ]
def setup_class(cls):
    """Prepare single-partition Dask dataframes of the sample datasets;
    a no-op when dask is unavailable."""
    if not is_dask_installed:
        return
    import dask.dataframe as dd
    setup_dask(cls)
    cls.boston = dd.from_pandas(dsutils.load_boston(), npartitions=1)
    cls.blood = dd.from_pandas(dsutils.load_blood(), npartitions=1)
    cls.bike_sharing = dd.from_pandas(dsutils.load_Bike_Sharing(), npartitions=1)
def test_experiment_with_blood_simple():
    """Smoke test: a minimal experiment on the blood dataset should yield
    a non-None estimator."""
    data = dsutils.load_blood()
    exp = make_experiment(
        PlainModel,
        data,
        target='Class',
        search_space=PlainSearchSpace(),
    )
    fitted = exp.run(max_trials=3)
    print(fitted)
    assert fitted is not None
def experiment_with_blood(self, init_kwargs, run_kwargs, row_count=3000, with_dask=False):
    """Build CompeteExperiment and Experiment on the blood dataset and check
    the data-character metadata both report.

    Parameters:
        init_kwargs: extra keyword args merged into both experiment constructors
                     (X_eval/y_eval/X_test are injected here).
        run_kwargs: accepted for interface compatibility; not used here.
        row_count: when loading with pandas, truncate to this many rows
                   (None keeps the full dataset).
        with_dask: take data from self.blood (dask) instead of loading with pandas.
    """
    if with_dask:
        X = self.blood.copy()
        y = X.pop('Class')
    else:
        X = dsutils.load_blood()
        if row_count is not None:
            X = X.head(row_count)
        X['Class'] = LabelEncoder().fit_transform(X['Class'])
        y = X.pop('Class')

    hyper_model = create_plain_model(with_encoder=True)

    # Split train/test, then carve an eval set out of the training portion.
    tb = get_tool_box(X, y)
    X_train, X_test, y_train, y_test = \
        tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = \
        tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

    init_kwargs = {
        'X_eval': X_eval,
        'y_eval': y_eval,
        'X_test': X_test,
        **init_kwargs
    }

    compete_experiment = CompeteExperiment(hyper_model, X_train, y_train, **init_kwargs)
    base_experiment = Experiment(hyper_model, X_train, y_train, **init_kwargs)
    mydict_compete = compete_experiment.get_data_character()
    mydict_base = base_experiment.get_data_character()

    assert mydict_base
    assert mydict_compete
    assert mydict_base['experimentType'] == 'base'
    assert mydict_compete['experimentType'] == 'compete'

    # Binary target: exactly two classes; continuous stats are absent.
    assert mydict_base['target']['taskType'] == 'binary'
    assert mydict_base['target']['freq'] is not None
    # BUGFIX: was `is 2` — identity comparison on an int literal is
    # implementation-dependent (and a SyntaxWarning since Python 3.8);
    # equality is the correct check.
    assert mydict_base['target']['unique'] == 2
    assert mydict_base['target']['mean'] is None
    assert mydict_base['target']['max'] is None
    assert mydict_base['target']['min'] is None
    assert mydict_base['target']['stdev'] is None
    assert mydict_base['target']['dataType']
    assert len(mydict_base['targetDistribution']) <= 10
    assert mydict_base['datasetShape']['X_train']
    assert mydict_base['datasetShape']['y_train']
    assert mydict_base['datasetShape']['X_eval']
    assert mydict_base['datasetShape']['y_eval']
    assert mydict_base['datasetShape']['X_test']
    # Only the compete experiment reports per-feature distributions.
    assert mydict_compete['featureDistribution']
def setup_class(cls):
    """Materialize pandas and cuDF copies of the test datasets and create
    the working directory."""
    from sklearn.preprocessing import LabelEncoder

    bank = dsutils.load_bank()
    bank['y'] = LabelEncoder().fit_transform(bank['y'])  # binary task target
    # multiclass task target
    bank['education'] = LabelEncoder().fit_transform(bank['education'])
    cls.bank_data = bank
    cls.bank_data_cudf = cudf.from_pandas(bank)

    # NOTE(review): attribute is named boston_data but loads the blood
    # dataset — confirm whether the name or the loader is intended.
    cls.boston_data = dsutils.load_blood()
    cls.boston_data_cudf = cudf.from_pandas(cls.boston_data)

    cls.movie_lens = dsutils.load_movielens()

    os.makedirs(cls.work_dir)
def test_experiment_with_blood_down_sample():
    """Verify an experiment configured with down-sampled search completes
    and produces an estimator."""
    data = dsutils.load_blood()
    exp = make_experiment(
        PlainModel, data,
        target='Class',
        search_space=PlainSearchSpace(),
        down_sample_search=True,
        down_sample_search_size=0.1,
        down_sample_search_time_limit=300,
        down_sample_search_max_trials=10,
    )
    fitted = exp.run(max_trials=3)
    print(fitted)
    assert fitted is not None
def run_export_excel_report(maker, has_eval_data=True, str_label=True):
    """Run an experiment built by `maker` and validate the Excel-report and
    evaluate callbacks; returns the collected ExperimentMeta.

    The dataset is augmented with a constant column, an id column, and a
    deliberately drifted column so the report has something to flag.
    """
    df = dsutils.load_blood()
    n_rows = df.shape[0]
    df['Constant'] = [0] * n_rows
    df['Id'] = list(range(n_rows))

    target = 'Class'
    if str_label:
        labels = ["no", "yes"]
        df[target] = df[target].map(lambda v: labels[v])

    df_train, df_eval = train_test_split(df, test_size=0.2)
    # Scale the eval side by 100x so drift detection has a clear signal.
    df_train['Drifted'] = np.random.random(df_train.shape[0])
    df_eval['Drifted'] = np.random.random(df_eval.shape[0]) * 100

    file_path = common_util.get_temp_file_path(prefix="report_excel_", suffix=".xlsx")
    print(file_path)

    experiment = maker(df_train, target, df_eval, file_path)
    estimator = experiment.run(max_trials=3)
    assert estimator is not None

    # Locate the report/evaluate callbacks attached by the maker.
    mlr_callback = None
    mle_callback = None
    for cb in experiment.callbacks:
        if isinstance(cb, MLReportCallback):
            mlr_callback = cb
        if isinstance(cb, MLEvaluateCallback):
            mle_callback = cb

    assert mlr_callback is not None
    _experiment_meta: ExperimentMeta = mlr_callback.experiment_meta_
    assert len(_experiment_meta.resource_usage) > 0
    assert os.path.exists(file_path)

    if has_eval_data:
        assert mle_callback is not None
        assert _experiment_meta.confusion_matrix is not None
        assert _experiment_meta.classification_report is not None
        assert len(_experiment_meta.prediction_elapsed) == 2
        # binary classification
        assert _experiment_meta.confusion_matrix.data.shape == (2, 2)
        assert len(_experiment_meta.datasets) == 3
    else:
        assert len(_experiment_meta.datasets) == 2

    return _experiment_meta
def _create_experiment(self, make_options):
    """Build a make_experiment-based experiment over the blood dataset,
    forwarding `make_options` as keyword arguments."""
    from hypernets.experiment import make_experiment

    data = dsutils.load_blood()
    return make_experiment(PlainModel, data, **make_options)