def predict_proba(self, X, *, ingore_transformer=False, **kwargs): eval_set = kwargs.pop('eval_set', None) # ignore if not ingore_transformer and self.transformer is not None: logger.info('transform data') X = self.transformer.transform(X) tb_original = get_tool_box(X) X, = tb_original.to_local(X) tb = get_tool_box(X) if self.cv_models_: proba_sum = None for n, est in enumerate(self.cv_models_): logger.info(f'predict_proba estimator {n}') proba = est.predict_proba(X, **kwargs) if self.task == const.TASK_BINARY: proba = tb.fix_binary_predict_proba_result(proba) if proba_sum is None: proba_sum = proba else: proba_sum += proba proba = proba_sum / len(self.cv_models_) else: logger.info('predict_proba') proba = self.model.predict_proba(X, **kwargs) if self.task == const.TASK_BINARY: proba = tb.fix_binary_predict_proba_result(proba) proba, = tb_original.from_local(proba) return proba
def test_get_tool_box(self):
    """get_tool_box resolves DaskToolBox both for the dd.DataFrame type and for an instance."""
    # lookup by type
    assert get_tool_box(dd.DataFrame) is DaskToolBox

    # lookup by a concrete dask dataframe instance
    pdf = pd.DataFrame(dict(x1=['a', 'b', 'c'], x2=[1, 2, 3]))
    ddf = dd.from_pandas(pdf, npartitions=1)
    assert get_tool_box(ddf) is DaskToolBox
def check_dataframe(df1, df2, *, shape=True, columns=True, dtypes=True, values=True, delta=1e-5):
    """Assert two dataframes are equal, with an absolute tolerance for float columns.

    Non-pandas inputs (e.g. dask/cudf) are brought to a local pd.DataFrame first.
    Each check can be switched off via the corresponding keyword flag.
    Returns True when all enabled checks pass (raises AssertionError otherwise).
    """
    from hypernets.tabular import get_tool_box

    def _as_pandas(df):
        # bring a non-pandas frame to local memory and wrap it as pd.DataFrame
        if isinstance(df, pd.DataFrame):
            return df
        local, = get_tool_box(df).to_local(df)
        return pd.DataFrame(local)

    df1 = _as_pandas(df1)
    df2 = _as_pandas(df2)

    if shape:
        assert df1.shape == df2.shape, 'The same dataframe shape is required.'
    if columns:
        assert all(df1.columns == df2.columns), 'The same column names were required.'
    if dtypes:
        assert df1.dtypes.tolist() == df2.dtypes.tolist(), 'The same column dtypes were required.'
    if values:
        if not columns:
            # align names so positional columns can be compared
            df2.columns = df1.columns
        float_cols = df1.select_dtypes(['float32', 'float64']).columns.tolist()
        if float_cols:
            # float columns: compare with tolerance
            value_diff = (df1[float_cols] - df2[float_cols]).abs().max().max()
            assert value_diff < delta
            df1_nofloat = df1[[c for c in df1.columns.tolist() if c not in float_cols]]
            df2_nofloat = df2[[c for c in df2.columns.tolist() if c not in float_cols]]
        else:
            df1_nofloat = df1
            df2_nofloat = df2
        if df1_nofloat.shape[1] > 0:
            # non-float columns: exact equality
            assert (df1_nofloat == df2_nofloat).all().all(), 'all value should be equal.'
    return True
def test_basic(self):
    """Data hasher: identical content hashes equal; row/value changes change the hash."""
    hasher = get_tool_box(pd.DataFrame).data_hasher()

    base = pd.read_csv(io.StringIO(csv_str))
    base_hash = hasher(base)

    # same csv parsed again -> same hash
    assert hasher(pd.read_csv(io.StringIO(csv_str))) == base_hash

    # fewer rows -> different hash
    assert hasher(base.head(5)) != base_hash

    # duplicated extra row -> different hash
    appended = pd.concat([base, base.head(1)], axis=0)
    assert hasher(appended) != base_hash

    # deep copy with the same column values -> same hash
    same = copy.deepcopy(base)
    same['x1_int_nanchar'] = ['1.0', '2.2', '\\N', '4.', '5', '6']
    assert hasher(same) == base_hash

    # a single changed cell -> different hash
    changed = copy.deepcopy(base)
    changed['x1_int_nanchar'] = ['2.0', '2.2', '\\N', '4.', '5', '6']
    assert hasher(changed) != base_hash
def _get_estimator(self, space_sample):
    """Build the estimator via the parent class, then wrap it for dask execution."""
    from hypernets.tabular import get_tool_box
    import dask.dataframe as dd

    base_estimator = super()._get_estimator(space_sample)
    toolbox = get_tool_box(dd.DataFrame)
    return toolbox.wrap_local_estimator(base_estimator)
def predict(self, X, **kwargs): eval_set = kwargs.pop('eval_set', None) # ignore if self.transformer is not None: logger.info('transform local') X = self.transformer.transform(X) logger.info('bring X,y to local') tb_original = get_tool_box(X) X, = tb_original.to_local(X) if self.cv_models_: if self.task == const.TASK_REGRESSION: pred_sum = None for n, est in enumerate(self.cv_models_): logger.info(f'predict estimator {n}') pred = est.predict(X, **kwargs) if pred_sum is None: pred_sum = pred else: pred_sum += pred preds = pred_sum / len(self.cv_models_) else: logger.info('predict_proba') proba = self.predict_proba(X, ingore_transformer=True, **kwargs) logger.info('proba2predict') preds = self.proba2predict(proba) preds = np.array(self.classes_).take(preds, axis=0) else: logger.info('predict') preds = self.model.predict(X, **kwargs) preds, = tb_original.from_local(preds) return preds
def test_concat_df(self):
    """concat_df on cudf accepts DataFrame+DataFrame, DataFrame+ndarray, Series+ndarray."""
    df = cudf.DataFrame(dict(
        x1=['a', 'b', 'c'],
        x2=[1, 2, 3],
        x3=[4., 5, 6],
    ))
    toolbox = get_tool_box(cudf.DataFrame)

    # DataFrame + DataFrame
    got = toolbox.concat_df([df, df], axis=0)
    expected = cudf.concat([df, df], axis=0)
    assert (got == expected).all().all()

    # DataFrame + ndarray
    num = df[['x2', 'x3']]
    got = toolbox.concat_df([num, num.values], axis=0)
    expected = cudf.concat([num, num], axis=0)
    assert isinstance(got, cudf.DataFrame)
    assert (got == expected).all().all()

    # Series + ndarray
    s = df['x2']
    got = toolbox.concat_df([s, s.values], axis=0)
    expected = cudf.concat([s, s], axis=0)
    assert isinstance(got, cudf.Series)
    assert (got == expected).all()
def test_transform(self):
    """Fit/transform the Dask preprocessor on the adult dataset and check the output.

    Fixed: the trailing checks were written as `assert expr, (expected,)`, where the
    tuple is the assertion *message*, so e.g. `assert X1.shape, (..., 25)` only
    tested truthiness and never compared anything. They now use real comparisons.
    """
    df_train = dsutils.load_adult()
    df_train = dd.from_pandas(df_train, npartitions=2)
    y = df_train.pop(14)  # .values
    X = df_train
    X_train, X_test, y_train, y_test = get_tool_box(X, y).train_test_split(
        X, y, test_size=0.2, random_state=42)
    conf = deeptable.ModelConfig(auto_discrete=True,
                                 auto_imputation=True,
                                 auto_encode_label=True,
                                 auto_categorize=True,
                                 apply_gbm_features=False)
    processor = DefaultDaskPreprocessor(conf, compute_to_local=True)
    X1, y1 = processor.fit_transform(X_train, y_train)
    X2, y2 = processor.transform(X_test, y_test)

    expected_columns = {
        'x_1', 'x_3', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_13',
        'x_0_cat', 'x_4_cat', 'x_10_cat', 'x_11_cat', 'x_12_cat',
        'x_2', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12',
        'x_2_discrete', 'x_0_discrete', 'x_4_discrete',
        'x_10_discrete', 'x_11_discrete', 'x_12_discrete',
    }
    # no unexpected columns, and train/test column sets agree
    assert len(set(X1.columns.tolist()) - expected_columns) == 0
    assert len(set(X1.columns) - set(X2.columns)) == 0
    # 25 output features for both splits (row counts come from the split itself)
    assert X1.shape[1] == 25
    assert X2.shape[1] == 25
    # label sums are deterministic given random_state=42
    assert y1.sum() == 6297
    assert y2.sum() == 1544
def test_var_categorical_feature(self):
    """A variable-length categorical column ('genres') becomes a model input for regression."""
    X = self.df.copy()
    y = X.pop('rating').values.astype('float32')

    cat_cols = ["movie_id", "user_id", "gender", "occupation", "zip", "title", "age"]
    conf = deeptable.ModelConfig(nets=['dnn_nets'],
                                 task=consts.TASK_REGRESSION,
                                 categorical_columns=cat_cols,
                                 metrics=['mse'],
                                 fixed_embedding_dim=True,
                                 embeddings_output_dim=4,
                                 apply_gbm_features=False,
                                 apply_class_weight=True,
                                 earlystopping_patience=5,
                                 var_len_categorical_columns=[('genres', "|", "max")])
    dt = deeptable.DeepTable(config=conf)

    X_train, X_validation, y_train, y_validation = \
        get_tool_box(X).train_test_split(X, y, test_size=0.2)
    model, history = dt.fit(X_train, y_train,
                            validation_data=(X_validation, y_validation),
                            epochs=10,
                            batch_size=32)
    assert 'genres' in model.model.input_names
def train(X_train, y_train, X_eval, y_eval, task=None, reward_metric=None, optimize_direction='max', **kwargs):
    """Run an MCTS search over PlainSearchSpace and train a final model from the best trial.

    When task/reward_metric are omitted they are inferred from y_train
    (rmse for regression, accuracy otherwise). Returns (hyper_model, final_model).
    """
    from hypernets.core.callbacks import SummaryCallback
    from hypernets.searchers import make_searcher

    if task is None:
        task, _ = get_tool_box(y_train).infer_task_type(y_train)
    if reward_metric is None:
        reward_metric = 'rmse' if task == const.TASK_REGRESSION else 'accuracy'

    searcher = make_searcher('mcts', PlainSearchSpace(), optimize_direction=optimize_direction)
    hm = PlainModel(searcher=searcher,
                    task=task,
                    reward_metric=reward_metric,
                    callbacks=[SummaryCallback()])
    hm.search(X_train, y_train, X_eval, y_eval, **kwargs)

    best_trial = hm.get_best_trial()
    model = hm.final_train(best_trial.space_sample, X_train, y_train)
    return hm, model
def _fix_softmax_proba(self, n_rows, proba):
    """Normalize single-column softmax output into the standard two-column proba form.

    `n_rows` is kept for interface compatibility; the toolbox helper handles
    the reshaping. Returns None when proba is None.
    """
    if proba is None:
        return None
    return get_tool_box(proba).fix_binary_predict_proba_result(proba)
def transformers(self):
    """Expose the dask toolbox's transformer classes as attributes of a dummy namespace."""
    import dask.dataframe as dd

    holder = DefaultDaskPreprocessor.Dummy()
    for name, transformer in get_tool_box(dd.DataFrame).transformers.items():
        setattr(holder, name, transformer)
    return holder
def experiment_with_boston(self, init_kwargs, run_kwargs, row_count=3000, with_dask=False):
    """Build base and compete experiments over boston data and check their data character.

    Fixed: `mydict_base['target']['dataType'] is 'float'` compared *identity* with a
    string literal, which depends on CPython string interning; it now uses `==`.
    """
    if with_dask:
        X = self.boston
        y = X.pop('target')
    else:
        X = dsutils.load_boston()
        if row_count is not None:
            X = X.head(row_count)
        X['target'] = LabelEncoder().fit_transform(X['target'])
        y = X.pop('target')
        y = y.astype('float64')
    hyper_model = create_plain_model(with_encoder=True)

    tb = get_tool_box(X, y)
    X_train, X_test, y_train, y_test = \
        tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = \
        tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

    init_kwargs = {
        'X_eval': X_eval,
        'y_eval': y_eval,
        'X_test': X_test,
        **init_kwargs
    }
    compete_experiment = CompeteExperiment(hyper_model, X_train, y_train, **init_kwargs)
    base_experiment = Experiment(hyper_model, X_train, y_train, **init_kwargs)
    mydict_compete = compete_experiment.get_data_character()
    mydict_base = base_experiment.get_data_character()

    assert mydict_base
    assert mydict_compete
    assert mydict_base['experimentType'] == 'base'
    assert mydict_compete['experimentType'] == 'compete'
    assert mydict_base['target']['taskType'] == 'regression'
    assert mydict_base['target']['freq'] is None
    assert mydict_base['target']['unique']
    assert mydict_base['target']['mean'] is not None
    assert mydict_base['target']['max'] is not None
    assert mydict_base['target']['min'] is not None
    assert mydict_base['target']['stdev'] is not None
    assert mydict_base['target']['dataType'] == 'float'  # was: `is 'float'`
    assert len(mydict_base['targetDistribution']) <= 10
    assert mydict_base['datasetShape']['X_train']
    assert mydict_base['datasetShape']['y_train']
    assert mydict_base['datasetShape']['X_eval']
    assert mydict_base['datasetShape']['y_eval']
    assert mydict_base['datasetShape']['X_test']
    assert mydict_compete['featureDistribution']
def run(distribute_strategy=None, batch_size=32, epochs=5):
    """End-to-end DeepFM demo on the bank dataset loaded via dask:
    train, save to disk, reload, evaluate, then score with calc_score.
    """
    # loading data
    df = dsutils.load_bank_by_dask()
    df_train, df_test = get_tool_box(df).train_test_split(df, test_size=0.2, random_state=42)
    y = df_train.pop('y')
    y_test = df_test.pop('y')
    # persist keeps the split collections materialized in the dask cluster
    df_train, y, df_test, y_test = dask.persist(df_train, y, df_test, y_test)

    # training
    config = deeptable.ModelConfig(
        nets=deepnets.DeepFM,
        earlystopping_patience=5,
        distribute_strategy=distribute_strategy,
    )
    dt = deeptable.DeepTable(config=config)
    model, history = dt.fit(df_train, y, batch_size=batch_size, epochs=epochs)

    # save
    model_path = 'model_by_dask'
    dt.save(model_path)
    print(f'saved to {model_path}')

    # evaluation -- reload from disk to exercise the save/load round trip
    model_path = 'model_by_dask'
    dt2 = deeptable.DeepTable.load(model_path)
    result = dt2.evaluate(df_test, y_test, batch_size=512, verbose=0)
    print('score:', result)

    # scoring
    preds = dt2.predict(df_test, batch_size=512, )
    proba = dt2.predict_proba(df_test, batch_size=512, )
    print(get_tool_box(y_test).metrics.calc_score(y_test, preds, proba,
                                                  metrics=['accuracy', 'auc']))
def experiment_with_movie_lens(init_kwargs, run_kwargs, row_count=None, with_dask=False):
    """Run a CompeteExperiment on the movielens data and sanity-check the scores."""
    hyper_model = create_plain_model(reward_metric='f1', with_encoder=True, with_dask=with_dask)

    X = dsutils.load_movielens()
    X['timestamp'] = X['timestamp'].apply(datetime.fromtimestamp)
    if row_count is not None:
        X = X.head(row_count)
    if with_dask:
        setup_dask(None)
        X = dd.from_pandas(X, npartitions=1)
    y = X.pop('rating')

    tb = get_tool_box(X, y)
    X_train, X_test, y_train, y_test = \
        tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = \
        tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

    # caller-supplied kwargs take precedence over these defaults
    init_kwargs = {
        'X_eval': X_eval,
        'y_eval': y_eval,
        'X_test': X_test,
        'ensemble_size': 0,
        'drift_detection': False,
        **init_kwargs
    }
    run_kwargs = {'max_trials': 3, **run_kwargs}

    experiment = CompeteExperiment(hyper_model, X_train, y_train, **init_kwargs)
    estimator = experiment.run(**run_kwargs)
    assert estimator

    preds = estimator.predict(X_test)
    proba = estimator.predict_proba(X_test)
    if with_dask:
        preds, proba = tb.to_local(preds, proba)

    score = tb.metrics.calc_score(y_test, preds, proba,
                                  metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'],
                                  task=experiment.task)
    print('evaluate score:', score)
    assert score
def setup_class(self):
    """Build a regression DeepTable, split the data 80/20, and train for up to 100 epochs."""
    self.X, self.y = self.load_data()

    config = deeptable.ModelConfig(task=consts.TASK_REGRESSION,
                                   metrics=[r2_c, 'RootMeanSquaredError'],
                                   apply_gbm_features=False)
    self.dt = deeptable.DeepTable(config=config)

    splits = get_tool_box(self.X).train_test_split(self.X, self.y,
                                                   test_size=0.2, random_state=42)
    self.X_train, self.X_test, self.y_train, self.y_test = splits
    self.model, self.history = self.dt.fit(self.X_train, self.y_train,
                                           batch_size=32, epochs=100)
def test_detect_estimator_lightgbm(self):
    """LightGBM on GPU is installed/initialized/fitted, but has no cudf fit support."""
    detector = get_tool_box(cudf.DataFrame).estimator_detector(
        'lightgbm.LGBMClassifier',
        'binary',
        init_kwargs={'device': 'GPU'},
    )
    # lightgbm dose not support cudf.DataFrame
    assert detector() == {'installed', 'initialized', 'fitted'}
def _get_tool_box_for_cache(*args, **kwargs):
    """Resolve a toolbox from the dataframe/array-like positional args.

    Keyword arguments are accepted but ignored. Falls back to the pandas
    toolbox when no positional arg looks like a DataFrame or array type.
    """
    markers = ('DataFrame', 'array', 'Array')
    dtypes = [type(a) for a in args
              if any(m in str(type(a)) for m in markers)]
    if not dtypes:
        dtypes.append(pd.DataFrame)
    return get_tool_box(*dtypes)
def test_datetime_derivation(self):
    """Year/month/week features are derived from a datetime column."""
    frame = pd.DataFrame(data={"x1": [datetime.now()]})
    toolbox = get_tool_box(frame)
    transformer = toolbox.transformers['FeatureGenerationTransformer'](
        task='binary', trans_primitives=["year", "month", "week"])
    transformer.fit(frame)
    derived = transformer.transform(frame)
    for col in ("YEAR__x1__", "MONTH__x1__", "WEEK__x1__"):
        assert col in derived
def experiment_with_bank_data(init_kwargs, run_kwargs, row_count=3000, with_dask=False):
    """Run a CompeteExperiment over the bank dataset and sanity-check the scores."""
    hyper_model = create_plain_model(with_encoder=True, with_dask=with_dask)

    X = dsutils.load_bank()
    if row_count is not None:
        X = X.head(row_count)
    X['y'] = LabelEncoder().fit_transform(X['y'])
    if with_dask:
        setup_dask(None)
        X = dd.from_pandas(X, npartitions=1)
    y = X.pop('y')

    tb = get_tool_box(X, y)
    scorer = tb.metrics.metric_to_scoring(hyper_model.reward_metric)
    X_train, X_test, y_train, y_test = \
        tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = \
        tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

    # caller-supplied kwargs take precedence over these defaults
    init_kwargs = {
        'X_eval': X_eval,
        'y_eval': y_eval,
        'X_test': X_test,
        'scorer': scorer,
        'ensemble_size': 0,
        'drift_detection': False,
        **init_kwargs
    }
    run_kwargs = {'max_trials': 3, **run_kwargs}

    experiment = CompeteExperiment(hyper_model, X_train, y_train, **init_kwargs)
    estimator = experiment.run(**run_kwargs)
    assert estimator

    preds = estimator.predict(X_test)
    proba = estimator.predict_proba(X_test)
    if with_dask:
        preds, proba = tb.to_local(preds, proba)

    score = tb.metrics.calc_score(y_test, preds, proba,
                                  metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'])
    print('evaluate score:', score)
    assert score
def setup_class(cls):
    """Prepare preprocessed bank data (pandas and cudf copies) and the working directory."""
    raw = dsutils.load_bank()
    raw = get_tool_box(raw).general_preprocessor(raw).fit_transform(raw)
    cls.bank_data = raw
    cls.bank_data_cudf = cudf.from_pandas(raw)
    os.makedirs(cls.work_dir)
def experiment_start(self, exp):
    """Render the experiment's input-data summary (shapes + task) and, best effort,
    a distribution plot of y at the start of an experiment run.
    """
    self.exp = exp
    self.steps = OrderedDict()
    self.running = True
    display_markdown('### Input Data', raw=True)

    X_train, y_train, X_test, X_eval, y_eval = \
        exp.X_train, exp.y_train, exp.X_test, exp.X_eval, exp.y_eval
    tb = get_tool_box(X_train, y_train, X_test, X_eval, y_eval)
    # For classification the task cell also shows the number of classes.
    display_data = (tb.get_shape(X_train),
                    tb.get_shape(y_train),
                    tb.get_shape(X_eval, allow_none=True),
                    tb.get_shape(y_eval, allow_none=True),
                    tb.get_shape(X_test, allow_none=True),
                    exp.task if exp.task == const.TASK_REGRESSION
                    else f'{exp.task}({tb.to_local(y_train.nunique())[0]})')
    # NOTE(review): display_id 'output_intput' looks like a typo of 'output_input',
    # but it is a runtime identifier so it is left unchanged.
    display(pd.DataFrame([display_data],
                         columns=['X_train.shape',
                                  'y_train.shape',
                                  'X_eval.shape',
                                  'y_eval.shape',
                                  'X_test.shape',
                                  'Task', ]), display_id='output_intput')
    # Plotting is best-effort only: seaborn/matplotlib/sklearn may be missing
    # or y may not be plottable, so any failure is deliberately swallowed.
    try:
        import seaborn as sns
        import matplotlib.pyplot as plt
        from sklearn.preprocessing import LabelEncoder
        if exp.task == const.TASK_REGRESSION:
            # Draw Plot
            plt.figure(figsize=(8, 4), dpi=80)
            sns.kdeplot(y_train.dropna(), shade=True, color="g", label="Proba",
                        alpha=.7, bw_adjust=0.01)
        else:
            # encode class labels to integers before plotting their histogram
            le = LabelEncoder()
            y = le.fit_transform(y_train.dropna())
            # Draw Plot
            plt.figure(figsize=(8, 4), dpi=80)
            sns.distplot(y, kde=False, color="g", label="y")
        # Decoration
        plt.title('Distribution of y', fontsize=22)
        plt.legend()
        plt.show()
    except:
        pass
def transform(self, X, y=None):
    """Apply the fitted featuretools feature definitions to X and return the
    generated feature matrix (y is accepted for pipeline compatibility).
    """
    # 1. check is fitted and values
    assert self.feature_defs_ is not None, 'Please fit it first.'
    # 2. fix input
    X, y = self._fix_input(X, y, for_fit=False)
    # 3. transform
    es = ft.EntitySet(id='es_hypernets_transform')
    feature_type_dict = self._get_feature_types(X)
    # an index column must be created when X doesn't already carry one
    make_index = self.ft_index not in X.columns.to_list()
    if _base.FT_V0:
        # legacy featuretools (<1.0) API
        es.entity_from_dataframe(entity_id='e_hypernets_ft',
                                 dataframe=X,
                                 variable_types=feature_type_dict,
                                 make_index=make_index,
                                 index=self.ft_index)
    else:
        # featuretools >=1.0: build the index column ourselves before add_dataframe
        if make_index:
            tb = get_tool_box(X)
            X = tb.reset_index(X)
            X[self.ft_index] = X.index
        es.add_dataframe(dataframe=X,
                         dataframe_name='e_hypernets_ft',
                         index=self.ft_index,
                         make_index=False,
                         logical_types=feature_type_dict)
    Xt = ft.calculate_feature_matrix(self.feature_defs_,
                                     entityset=es,
                                     n_jobs=1,
                                     verbose=False)
    # drop the synthetic index column from input and output again
    if make_index:
        X.pop(self.ft_index)
    if self.ft_index in Xt.columns.to_list():
        Xt.pop(self.ft_index)
    # optional dtype normalization of the generated features
    if self.categorical_as_object:
        cat_cols = column_category(Xt)
        if cat_cols:
            Xt[cat_cols] = Xt[cat_cols].astype('object')
    if self.bool_as_int:
        bool_cols = column_bool(Xt)
        if bool_cols:
            Xt[bool_cols] = Xt[bool_cols].astype('int')
    # +/-inf can appear from e.g. divide primitives; treat them as missing
    Xt = Xt.replace([np.inf, -np.inf], np.nan)
    if self.fix_feature_names:
        Xt = self._fix_transformed_feature_names(Xt)
    return Xt
def evaluate(self, X, y, metrics=None, **kwargs):
    """Score the model on (X, y) with the given metrics.

    Defaults to rmse for regression and accuracy otherwise. For classification,
    probabilities are computed first and converted to labels via proba2predict.
    """
    if metrics is None:
        metrics = ['rmse'] if self.task == const.TASK_REGRESSION else ['accuracy']

    if self.task == const.TASK_REGRESSION:
        proba = None
        preds = self.predict(X, **kwargs)
    else:
        proba = self.predict_proba(X, **kwargs)
        preds = self.proba2predict(proba,
                                   proba_threshold=kwargs.get('proba_threshold', 0.5))

    return get_tool_box(y).metrics.calc_score(y, preds, proba, metrics, self.task)
def setup_class(self):
    """Set up dask, load the glass dataset, split/persist the parts, and fit briefly."""
    setup_dask(self)

    print("Loading datasets...")
    data = dd.from_pandas(dsutils.load_glass_uci(), npartitions=2)
    self.y = data.pop(10).values
    self.X = data

    config = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, )
    self.dt = deeptable.DeepTable(config=config)

    parts = get_tool_box(data).train_test_split(self.X, self.y,
                                                test_size=0.2, random_state=42)
    # persist each split so downstream fit/evaluate reuse computed partitions
    self.X_train, self.X_test, self.y_train, self.y_test = (t.persist() for t in parts)
    self.model, self.history = self.dt.fit(self.X_train, self.y_train,
                                           batch_size=32, epochs=3)
def test_detect_estimator_xgboost(self):
    """XGBoost GPU estimator is detected as fully usable, including fit with cuml data."""
    pytest.importorskip('xgboost')
    detector = get_tool_box(cudf.DataFrame).estimator_detector(
        'xgboost.XGBClassifier',
        'binary',
        init_kwargs={'tree_method': 'gpu_hist', 'use_label_encoder': False},
    )
    assert detector() == {'installed', 'initialized', 'fitted', 'fitted_with_cuml'}
def test_feature_tools_transformer(self):
    """Numeric add/divide feature generation works on a dask frame."""
    df = dsutils.load_bank()
    df.drop(['id'], axis=1, inplace=True)
    y = df.pop('y')
    ddf = dd.from_pandas(df.head(100), npartitions=2)

    toolbox = get_tool_box(ddf)
    X_train, X_test = toolbox.train_test_split(ddf, test_size=0.2, random_state=42)

    generator = toolbox.transformers['FeatureGenerationTransformer'](
        task='binary', trans_primitives=['add_numeric', 'divide_numeric'])
    generator.fit(X_train)
    assert generator.transform(X_train) is not None
def proba2predict(proba, *, task=None, threshold=0.5, classes=None):
    """Convert probability output into predictions.

    Regression-shaped input (1-d or single column) is passed through unchanged;
    binary output uses `threshold` on the positive column; multiclass uses argmax.
    When `classes` is given, predicted indices are mapped to class labels.
    """
    assert len(proba.shape) <= 2
    if len(proba.shape) == 0:  # empty
        return proba

    from hypernets.tabular import get_tool_box

    def _single_column(arr):
        shape = arr.shape
        return len(shape) == 1 or (len(shape) == 2 and shape[1] == 1)

    if logger.is_info_enabled():
        logger.info(
            f'proba2predict with task={task}, classes={classes}, threshold={threshold}'
        )

    # normalize single-column binary probabilities into two-column form first
    if task == const.TASK_BINARY and _single_column(proba):
        proba = get_tool_box(proba).fix_binary_predict_proba_result(proba)

    if task == const.TASK_REGRESSION or _single_column(proba):
        # regression: nothing to convert
        return proba

    if proba.shape[-1] > 2:
        # multiclass
        pred = proba.argmax(axis=-1)
    else:
        # binary
        pred = (proba[:, -1] > threshold).astype(np.int32)

    if classes is not None:
        # map class indices back to the provided labels
        pred = get_tool_box(pred).take_array(np.array(classes), pred, axis=0)
    return pred
def test_latlong(self):
    """Geohash features are generated for latlong tuple columns on a dask frame."""
    pdf = pd.DataFrame()
    pdf['latitude'] = [51.52, 9.93, 37.38]
    pdf['longitude'] = [-0.17, 76.25, -122.08]
    pdf['latlong'] = pdf[['latitude', 'longitude']].apply(tuple, axis=1)
    pdf['latitude2'] = [51.22, 9.22, 37.22]
    pdf['longitude2'] = [-0.22, 76.22, -122.22]
    pdf['latlong2'] = pdf[['latitude2', 'longitude2']].apply(tuple, axis=1)
    ddf = dd.from_pandas(pdf, npartitions=1)

    transformer = get_tool_box(ddf).transformers['FeatureGenerationTransformer'](
        latlong_cols=['latlong', 'latlong2'])
    result = transformer.fit_transform(ddf)
    print(result.head(3))
    assert 'GEOHASH__latlong__' in result.columns.to_list()
def test_category_datetime_text(self):
    """Cross-categorical, tfidf and datetime features are generated together on dask."""
    pdf = dsutils.load_movielens()
    pdf['genres'] = pdf['genres'].apply(lambda s: s.replace('|', ' '))
    pdf['timestamp'] = pdf['timestamp'].apply(datetime.fromtimestamp)
    ddf = dd.from_pandas(pdf, npartitions=2)

    transformer = get_tool_box(ddf).transformers['FeatureGenerationTransformer'](
        task='binary', text_cols=['title'], categories_cols=['gender', 'genres'])
    out_columns = transformer.fit_transform(ddf).columns.to_list()

    assert 'CROSS_CATEGORICAL_gender__genres' in out_columns
    assert 'TFIDF__title____0__' in out_columns
    assert 'DAY__timestamp__' in out_columns