def __init__(
    self,
    propensity_learner: Optional[AutoML] = None,
    mean_outcome_learner: Optional[AutoML] = None,
    effect_learner: Optional[AutoML] = None,
    base_task: Optional[Task] = Task("binary"),
    timeout: Optional[int] = None,
    cpu_limit: int = 4,
    gpu_ids: Optional[str] = "all",
):
    """Validate the supplied learners and create default ones where none is given.

    Args:
        propensity_learner: AutoML model; when `None` a default
            `TabularAutoML` with a 'binary' task is created
            (a supplied model must have a 'binary' task).
        mean_outcome_learner: AutoML model; when `None` a default
            `TabularAutoML` is created from `base_task`.
        effect_learner: AutoML model; when `None` a default
            `TabularAutoML` with a 'reg' task is created
            (a supplied model must have a 'reg' task).
        base_task: Task used for the default mean outcome learner.
        timeout: Timeout. When no learner at all is supplied, every default
            learner receives one third of it.
        cpu_limit: CPU limit, forwarded to the parent initializer.
        gpu_ids: GPU IDs, forwarded to the parent initializer.

    Raises:
        RuntimeError: If a supplied learner has the wrong task, or if neither
            `mean_outcome_learner` nor `base_task` is given.

    """
    # Validation happens first, before any state is created.
    if propensity_learner is not None:
        if self._get_task(propensity_learner).name != "binary":
            raise RuntimeError("Task of 'propensity_learner' must be 'binary'")
    if mean_outcome_learner is None and base_task is None:
        raise RuntimeError("Must specify 'mean_outcome_learner' or base_task")
    if effect_learner is not None:
        if self._get_task(effect_learner).name != "reg":
            raise RuntimeError("Task of effect_learner must be 'reg'")

    super().__init__(base_task, timeout, cpu_limit, gpu_ids)

    self.propensity_learner: AutoML
    self.mean_outcome_learner: AutoML
    self.effect_learner: AutoML

    # The timeout is split only when all three learners are defaults.
    supplied = (propensity_learner, mean_outcome_learner, effect_learner)
    all_default = all(learner is None for learner in supplied)
    if all_default and timeout is not None:
        default_timeout = timeout / 3
    else:
        default_timeout = None

    if propensity_learner is not None:
        self.propensity_learner = propensity_learner
    else:
        self.propensity_learner = TabularAutoML(task=Task("binary"), timeout=default_timeout)

    if mean_outcome_learner is None:
        if base_task is not None:
            self.mean_outcome_learner = TabularAutoML(task=base_task, timeout=default_timeout)
    else:
        self.mean_outcome_learner = mean_outcome_learner
        # A user-supplied outcome learner dictates the base task.
        self.base_task = self._get_task(mean_outcome_learner)

    if effect_learner is not None:
        self.effect_learner = effect_learner
    else:
        self.effect_learner = TabularAutoML(task=Task("reg"), timeout=default_timeout)
def test_time_series_iterator_and_multiprocessed_inference():
    """Fit ``TabularAutoML`` with a ``TimeSeriesIterator`` and predict from a CSV file.

    The train set is passed as a dict of numpy arrays, the custom iterator is
    passed as ``cv_iter``, and the test set is scored from a file on disk with
    ``n_jobs=4``. The temporary CSV written for inference is removed once the
    prediction call returns or raises.
    """
    import os  # function-scope import: only needed to clean up the temporary CSV

    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
        np.dtype('timedelta64[D]'))).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    train, test = train_test_split(data, test_size=2000, random_state=42)

    # create time series iterator that is passed as cv_func
    cv_iter = TimeSeriesIterator(train['EMP_DATE'].astype(np.datetime64), n_splits=5, sorted_kfold=False)

    # train dataset may be passed as dict of np.ndarray
    train = {
        'data': train[['AMT_CREDIT', 'AMT_ANNUITY']].values,
        'target': train['TARGET'].values
    }

    task = Task('binary', )

    automl = TabularAutoML(
        task=task,
        timeout=200,
    )
    oof_pred = automl.fit_predict(train, train_features=['AMT_CREDIT', 'AMT_ANNUITY'], cv_iter=cv_iter)

    # prediction can be made on file by
    test_file = 'temp_test_data.csv'
    test.to_csv(test_file, index=False)
    try:
        test_pred = automl.predict(test_file, batch_size=100, n_jobs=4)
    finally:
        # Do not leave the temporary file behind, even when predict fails.
        if os.path.exists(test_file):
            os.remove(test_file)

    logging.debug('Check scores...')
    oof_prediction = oof_pred.data[:, 0]
    # Rows without an out-of-fold prediction are NaN and are excluded from the score.
    not_empty = np.logical_not(np.isnan(oof_prediction))
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train['target'][not_empty], oof_prediction[not_empty])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
def test_tabular_utilized_preset():
    """Fit ``TabularUtilizedAutoML`` on the sample data, score it and round-trip it through pickle.

    The pickle file is removed even if reloading or predicting fails.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                        ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    train, test = train_test_split(data, test_size=2000, random_state=42)

    roles = {'target': 'TARGET',
             DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
             }

    task = Task('binary', )

    automl = TabularUtilizedAutoML(task=task, timeout=600, )
    oof_pred = automl.fit_predict(train, roles=roles)
    test_pred = automl.predict(test)

    logging.debug('Check scores...')
    # use only not nan
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    # Score the first prediction column explicitly, as done for the test score below.
    logging.debug('OOF score: {}'.format(roc_auc_score(train['TARGET'].values[not_nan], oof_pred.data[not_nan][:, 0])))
    logging.debug('TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))

    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)

        logging.debug('TEST score, loaded: {}'.format(roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
    finally:
        # Clean up the artifact even when the round trip fails.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
def test_manual_pipeline():
    """Assemble an ``MLPipeline`` by hand, fit it on folds and check it survives pickling.

    Steps: load a subset of columns, build a ``PandasDataset`` with explicit
    roles, fit an importance-based feature selector, fit a two-model LightGBM
    pipeline, predict on the train set, then pickle/unpickle the pipeline and
    predict again. The pickle file is removed even if the round trip fails.
    """
    # Read data from file
    logging.debug('Read data from file')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv',
                       usecols=[
                           'TARGET', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
                           'NAME_TYPE_SUITE', 'AMT_GOODS_PRICE', 'DAYS_BIRTH',
                           'DAYS_EMPLOYED'
                       ])

    # Fix dates and convert to date type
    logging.debug('Fix dates and convert to date type')
    data['BIRTH_DATE'] = np.datetime64(
        '2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    data['EMP_DATE'] = np.datetime64('2018-01-01') + np.clip(
        data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Create folds
    logging.debug('Create folds')
    data['__fold__'] = np.random.randint(0, 5, len(data))

    # Print data head
    logging.debug('Print data head')
    print(data.head())

    # Set roles for columns
    logging.debug('Set roles for columns')
    check_roles = {
        TargetRole(): 'TARGET',
        CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
        NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
        DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
        FoldsRole(): '__fold__'
    }

    # create Task
    task = Task('binary')

    # Creating PandasDataSet
    logging.debug('Creating PandasDataset')
    start_time = time.time()
    pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)
    logging.debug(
        'PandasDataset created. Time = {:.3f} sec'.format(time.time() - start_time))

    # Print pandas dataset feature roles
    logging.debug('Print pandas dataset feature roles')
    roles = pd_dataset.roles
    for role in roles:
        logging.debug('{}: {}'.format(role, roles[role]))

    # Feature selection part
    logging.debug('Feature selection part')
    selector_iterator = FoldsIterator(pd_dataset, 1)
    logging.debug('Selection iterator created')

    # Only the features pipeline is needed here; the selector gets its own model below.
    pipe = LGBSimpleFeatures()
    logging.debug('Pipe and model created')

    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 0,
        'num_threads': 5
    })

    mbie = ModelBasedImportanceEstimator()
    selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10)
    start_time = time.time()
    selector.fit(selector_iterator)
    logging.debug(
        'Feature selector fitted. Time = {:.3f} sec'.format(time.time() - start_time))

    logging.debug('Feature selector scores:')
    logging.debug('\n{}'.format(selector.get_features_score()))

    # Build AutoML pipeline
    logging.debug('Start building AutoML pipeline')
    pipe = LGBSimpleFeatures()
    logging.debug('Pipe created')

    params_tuner1 = OptunaTuner(n_trials=10, timeout=300)
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128
    })
    logging.debug('Tuner1 and model1 created')

    params_tuner2 = OptunaTuner(n_trials=100, timeout=300)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64
    })
    logging.debug('Tuner2 and model2 created')

    total = MLPipeline([(model1, params_tuner1), (model2, params_tuner2)],
                       pre_selection=selector,
                       features_pipeline=pipe,
                       post_selection=None)

    logging.debug('Finished building AutoML pipeline')

    # Create full train iterator
    logging.debug('Full train valid iterator creation')
    train_valid = FoldsIterator(pd_dataset)
    logging.debug('Full train valid iterator created')

    # Fit predict using pipeline
    logging.debug('Start AutoML pipeline fit_predict')
    start_time = time.time()
    pred = total.fit_predict(train_valid)
    logging.debug(
        'Fit_predict finished. Time = {:.3f} sec'.format(time.time() - start_time))

    # Check preds
    logging.debug('Preds:')
    logging.debug('\n{}'.format(pred))
    logging.debug('Preds.shape = {}'.format(pred.shape))

    # Predict full train dataset
    logging.debug('Predict full train dataset')
    start_time = time.time()
    train_pred = total.predict(pd_dataset)
    logging.debug('Predict finished. Time = {:.3f} sec'.format(time.time() - start_time))
    logging.debug('Preds:')
    logging.debug('\n{}'.format(train_pred))
    logging.debug('Preds.shape = {}'.format(train_pred.shape))

    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(total, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            total = pickle.load(f)

        logging.debug('Predict loaded automl')
        train_pred = total.predict(pd_dataset)
    finally:
        # Clean up the artifact even when the round trip fails.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)

    # Check preds feature names
    logging.debug('Preds features: {}'.format(train_pred.features))

    # Check model feature scores
    logging.debug('Feature scores for model_1:\n{}'.format(
        model1.get_features_score()))
    logging.debug('Feature scores for model_2:\n{}'.format(
        model2.get_features_score()))
data["report_dt"] = np.datetime64("2018-01-01") data["constant"] = 1 data["allnan"] = np.nan data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) train, test = train_test_split(data, test_size=2000, random_state=42) roles = { "target": "TARGET", DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt", } task = Task("binary", ) automl = TabularAutoML( task=task, timeout=600, general_params={ "use_algos": [ [ "linear_l2", "lgb", ], ["linear_l2", "lgb"], ], "nested_cv": True, "skip_conn": True, },
def test_boostlgbm_and_linearlbfgs_in_one_automl_pipeline():
    """Build a two-level ``AutoML`` from LightGBM and linear pipelines, fit, score and pickle it.

    Level one holds a tuned LightGBM pipeline, an LBFGS linear pipeline and an
    L1 linear pipeline with an iterative permutation-importance selector;
    level two is an LBFGS linear pipeline. Predictions are blended with
    ``MeanBlender``. The pickle file is removed even if the round trip fails.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                        ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    logging.debug('Data splitted. Parts sizes: train_data = {}, test_data = {}'
                  .format(train.shape, test.shape))

    logging.debug('Start creation selector_0...')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM()
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0)
    logging.debug('End creation selector_0...')

    logging.debug('Start creation gbm_0...')
    feats_gbm_0 = LGBAdvancedPipeline()
    gbm_0 = BoostLGBM()
    gbm_1 = BoostLGBM()
    tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True)
    # Only gbm_0 is paired with a tuner; gbm_1 is passed without one.
    gbm_lvl0 = MLPipeline([
        (gbm_0, tuner_0),
        gbm_1
    ],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0, post_selection=None)
    logging.debug('End creation gbm_0...')

    logging.debug('Start creation reg_0...')
    feats_reg_0 = LinearFeatures(output_categories=True)
    reg_0 = LinearLBFGS()
    reg_lvl0 = MLPipeline([
        reg_0
    ],
        pre_selection=None,
        features_pipeline=feats_reg_0,
        post_selection=HighCorrRemoval(corr_co=1))
    logging.debug('End creation reg_0...')

    logging.debug('Start creation composed selector...')
    feat_sel_1 = LGBSimpleFeatures()
    mod_sel_1 = BoostLGBM()
    imp_sel_1 = NpPermutationImportanceEstimator()
    selector_1 = NpIterativeFeatureSelector(feat_sel_1, mod_sel_1, imp_sel_1, feature_group_size=1)
    logging.debug('End creation composed selector...')

    logging.debug('Start creation reg_l1_0...')
    feats_reg_1 = LinearFeatures(output_categories=False)
    reg_1 = LinearL1CD()
    reg_l1_lvl0 = MLPipeline([
        reg_1
    ],
        pre_selection=selector_1,
        features_pipeline=feats_reg_1,
        post_selection=HighCorrRemoval())
    logging.debug('End creation reg_l1_0...')

    logging.debug('Start creation blending...')
    feats_reg_2 = LinearFeatures(output_categories=True)
    reg_2 = LinearLBFGS()
    reg_lvl1 = MLPipeline([
        reg_2
    ],
        pre_selection=None,
        features_pipeline=feats_reg_2,
        post_selection=HighCorrRemoval(corr_co=1))
    logging.debug('End creation blending...')

    logging.debug('Start creation automl...')
    reader = PandasToPandasReader(Task('binary', ), samples=None, max_nan_rate=1, max_constant_rate=1)

    automl = AutoML(reader, [
        [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
        [reg_lvl1],
    ], skip_conn=False, blender=MeanBlender())
    logging.debug('End creation automl...')

    logging.debug('Start fit automl...')
    roles = {'target': 'TARGET',
             DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
             }

    oof_pred = automl.fit_predict(train, roles=roles)
    logging.debug('End fit automl...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'
                  .format(test_pred, test_pred.shape))

    # Keep only rows that received an out-of-fold prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    print('OOF score: {}'.format(roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))
    print('TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))

    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)

        logging.debug('TEST score, loaded: {}'.format(roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
    finally:
        # Clean up the artifact even when the round trip fails.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
print("Create linear...") feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe="auto") timer_reg = timer.get_task_timer("reg") reg_0 = LinearLBFGS(timer=timer_reg) reg_lvl0 = MLPipeline([reg_0], pre_selection=None, features_pipeline=feats_reg_0, post_selection=None) print("Linear created...") # ====================================================================================== print("Create reader...") reader = PandasToPandasReader( Task( "multiclass", metric="crossentropy", # metric_params = {'multi_class': 'ovr'} ), samples=None, max_nan_rate=1, max_constant_rate=1, advanced_roles=True, drop_score_co=-1, n_jobs=1, ) print("Reader created...") # ====================================================================================== print("Create blender...") blender = WeightedBlender() print("Blender created...") # ====================================================================================== print("Create AutoML...")
print("End creation reg_l1_0...") print("Start creation blending...") feats_reg_2 = LinearFeatures(output_categories=True) reg_2 = LinearLBFGS() reg_lvl1 = MLPipeline( [reg_2], pre_selection=None, features_pipeline=feats_reg_2, post_selection=HighCorrRemoval(corr_co=1), ) print("End creation blending...") print("Start creation automl...") reader = PandasToPandasReader( Task("binary", ), samples=None, max_nan_rate=1, max_constant_rate=1, ) automl = AutoML( reader, [ [gbm_lvl0, reg_lvl0, reg_l1_lvl0], [reg_lvl1], ], skip_conn=False, blender=MeanBlender(), ) print("End creation automl...")
def test_permutation_importance_based_iterative_selector():
    """Fit a two-level ``AutoML`` whose first level uses an iterative permutation-importance selector.

    The first level holds two LightGBM models (the second one tuned with
    Optuna) behind an ``NpIterativeFeatureSelector``; the second level is a
    single LightGBM model with frozen defaults. The fitted AutoML is scored,
    pickled, reloaded and scored again. The pickle file is removed even if the
    round trip fails.
    """
    logging.basicConfig(format="[%(asctime)s] (%(levelname)s): %(message)s", level=logging.DEBUG)

    logging.debug("Load data...")
    data = pd.read_csv("./examples/data/sampled_app_train.csv")
    logging.debug("Data loaded")

    logging.debug("Features modification from user side...")
    data["BIRTH_DATE"] = (
        np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))).astype(str)
    data["EMP_DATE"] = (np.datetime64("2018-01-01") + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(
        np.dtype("timedelta64[D]"))).astype(str)

    data["constant"] = 1
    data["allnan"] = np.nan

    data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)
    logging.debug("Features modification finished")

    logging.debug("Split data...")
    train_data, test_data = train_test_split(data, test_size=2000, stratify=data["TARGET"], random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        "Data splitted. Parts sizes: train_data = {}, test_data = {}".format(
            train_data.shape, test_data.shape))

    logging.debug("Create task...")
    task = Task("binary")
    logging.debug("Task created")

    logging.debug("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug("Reader created")

    # selector parts
    logging.debug("Create feature selector")
    model0 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 64,
        "seed": 42,
        "num_threads": 5,
    })
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0, model0, pie, feature_group_size=1, max_features_cnt_in_result=15)
    logging.debug("Feature selector created")

    # pipeline 1 level parts
    logging.debug("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner1 and Model1...")
    model1 = BoostLGBM(default_params={
        "learning_rate": 0.05,
        "num_leaves": 128,
        "seed": 1,
        "num_threads": 5,
    })
    logging.debug("\t Tuner1 and model1 created")

    logging.debug("\t ParamsTuner2 and Model2...")
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(default_params={
        "learning_rate": 0.025,
        "num_leaves": 64,
        "seed": 2,
        "num_threads": 5,
    })
    logging.debug("\t Tuner2 and model2 created")

    logging.debug("\t Pipeline1...")
    # model1 is passed without a tuner; only model2 is paired with one.
    pipeline_lvl1 = MLPipeline(
        [model1, (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    logging.debug("Pipeline1 created")

    # pipeline 2 level parts
    logging.debug("Start creation pipeline_2...")
    pipe1 = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner and Model...")
    model = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "max_bin": 1024,
            "seed": 3,
            "num_threads": 5,
        },
        freeze_defaults=True,
    )
    logging.debug("\t Tuner and model created")

    logging.debug("\t Pipeline2...")
    pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)
    logging.debug("Pipeline2 created")

    logging.debug("Create AutoML pipeline...")
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
            [pipeline_lvl2],
        ],
        skip_conn=False,
    )
    logging.debug("AutoML pipeline created...")

    logging.debug("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": "TARGET"})
    logging.debug(
        "AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(
            time.time() - start_time))

    logging.debug("Feature importances of selector:\n{}".format(
        selector.get_features_score()))

    logging.debug("oof_pred:\n{}\nShape = {}".format(oof_pred, oof_pred.shape))

    logging.debug("Feature importances of top level algorithm:\n{}".format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))

    logging.debug(
        "Feature importances of lowest level algorithm - model 0:\n{}".format(
            automl.levels[0][0].ml_algos[0].get_features_score()))

    logging.debug(
        "Feature importances of lowest level algorithm - model 1:\n{}".format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug("Prediction for test data:\n{}\nShape = {}".format(
        test_pred, test_pred.shape))

    logging.debug("Check scores...")
    logging.debug("OOF score: {}".format(
        roc_auc_score(train_data["TARGET"].values, oof_pred.data[:, 0])))
    logging.debug("TEST score: {}".format(
        roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])))

    pickle_path = "automl.pickle"
    try:
        logging.debug("Pickle automl")
        with open(pickle_path, "wb") as f:
            pickle.dump(automl, f)

        logging.debug("Load pickled automl")
        with open(pickle_path, "rb") as f:
            automl = pickle.load(f)

        logging.debug("Predict loaded automl")
        test_pred = automl.predict(test_data)

        logging.debug("TEST score, loaded: {}".format(
            roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])))
    finally:
        # Clean up the artifact even when the round trip fails.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
{ "name": "reg", "loss": "quantile", "loss_params": { "q": 0.9 }, "metric": "quantile", "metric_params": { "q": 0.9 }, }, ], ["TARGET", "TARGET", "AMT_CREDIT", "AMT_CREDIT", "AMT_CREDIT"], ): print("Create task..") task = Task(**task_params) print("Task created") print("Create reader...") reader = PandasToPandasReader(task, cv=5, random_state=1) print("Reader created") # pipeline 1 level parts print("Start creation pipeline_1...") pipe = LGBSimpleFeatures() print("\t ParamsTuner2 and Model2...") model2 = BoostLGBM(default_params={ "learning_rate": 0.025, "num_leaves": 64, "seed": 2,
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task

# Demo script: fit TabularNLPAutoML on a regression target with text columns,
# then report the mean squared error on out-of-fold and held-out predictions.

np.random.seed(42)

data = pd.read_csv("./data/avito1k_train.csv")

# 500 rows are held out for the test score.
train, test = train_test_split(data, test_size=500, random_state=42)

# Column roles: regression target, grouping column and the text columns.
roles = {
    "target": "deal_probability",
    "group": ["user_id"],
    "text": ["description", "title", "param_1", "param_2", "param_3"],
}

task = Task("reg")

automl = TabularNLPAutoML(task=task, timeout=600)
oof_pred = automl.fit_predict(train, roles=roles)
test_pred = automl.predict(test)

# Rows whose out-of-fold prediction is entirely NaN are excluded from the OOF score.
not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

print("Check scores...")
print("OOF score: {}".format(
    mean_squared_error(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0])))
print("TEST score: {}".format(
    mean_squared_error(test[roles["target"]].values, test_pred.data[:, 0])))

# Remove the "./models" directory; a missing directory is not an error.
# NOTE(review): presumably created during fitting - confirm what writes it.
shutil.rmtree("./models", ignore_errors=True)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

# Quick-start script: fit TabularAutoML on a binary target and print ROC AUC
# for the out-of-fold and hold-out predictions.

# load and prepare data
data = pd.read_csv("./data/sampled_app_train.csv")
# 20% hold-out, stratified by the target column.
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data["TARGET"], random_state=42)

# run automl
automl = TabularAutoML(task=Task("binary"))
# "SK_ID_CURR" is listed under the "drop" role.
oof_predictions = automl.fit_predict(train_data, roles={
    "target": "TARGET",
    "drop": ["SK_ID_CURR"]
})
te_pred = automl.predict(test_data)

# calculate scores
print(
    f"Score for out-of-fold predictions: {roc_auc_score(train_data['TARGET'].values, oof_predictions.data[:, 0])}"
)
print(
    f"Score for hold-out: {roc_auc_score(test_data['TARGET'].values, te_pred.data[:, 0])}"
)
def __init__(
    self,
    outcome_learners: Optional[Sequence[AutoML]] = None,
    effect_learners: Optional[Sequence[AutoML]] = None,
    propensity_learner: Optional[AutoML] = None,
    base_task: Optional[Task] = None,
    timeout: Optional[int] = None,
    cpu_limit: int = 4,
    gpu_ids: Optional[str] = "all",
):
    """Store the outcome, effect and propensity learners, creating defaults where none is given.

    Args:
        outcome_learners: Models predict `outcome` (base task) for each group (treatment/control),
            base task can be classification or regression task.
            It can be: two models, one model or nothing.
            If there is one model, it is used for control and a deep copy of it for treatment.
            If `None` (or empty) then default models are used.
        effect_learners: Models predict treatment effect. (task must be 'reg')
            It can be: two models, one model or nothing.
            If there is one model, it is used for control and a deep copy of it for treatment.
            If `None` (or empty) then default models are used.
        propensity_learner: Model predicts treatment group membership,
            If `None` then a default model with a 'binary' task is used.
        base_task: Task - 'binary' or 'reg'. Ignored when `outcome_learners` is
            given: the task of the first outcome learner is used instead.
        timeout: Timeout, forwarded to the parent initializer.
        cpu_limit: CPU limit, forwarded to the parent initializer.
        gpu_ids: GPU IDs, forwarded to the parent initializer.

    Raises:
        RuntimeError: If neither outcome learners nor `base_task` is given, or
            if more than two outcome or effect learners are passed.

    """
    has_outcome_learners = outcome_learners is not None and len(outcome_learners) > 0

    if not has_outcome_learners and base_task is None:
        raise RuntimeError('Must specify any of learners or "base_task"')

    if has_outcome_learners:
        # The task of the first supplied outcome learner overrides `base_task`.
        base_task = self._get_task(outcome_learners[0])

    # Initialise the parent exactly once, with the complete set of settings.
    # A preliminary call that passed only the task would run the parent
    # initializer with default timeout / CPU / GPU values for no benefit.
    super().__init__(base_task, timeout, cpu_limit, gpu_ids)

    self.learners: Dict[str, Union[Dict[str, AutoML], AutoML]] = {
        "outcome": {},
        "effect": {},
    }

    if propensity_learner is None:
        self.learners["propensity"] = self._get_default_learner(Task("binary"))
    else:
        self.learners["propensity"] = propensity_learner

    if not has_outcome_learners:
        self.learners["outcome"]["control"] = self._get_default_learner(self.base_task)
        self.learners["outcome"]["treatment"] = self._get_default_learner(self.base_task)
    elif len(outcome_learners) == 1:
        # One model serves both groups; treatment gets an independent copy.
        self.learners["outcome"]["control"] = outcome_learners[0]
        self.learners["outcome"]["treatment"] = copy.deepcopy(outcome_learners[0])
    elif len(outcome_learners) == 2:
        self.learners["outcome"]["control"] = outcome_learners[0]
        self.learners["outcome"]["treatment"] = outcome_learners[1]
    else:
        raise RuntimeError('The number of "outcome_learners" must be 0/1/2')

    if effect_learners is None or len(effect_learners) == 0:
        self.learners["effect"]["control"] = self._get_default_learner(Task("reg"))
        self.learners["effect"]["treatment"] = self._get_default_learner(Task("reg"))
    elif len(effect_learners) == 1:
        # One model serves both groups; treatment gets an independent copy.
        self.learners["effect"]["control"] = effect_learners[0]
        self.learners["effect"]["treatment"] = copy.deepcopy(effect_learners[0])
    elif len(effect_learners) == 2:
        self.learners["effect"]["control"] = effect_learners[0]
        self.learners["effect"]["treatment"] = effect_learners[1]
    else:
        raise RuntimeError('The number of "effect_learners" must be 0/1/2')
# Script fragment: load train/test CSVs, label-encode the target and fit
# TabularUtilizedAutoML on a multiclass task.

# Time budget passed to the AutoML `timeout` argument (presumably seconds, i.e. 4 hours).
TIMEOUT = 3600 * 4

train = pd.read_csv("train.csv", header=0)
test = pd.read_csv("test.csv", header=0)

train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)

# Every column except the last two is taken as a feature.
X = train.iloc[:,:-2]
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# NOTE(review): the target is encoded only after `y` and the split above were
# taken, so those presumably still hold the raw labels - confirm this is intended.
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

automl = TabularUtilizedAutoML(task = Task('multiclass'),
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               verbose=0,
                               reader_params = {'n_jobs': N_THREADS}
                               )

target_column = 'target'

# NOTE(review): 'id' was already dropped from `train` above, yet it is still
# listed under the 'drop' role - verify the reader tolerates a missing column.
roles = {
    'target': target_column,
    'drop': ['id']
}

lightml_pred = automl.fit_predict(train, roles = roles)
#print('lightml_pred:\n{}\nShape = {}'.format(lightml_pred[:10], lightml_pred.shape))
def binary_task():
    """Return a fresh ``Task`` configured for binary classification."""
    task = Task("binary")
    return task
def test_permutation_importance_based_iterative_selector():
    """Fit a two-level ``AutoML`` whose first level uses an iterative permutation-importance selector.

    The first level holds two LightGBM models (the second one tuned with
    Optuna) behind an ``NpIterativeFeatureSelector``; the second level is a
    single LightGBM model with frozen defaults. The fitted AutoML is scored,
    pickled, reloaded and scored again. The pickle file is removed even if the
    round trip fails.
    """
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
        np.dtype('timedelta64[D]'))).astype(str)

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data, test_size=2000, stratify=data['TARGET'], random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    logging.debug('Create task...')
    task = Task('binary')
    logging.debug('Task created')

    logging.debug('Create reader...')
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug('Reader created')

    # selector parts
    logging.debug('Create feature selector')
    model0 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': 5
    })
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0, model0, pie, feature_group_size=1, max_features_cnt_in_result=15)
    logging.debug('Feature selector created')

    # pipeline 1 level parts
    logging.debug('Start creation pipeline_1...')
    pipe = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner1 and Model1...')
    model1 = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': 5
    })
    logging.debug('\t Tuner1 and model1 created')

    logging.debug('\t ParamsTuner2 and Model2...')
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': 5
    })
    logging.debug('\t Tuner2 and model2 created')

    logging.debug('\t Pipeline1...')
    # model1 is passed without a tuner; only model2 is paired with one.
    pipeline_lvl1 = MLPipeline([model1, (model2, params_tuner2)],
                               pre_selection=selector,
                               features_pipeline=pipe,
                               post_selection=None)
    logging.debug('Pipeline1 created')

    # pipeline 2 level parts
    logging.debug('Start creation pipeline_2...')
    pipe1 = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner and Model...')
    model = BoostLGBM(default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_bin': 1024,
        'seed': 3,
        'num_threads': 5
    }, freeze_defaults=True)
    logging.debug('\t Tuner and model created')

    logging.debug('\t Pipeline2...')
    pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)
    logging.debug('Pipeline2 created')

    logging.debug('Create AutoML pipeline...')
    automl = AutoML(reader, [
        [pipeline_lvl1],
        [pipeline_lvl2],
    ], skip_conn=False)
    logging.debug('AutoML pipeline created...')

    logging.debug('Start AutoML pipeline fit_predict...')
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={'target': 'TARGET'})
    logging.debug(
        'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
            time.time() - start_time))

    logging.debug('Feature importances of selector:\n{}'.format(
        selector.get_features_score()))

    logging.debug('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

    logging.debug('Feature importances of top level algorithm:\n{}'.format(
        automl.levels[-1][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 0:\n{}'.format(
            automl.levels[0][0].ml_algos[0].get_features_score()))

    logging.debug(
        'Feature importances of lowest level algorithm - model 1:\n{}'.format(
            automl.levels[0][0].ml_algos[1].get_features_score()))

    test_pred = automl.predict(test_data)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(
        roc_auc_score(train_data['TARGET'].values, oof_pred.data[:, 0])))
    logging.debug('TEST score: {}'.format(
        roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))

    pickle_path = 'automl.pickle'
    try:
        logging.debug('Pickle automl')
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)

        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))
    finally:
        # Clean up the artifact even when the round trip fails.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
def run(dataset, config):
    """Train ``TabularUtilizedAutoML`` on ``dataset`` and report its test predictions.

    For classification the raw predictions are treated as class probabilities
    (a binary task is expanded to two columns), the predicted column index is
    taken with ``argmax`` and, when a class mapping is available, translated
    back to the original labels. For other problem types the raw predictions
    are returned and no probabilities are reported.
    """
    log.info(f"\n**** lightautoml (R) [{__version__}] ****\n")

    for ignored_category in (FutureWarning, DeprecationWarning):
        warnings.simplefilter(action='ignore', category=ignored_category)

    is_classification = config.type == 'classification'

    label = dataset.target.name
    df_train = dataset.train.data

    # The memory limit is handed over in gigabytes.
    max_mem_size_gb = float(config.max_mem_size_mb) / 1024

    # The benchmark calls it 'regression'; the Task name used here is 'reg'.
    if dataset.problem_type != 'regression':
        task_name = dataset.problem_type
    else:
        task_name = 'reg'
    task = Task(task_name)

    automl = TabularUtilizedAutoML(
        task=task,
        timeout=config.max_runtime_seconds,
        cpu_limit=config.cores,
        memory_limit=max_mem_size_gb,
        random_state=config.seed,
    )

    log.info("Training...")
    with Timer() as training:
        automl.fit_predict(train_data=df_train, roles={'target': label})

    X_test = dataset.test.X
    y_test = dataset.test.y  # read as in the original flow; not used below

    log.info("Predicting on the test set...")
    with Timer() as predict:
        preds = automl.predict(X_test).data

    probabilities_labels = None
    if not is_classification:
        probabilities = None
        predictions = preds
    else:
        probabilities = preds
        if dataset.problem_type == 'binary':
            # Expand the single probability column to [P(class 0), P(class 1)].
            positive = probabilities[:, 0]
            probabilities = np.vstack([1 - positive, positive]).T

        predictions = np.argmax(probabilities, axis=1)

        first_algo = automl.outer_pipes[0].ml_algos[0]
        class_map = first_algo.models[0][0].reader.class_mapping
        if class_map is None and df_train[label].dtype == bool:
            class_map = {False: 0, True: 1}

        if class_map:
            # Invert label -> column index into column index -> label.
            column_to_class = {}
            for class_, col in class_map.items():
                column_to_class[col] = class_
            predictions = [column_to_class.get(col) for col in predictions]
            probabilities_labels = [column_to_class[col] for col in sorted(column_to_class)]

    log.debug(probabilities)
    log.debug(config.output_predictions_file)

    save_artifacts(automl, config)

    return result(
        output_file=config.output_predictions_file,
        probabilities_labels=probabilities_labels,
        probabilities=probabilities,
        predictions=predictions,
        training_duration=training.duration,
        predict_duration=predict.duration,
    )
def run(dataset, config):
    """Fit LightAutoML on the benchmark training data and report test predictions.

    Train and test arrays are wrapped into DataFrames using the column names
    and dtypes listed in ``dataset.columns``; the encoded training target is
    written into the target column before fitting. The fitted
    ``TabularUtilizedAutoML`` then predicts on the test frame with the target
    column dropped.

    For classification the raw output is treated as class probabilities (a
    binary output column ``p`` is expanded to ``[1 - p, p]``) and the predicted
    class is the arg-max column index. For every other task type the raw
    output is passed through as predictions.

    Returns:
        Whatever ``result(...)`` produces for the collected predictions,
        probabilities, encoded truth and the two measured durations.
    """
    log.info(f"\n**** lightautoml (R) [{__version__}] ****\n")
    save_metadata(config, version=__version__)

    for ignored_category in (FutureWarning, DeprecationWarning):
        warnings.simplefilter(action='ignore', category=ignored_category)

    is_classification = config.type == 'classification'

    encoded_train_target, encoded_test_target = (
        dataset.train.y_enc,
        dataset.test.y_enc,
    )
    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    target_name = dataset.target.name

    def as_typed_frame(raw):
        # Wrap a raw data block into a DataFrame with the declared dtypes.
        frame = pd.DataFrame(raw, columns=column_names)
        return frame.astype(column_types, copy=False)

    train_frame = as_typed_frame(dataset.train.data)
    train_frame[dataset.target.name] = encoded_train_target

    memory_limit_gb = float(config.max_mem_size_mb) / 1024

    # LightAutoML names the regression task 'reg'; other names pass through.
    task = Task(
        'reg' if dataset.problem_type == 'regression' else dataset.problem_type
    )
    automl = TabularUtilizedAutoML(
        task=task,
        timeout=config.max_runtime_seconds,
        cpu_limit=config.cores,
        memory_limit=memory_limit_gb,
        random_state=config.seed,
    )

    log.info("Training...")
    with utils.Timer() as train_timer:
        automl.fit_predict(train_data=train_frame, roles={'target': target_name})

    test_frame = as_typed_frame(dataset.test.data)
    test_features = test_frame.drop(columns=target_name)

    log.info("Predicting on the test set...")
    with utils.Timer() as predict_timer:
        raw_output = automl.predict(test_features).data

    if not is_classification:
        probabilities = None
        predictions = raw_output
    else:
        probabilities = raw_output
        if dataset.problem_type == 'binary':
            # Single output column -> two-column [1 - p, p] layout.
            first_column = probabilities[:, 0]
            probabilities = np.vstack([1 - first_column, first_column]).T
        predictions = np.argmax(probabilities, axis=1)

    log.debug(probabilities)
    log.debug(config.output_predictions_file)

    save_artifacts(automl, config)

    return result(
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=encoded_test_target,
        target_is_encoded=is_classification,
        training_duration=train_timer.duration,
        predict_duration=predict_timer.duration,
    )
def test_multiclass_task_with_catboost():
    """Smoke-test a multiclass AutoML pipeline built from CatBoost and linear models.

    The sample application data gets a few engineered columns (date strings,
    a constant column, an all-NaN column), and roughly half of the rows are
    relabelled as class ``2`` so that ``TARGET`` has a third class. A single
    level containing a CatBoost pipeline (with importance-based pre-selection
    and one Optuna-tuned model) and a linear pipeline is blended with
    ``WeightedBlender``, fitted, and then scored with log-loss and per-class
    one-vs-rest ROC AUC on both out-of-fold and hold-out predictions.

    The test only logs the scores; it passes as long as nothing raises.
    """
    # Seed NumPy's global RNG so the randomly relabelled target (and therefore
    # every logged score) is the same on every run.
    np.random.seed(42)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01')
        + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    ).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before the date shift.
    data['EMP_DATE'] = (
        np.datetime64('2018-01-01')
        + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
            np.dtype('timedelta64[D]'))
    ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Overwrite about half of the labels with class 2 to get a third class.
    data['TARGET'] = np.where(
        np.random.rand(data.shape[0]) > .5, 2, data['TARGET'].values)

    train, test = train_test_split(data, test_size=2000, random_state=42)

    # ======================================================================================
    logging.debug('Create timer...')
    timer = PipelineTimer(600, mode=2)
    logging.debug('Timer created...')

    # ======================================================================================
    logging.debug('Create selector...')
    timer_gbm = timer.get_task_timer('gbm')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostCB(timer=timer_gbm)
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(
        feat_sel_0,
        mod_sel_0,
        imp_sel_0,
        cutoff=0,
    )
    logging.debug('Selector created...')

    # ======================================================================================
    logging.debug('Create gbms...')
    feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, feats_imp=imp_sel_0)
    timer_gbm_0 = timer.get_task_timer('gbm')
    timer_gbm_1 = timer.get_task_timer('gbm')

    gbm_0 = BoostCB(timer=timer_gbm_0, default_params={"devices": "0"})
    gbm_1 = BoostCB(timer=timer_gbm_1, default_params={"devices": "0"})

    tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True)
    # Only gbm_0 is paired with a tuner; gbm_1 is passed without one.
    gbm_lvl0 = MLPipeline([(gbm_0, tuner_0), gbm_1],
                          pre_selection=selector_0,
                          features_pipeline=feats_gbm_0,
                          post_selection=None)
    logging.debug('Gbms created...')

    # ======================================================================================
    logging.debug('Create linear...')
    feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe='auto')
    timer_reg = timer.get_task_timer('reg')
    reg_0 = LinearLBFGS(timer=timer_reg)
    reg_lvl0 = MLPipeline([reg_0],
                          pre_selection=None,
                          features_pipeline=feats_reg_0,
                          post_selection=None)
    logging.debug('Linear created...')

    # ======================================================================================
    logging.debug('Create reader...')
    reader = PandasToPandasReader(Task(
        'multiclass',
        metric='crossentropy',
    ),
                                  samples=None,
                                  max_nan_rate=1,
                                  max_constant_rate=1,
                                  advanced_roles=True,
                                  drop_score_co=-1,
                                  n_jobs=1)
    logging.debug('Reader created...')

    # ======================================================================================
    logging.debug('Create blender...')
    blender = WeightedBlender()
    logging.debug('Blender created...')

    # ======================================================================================
    logging.debug('Create AutoML...')
    automl = AutoML(reader=reader,
                    levels=[[gbm_lvl0, reg_lvl0]],
                    timer=timer,
                    blender=blender,
                    skip_conn=False)
    logging.debug('AutoML created...')

    # ======================================================================================
    logging.debug('Fit predict...')
    oof_pred = automl.fit_predict(train, roles={'target': "TARGET"})
    logging.debug('Finished fitting...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
        test_pred, test_pred.shape))

    # ======================================================================================
    logging.debug('Check scores...')
    # use only not nan: keep OOF rows that have at least one non-NaN prediction
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('OOF score: {}'.format(
        log_loss(train['TARGET'].values[not_nan], oof_pred.data[not_nan])))
    logging.debug('TEST score: {}'.format(
        log_loss(test['TARGET'].values, test_pred.data)))

    # ======================================================================================
    # One-vs-rest ROC AUC for each of the three classes, on train (OOF) and test.
    for dat, df, name in zip([oof_pred, test_pred], [train, test],
                             ['train', 'test']):
        logging.debug('Check aucs {0}...'.format(name))
        for c in range(3):
            _sc = roc_auc_score((df['TARGET'].values == c).astype(np.float32),
                                dat.data[:, c])
            logging.debug('Cl {0} auc score: {1}'.format(c, _sc))
# Load the sample data and hold out 20% of it, stratified on the target.
data = pd.read_csv("./data/sampled_app_train.csv")
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data["TARGET"],
    random_state=42,
)


def sample(optimization_search_space, trial, suggested_params):
    """Draw one set of LightGBM parameters with a conditional search space.

    ``feature_fraction`` is sampled uniformly from [0.5, 1.0]. The range used
    for ``min_sum_hessian_in_leaf`` depends on that draw: [0.5, 1] when
    ``feature_fraction`` exceeds 0.7, otherwise [0, 0.5].

    Args:
        optimization_search_space: Not used by this sampler.
        trial: Object providing ``suggest_uniform`` for drawing values.
        suggested_params: Baseline parameters; copied (shallowly), not mutated.

    Returns:
        A copy of ``suggested_params`` with the two sampled values set.
    """
    params = copy.copy(suggested_params)

    feature_fraction = trial.suggest_uniform("feature_fraction", low=0.5, high=1.0)
    params["feature_fraction"] = feature_fraction

    # The hessian bounds are conditional on the feature fraction just drawn.
    hessian_low, hessian_high = (0.5, 1) if feature_fraction > 0.7 else (0, 0.5)
    params["min_sum_hessian_in_leaf"] = trial.suggest_uniform(
        "min_sum_hessian_in_leaf", low=hessian_low, high=hessian_high
    )

    return params


# Fit AutoML with the custom sampler plugged into the LightGBM parameters.
automl = TabularAutoML(
    task=Task("binary"),
    lgb_params={"optimization_search_space": sample},
)
oof_predictions = automl.fit_predict(
    train_data,
    roles={"target": "TARGET", "drop": ["SK_ID_CURR"]},
)
te_pred = automl.predict(test_data)

# Report ROC AUC for out-of-fold and hold-out predictions.
print(f"Score for out-of-fold predictions: {roc_auc_score(train_data['TARGET'].values, oof_predictions.data[:, 0])}")
print(f"Score for hold-out: {roc_auc_score(test_data['TARGET'].values, te_pred.data[:, 0])}")
# train_df = train_df.drop(columns=['passengerid'])

# Read the test set and lower-case its column names.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/test.csv')
test_df.columns = [name.lower() for name in test_df.columns]

submission = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv')
submission.head()

# Every column except the first and the last one.
feature_columns = train_df.columns[1:-1].values
target_column = 'target'

# Replace the target values with integer codes.
le = LabelEncoder()
train_df[target_column] = le.fit_transform(train_df[target_column])

task = Task('multiclass')

roles = {
    'target': target_column,
    'drop': ['id'],
}

# Fit the AutoML preset; out-of-fold predictions are kept in `oof_pred`.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={'n_jobs': N_THREADS},
    verbose=0,
)
oof_pred = automl.fit_predict(train_df, roles=roles)
# Linear model pipeline: linear feature set feeding a single L-BFGS model.
print("Create linear...")
feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe="auto")
timer_reg = timer.get_task_timer("reg")
reg_0 = LinearLBFGS(timer=timer_reg)
reg_lvl0 = MLPipeline(
    [reg_0],
    features_pipeline=feats_reg_0,
    pre_selection=None,
    post_selection=None,
)
print("Linear created...")

# ======================================================================================
# Reader for a multiclass task scored with cross-entropy.
print("Create reader...")
multiclass_task = Task("multiclass", metric="crossentropy")
reader = PandasToPandasReader(
    multiclass_task,
    samples=None,
    max_nan_rate=1,
    max_constant_rate=1,
    advanced_roles=True,
    drop_score_co=-1,
    n_jobs=1,
)
print("Reader created...")

# ======================================================================================
print("Create blender...")
blender = WeightedBlender()
print("Blender created...")

# ======================================================================================
print("Create AutoML...")
def test_different_losses_and_metrics():
    """Smoke-test one-level LightGBM AutoML under several loss/metric setups.

    The sample application data gets a few engineered columns and is split
    into train and a 2000-row test part. For each task configuration (two
    binary ones on ``TARGET``, three regression ones on ``AMT_CREDIT``) the
    test builds a reader and a single ``BoostLGBM`` pipeline, fits it, logs
    out-of-fold and test scores computed with the task's own metric, and then
    round-trips the fitted AutoML through pickle and scores it again.

    The reloaded model is scored with the same metric and the same target
    column as the freshly fitted one, so the two logged test scores are
    directly comparable for every configuration.

    The test only logs the scores; it passes as long as nothing raises.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01')
        + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    ).astype(str)
    # Positive DAYS_EMPLOYED values are clipped to 0 before the date shift.
    data['EMP_DATE'] = (
        np.datetime64('2018-01-01')
        + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
            np.dtype('timedelta64[D]'))
    ).astype(str)

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    # (Task keyword arguments, target column) for every configuration to try.
    task_cases = [
        ({'name': 'binary'}, 'TARGET'),
        ({'name': 'binary', 'metric': roc_auc_score}, 'TARGET'),
        ({'name': 'reg', 'loss': 'mse', 'metric': 'r2'}, 'AMT_CREDIT'),
        ({'name': 'reg', 'loss': 'rmsle', 'metric': 'rmsle'}, 'AMT_CREDIT'),
        ({
            'name': 'reg',
            'loss': 'quantile',
            'loss_params': {'q': .9},
            'metric': 'quantile',
            'metric_params': {'q': .9},
        }, 'AMT_CREDIT'),
    ]

    for task_params, target in task_cases:
        logging.debug('Create task..')
        task = Task(**task_params)
        logging.debug('Task created')

        logging.debug('Create reader...')
        reader = PandasToPandasReader(task, cv=5, random_state=1)
        logging.debug('Reader created')

        # pipeline 1 level parts
        logging.debug('Start creation pipeline_1...')
        pipe = LGBSimpleFeatures()

        logging.debug('\t ParamsTuner2 and Model2...')
        model2 = BoostLGBM(default_params={
            'learning_rate': 0.025,
            'num_leaves': 64,
            'seed': 2,
            'num_threads': 5
        })
        logging.debug('\t Tuner2 and model2 created')

        logging.debug('\t Pipeline1...')
        pipeline_lvl1 = MLPipeline(
            [model2],
            pre_selection=None,  # selector,
            features_pipeline=pipe,
            post_selection=None)
        logging.debug('Pipeline1 created')

        logging.debug('Create AutoML pipeline...')
        automl = AutoML(reader, [
            [pipeline_lvl1],
        ], skip_conn=False)

        logging.debug('AutoML pipeline created...')

        logging.debug('Start AutoML pipeline fit_predict...')
        start_time = time.time()
        oof_pred = automl.fit_predict(train_data, roles={'target': target})
        logging.debug(
            'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
                time.time() - start_time))

        test_pred = automl.predict(test_data)
        logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
            test_pred, test_pred.shape))

        logging.debug('Check scores...')
        logging.debug('OOF score: {}'.format(
            task.metric_func(train_data[target].values, oof_pred.data[:, 0])))
        logging.debug('TEST score: {}'.format(
            task.metric_func(test_data[target].values, test_pred.data[:, 0])))

        # Round-trip the fitted AutoML through pickle; the file is written and
        # read only by this test.
        logging.debug('Pickle automl')
        with open('automl.pickle', 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open('automl.pickle', 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)
        # Score against the column this model was trained on, with the task's
        # own metric: ROC AUC against 'TARGET' is meaningless for the
        # regression configurations fitted on 'AMT_CREDIT'.
        logging.debug('TEST score, loaded: {}'.format(
            task.metric_func(test_data[target].values, test_pred.data[:, 0])))

        os.remove('automl.pickle')