def create_linear_automl(
    task: Task,
    n_folds: int = 5,
    timeout: Optional[float] = None,
    n_reader_jobs: int = 1,
    cpu_limit: int = 4,
    random_state: int = 42,
):
    """Build an unfitted AutoML made of a single linear (L-BFGS) pipeline.

    Args:
        task: Task passed to the reader.
        n_folds: Number of folds, passed to the reader as ``cv``.
        timeout: Stub, not used.
        n_reader_jobs: Passed to the reader as ``n_jobs``.
        cpu_limit: Thread count handed to ``torch.set_num_threads``.
        random_state: Random state passed to the reader.

    Returns:
        ``AutoML`` instance with one level that contains one ``MLPipeline``
        (``LinearFeatures`` -> ``LinearLBFGS``), created with ``skip_conn=False``.

    """
    # Side effect: changes torch's thread setting before anything is built.
    torch.set_num_threads(cpu_limit)

    reader = PandasToPandasReader(
        task,
        cv=n_folds,
        random_state=random_state,
        n_jobs=n_reader_jobs,
    )

    linear_pipeline = MLPipeline(
        [LinearLBFGS()],
        pre_selection=None,
        features_pipeline=LinearFeatures(),
        post_selection=None,
    )

    return AutoML(reader, [[linear_pipeline]], skip_conn=False)
print("Start creation blending...") feats_reg_2 = LinearFeatures(output_categories=True) reg_2 = LinearLBFGS() reg_lvl1 = MLPipeline( [reg_2], pre_selection=None, features_pipeline=feats_reg_2, post_selection=HighCorrRemoval(corr_co=1), ) print("End creation blending...") print("Start creation automl...") reader = PandasToPandasReader( Task("binary", ), samples=None, max_nan_rate=1, max_constant_rate=1, ) automl = AutoML( reader, [ [gbm_lvl0, reg_lvl0, reg_l1_lvl0], [reg_lvl1], ], skip_conn=False, blender=MeanBlender(), ) print("End creation automl...") print("Start fit automl...")
def test_permutation_importance_based_iterative_selector():
    """Fit a two-level AutoML behind an iterative permutation-importance selector.

    Level one holds two ``BoostLGBM`` models (the second tuned by ``OptunaTuner``)
    that share an ``NpIterativeFeatureSelector`` as pre-selection; level two holds
    one ``BoostLGBM`` with frozen defaults. The fitted AutoML is also pickled,
    reloaded and used for prediction again. Scores and feature importances are
    only logged; nothing is asserted.
    """
    logging.basicConfig(format="[%(asctime)s] (%(levelname)s): %(message)s", level=logging.DEBUG)

    logging.debug("Load data...")
    data = pd.read_csv("./examples/data/sampled_app_train.csv")
    logging.debug("Data loaded")

    logging.debug("Features modification from user side...")
    # Turn the day offsets into date strings relative to 2018-01-01.
    data["BIRTH_DATE"] = (
        np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]"))
    ).astype(str)
    data["EMP_DATE"] = (
        np.datetime64("2018-01-01")
        + np.clip(data["DAYS_EMPLOYED"], None, 0).astype(np.dtype("timedelta64[D]"))
    ).astype(str)

    # Degenerate columns added on purpose: one constant, one all-NaN.
    data["constant"] = 1
    data["allnan"] = np.nan

    data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True)
    logging.debug("Features modification finished")

    logging.debug("Split data...")
    train_data, test_data = train_test_split(data, test_size=2000, stratify=data["TARGET"], random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        "Data splitted. Parts sizes: train_data = {}, test_data = {}".format(train_data.shape, test_data.shape)
    )

    logging.debug("Create task...")
    task = Task("binary")
    logging.debug("Task created")

    logging.debug("Create reader...")
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug("Reader created")

    # selector parts
    logging.debug("Create feature selector")
    model0 = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "seed": 42,
            "num_threads": 5,
        }
    )
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0, model0, pie, feature_group_size=1, max_features_cnt_in_result=15)
    logging.debug("Feature selector created")

    # pipeline 1 level parts
    logging.debug("Start creation pipeline_1...")
    pipe = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner1 and Model1...")
    model1 = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 128,
            "seed": 1,
            "num_threads": 5,
        }
    )
    logging.debug("\t Tuner1 and model1 created")

    logging.debug("\t ParamsTuner2 and Model2...")
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(
        default_params={
            "learning_rate": 0.025,
            "num_leaves": 64,
            "seed": 2,
            "num_threads": 5,
        }
    )
    logging.debug("\t Tuner2 and model2 created")

    logging.debug("\t Pipeline1...")
    pipeline_lvl1 = MLPipeline(
        [model1, (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    logging.debug("Pipeline1 created")

    # pipeline 2 level parts
    logging.debug("Start creation pipeline_2...")
    pipe1 = LGBSimpleFeatures()

    logging.debug("\t ParamsTuner and Model...")
    model = BoostLGBM(
        default_params={
            "learning_rate": 0.05,
            "num_leaves": 64,
            "max_bin": 1024,
            "seed": 3,
            "num_threads": 5,
        },
        freeze_defaults=True,
    )
    logging.debug("\t Tuner and model created")

    logging.debug("\t Pipeline2...")
    pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)
    logging.debug("Pipeline2 created")

    logging.debug("Create AutoML pipeline...")
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
            [pipeline_lvl2],
        ],
        skip_conn=False,
    )
    logging.debug("AutoML pipeline created...")

    logging.debug("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": "TARGET"})
    logging.debug("AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(time.time() - start_time))

    logging.debug("Feature importances of selector:\n{}".format(selector.get_features_score()))

    logging.debug("oof_pred:\n{}\nShape = {}".format(oof_pred, oof_pred.shape))

    logging.debug(
        "Feature importances of top level algorithm:\n{}".format(
            automl.levels[-1][0].ml_algos[0].get_features_score()
        )
    )
    logging.debug(
        "Feature importances of lowest level algorithm - model 0:\n{}".format(
            automl.levels[0][0].ml_algos[0].get_features_score()
        )
    )
    logging.debug(
        "Feature importances of lowest level algorithm - model 1:\n{}".format(
            automl.levels[0][0].ml_algos[1].get_features_score()
        )
    )

    test_pred = automl.predict(test_data)
    logging.debug("Prediction for test data:\n{}\nShape = {}".format(test_pred, test_pred.shape))

    logging.debug("Check scores...")
    logging.debug("OOF score: {}".format(roc_auc_score(train_data["TARGET"].values, oof_pred.data[:, 0])))
    logging.debug("TEST score: {}".format(roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0])))

    logging.debug("Pickle automl")
    pickle_path = "automl.pickle"
    try:
        with open(pickle_path, "wb") as f:
            pickle.dump(automl, f)

        logging.debug("Load pickled automl")
        with open(pickle_path, "rb") as f:
            automl = pickle.load(f)

        logging.debug("Predict loaded automl")
        test_pred = automl.predict(test_data)
        logging.debug(
            "TEST score, loaded: {}".format(roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0]))
        )
    finally:
        # Remove the scratch file even when the round-trip above fails,
        # so a broken run does not leave it in the working directory.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
"q": 0.9 }, "metric": "quantile", "metric_params": { "q": 0.9 }, }, ], ["TARGET", "TARGET", "AMT_CREDIT", "AMT_CREDIT", "AMT_CREDIT"], ): print("Create task..") task = Task(**task_params) print("Task created") print("Create reader...") reader = PandasToPandasReader(task, cv=5, random_state=1) print("Reader created") # pipeline 1 level parts print("Start creation pipeline_1...") pipe = LGBSimpleFeatures() print("\t ParamsTuner2 and Model2...") model2 = BoostLGBM(default_params={ "learning_rate": 0.025, "num_leaves": 64, "seed": 2, "num_threads": 5, }) print("\t Tuner2 and model2 created")
def test_permutation_importance_based_iterative_selector():
    """Fit a two-level AutoML behind an iterative permutation-importance selector.

    Level one holds two ``BoostLGBM`` models (the second tuned by ``OptunaTuner``)
    that share an ``NpIterativeFeatureSelector`` as pre-selection; level two holds
    one ``BoostLGBM`` with frozen defaults. The fitted AutoML is also pickled,
    reloaded and used for prediction again. Scores and feature importances are
    only logged; nothing is asserted.
    """
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    ).astype(str)
    data['EMP_DATE'] = (
        np.datetime64('2018-01-01')
        + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    ).astype(str)

    # Degenerate columns added on purpose: one constant, one all-NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data, test_size=2000, stratify=data['TARGET'], random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(train_data.shape, test_data.shape)
    )

    logging.debug('Create task...')
    task = Task('binary')
    logging.debug('Task created')

    logging.debug('Create reader...')
    reader = PandasToPandasReader(task, cv=5, random_state=1)
    logging.debug('Reader created')

    # selector parts
    logging.debug('Create feature selector')
    model0 = BoostLGBM(
        default_params={
            'learning_rate': 0.05,
            'num_leaves': 64,
            'seed': 42,
            'num_threads': 5,
        }
    )
    pipe0 = LGBSimpleFeatures()
    pie = NpPermutationImportanceEstimator()
    selector = NpIterativeFeatureSelector(pipe0, model0, pie, feature_group_size=1, max_features_cnt_in_result=15)
    logging.debug('Feature selector created')

    # pipeline 1 level parts
    logging.debug('Start creation pipeline_1...')
    pipe = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner1 and Model1...')
    model1 = BoostLGBM(
        default_params={
            'learning_rate': 0.05,
            'num_leaves': 128,
            'seed': 1,
            'num_threads': 5,
        }
    )
    logging.debug('\t Tuner1 and model1 created')

    logging.debug('\t ParamsTuner2 and Model2...')
    params_tuner2 = OptunaTuner(n_trials=100, timeout=100)
    model2 = BoostLGBM(
        default_params={
            'learning_rate': 0.025,
            'num_leaves': 64,
            'seed': 2,
            'num_threads': 5,
        }
    )
    logging.debug('\t Tuner2 and model2 created')

    logging.debug('\t Pipeline1...')
    pipeline_lvl1 = MLPipeline(
        [model1, (model2, params_tuner2)],
        pre_selection=selector,
        features_pipeline=pipe,
        post_selection=None,
    )
    logging.debug('Pipeline1 created')

    # pipeline 2 level parts
    logging.debug('Start creation pipeline_2...')
    pipe1 = LGBSimpleFeatures()

    logging.debug('\t ParamsTuner and Model...')
    model = BoostLGBM(
        default_params={
            'learning_rate': 0.05,
            'num_leaves': 64,
            'max_bin': 1024,
            'seed': 3,
            'num_threads': 5,
        },
        freeze_defaults=True,
    )
    logging.debug('\t Tuner and model created')

    logging.debug('\t Pipeline2...')
    pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)
    logging.debug('Pipeline2 created')

    logging.debug('Create AutoML pipeline...')
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
            [pipeline_lvl2],
        ],
        skip_conn=False,
    )
    logging.debug('AutoML pipeline created...')

    logging.debug('Start AutoML pipeline fit_predict...')
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={'target': 'TARGET'})
    logging.debug('AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(time.time() - start_time))

    logging.debug('Feature importances of selector:\n{}'.format(selector.get_features_score()))

    logging.debug('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

    logging.debug(
        'Feature importances of top level algorithm:\n{}'.format(
            automl.levels[-1][0].ml_algos[0].get_features_score()
        )
    )
    logging.debug(
        'Feature importances of lowest level algorithm - model 0:\n{}'.format(
            automl.levels[0][0].ml_algos[0].get_features_score()
        )
    )
    logging.debug(
        'Feature importances of lowest level algorithm - model 1:\n{}'.format(
            automl.levels[0][0].ml_algos[1].get_features_score()
        )
    )

    test_pred = automl.predict(test_data)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

    logging.debug('Check scores...')
    logging.debug('OOF score: {}'.format(roc_auc_score(train_data['TARGET'].values, oof_pred.data[:, 0])))
    logging.debug('TEST score: {}'.format(roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))

    logging.debug('Pickle automl')
    pickle_path = 'automl.pickle'
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)
        logging.debug(
            'TEST score, loaded: {}'.format(roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0]))
        )
    finally:
        # Remove the scratch file even when the round-trip above fails,
        # so a broken run does not leave it in the working directory.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
def test_boostlgbm_and_linearlbfgs_in_one_automl_pipeline():
    """Fit a two-level AutoML that mixes boosting and linear pipelines.

    Level one combines three pipelines: two ``BoostLGBM`` models behind an
    importance-cutoff selector, a ``LinearLBFGS`` model, and a ``LinearL1CD``
    model behind an iterative permutation-importance selector. Level two is a
    single ``LinearLBFGS`` pipeline; the AutoML uses ``MeanBlender``. The fitted
    AutoML is also pickled, reloaded and used for prediction again. Scores are
    printed or logged; nothing is asserted.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    ).astype(str)
    data['EMP_DATE'] = (
        np.datetime64('2018-01-01')
        + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns added on purpose: one constant, one all-NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train, test = train_test_split(data, test_size=0.2, random_state=42)
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    logging.debug('Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(train.shape, test.shape))

    logging.debug('Start creation selector_0...')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostLGBM()
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0)
    logging.debug('End creation selector_0...')

    logging.debug('Start creation gbm_0...')
    feats_gbm_0 = LGBAdvancedPipeline()
    gbm_0 = BoostLGBM()
    gbm_1 = BoostLGBM()
    tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True)
    gbm_lvl0 = MLPipeline(
        [(gbm_0, tuner_0), gbm_1],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None,
    )
    logging.debug('End creation gbm_0...')

    logging.debug('Start creation reg_0...')
    feats_reg_0 = LinearFeatures(output_categories=True)
    reg_0 = LinearLBFGS()
    reg_lvl0 = MLPipeline(
        [reg_0],
        pre_selection=None,
        features_pipeline=feats_reg_0,
        post_selection=HighCorrRemoval(corr_co=1),
    )
    logging.debug('End creation reg_0...')

    logging.debug('Start creation composed selector...')
    feat_sel_1 = LGBSimpleFeatures()
    mod_sel_1 = BoostLGBM()
    imp_sel_1 = NpPermutationImportanceEstimator()
    selector_1 = NpIterativeFeatureSelector(feat_sel_1, mod_sel_1, imp_sel_1, feature_group_size=1)
    logging.debug('End creation composed selector...')

    logging.debug('Start creation reg_l1_0...')
    feats_reg_1 = LinearFeatures(output_categories=False)
    reg_1 = LinearL1CD()
    reg_l1_lvl0 = MLPipeline(
        [reg_1],
        pre_selection=selector_1,
        features_pipeline=feats_reg_1,
        post_selection=HighCorrRemoval(),
    )
    logging.debug('End creation reg_l1_0...')

    logging.debug('Start creation blending...')
    feats_reg_2 = LinearFeatures(output_categories=True)
    reg_2 = LinearLBFGS()
    reg_lvl1 = MLPipeline(
        [reg_2],
        pre_selection=None,
        features_pipeline=feats_reg_2,
        post_selection=HighCorrRemoval(corr_co=1),
    )
    logging.debug('End creation blending...')

    logging.debug('Start creation automl...')
    reader = PandasToPandasReader(Task('binary', ), samples=None, max_nan_rate=1, max_constant_rate=1)
    automl = AutoML(
        reader,
        [
            [gbm_lvl0, reg_lvl0, reg_l1_lvl0],
            [reg_lvl1],
        ],
        skip_conn=False,
        blender=MeanBlender(),
    )
    logging.debug('End creation automl...')

    logging.debug('Start fit automl...')
    # The roles mapping uses both a plain string key and a role-object key.
    roles = {
        'target': 'TARGET',
        DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
    }
    oof_pred = automl.fit_predict(train, roles=roles)
    logging.debug('End fit automl...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

    # Score OOF only on rows that have at least one non-NaN prediction.
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('Check scores...')
    print('OOF score: {}'.format(
        roc_auc_score(train[roles['target']].values[not_nan], oof_pred.data[not_nan][:, 0])))
    print('TEST score: {}'.format(roc_auc_score(test[roles['target']].values, test_pred.data[:, 0])))

    logging.debug('Pickle automl')
    pickle_path = 'automl.pickle'
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open(pickle_path, 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test)
        logging.debug('TEST score, loaded: {}'.format(roc_auc_score(test['TARGET'].values, test_pred.data[:, 0])))
    finally:
        # Remove the scratch file even when the round-trip above fails,
        # so a broken run does not leave it in the working directory.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
timer_reg = timer.get_task_timer("reg") reg_0 = LinearLBFGS(timer=timer_reg) reg_lvl0 = MLPipeline([reg_0], pre_selection=None, features_pipeline=feats_reg_0, post_selection=None) print("Linear created...") # ====================================================================================== print("Create reader...") reader = PandasToPandasReader( Task( "multiclass", metric="crossentropy", ), samples=None, max_nan_rate=1, max_constant_rate=1, advanced_roles=True, drop_score_co=-1, n_jobs=1, ) print("Reader created...") # ====================================================================================== print("Create blender...") blender = WeightedBlender() print("Blender created...") # ====================================================================================== print("Create AutoML...") automl = AutoML( reader=reader, levels=[[gbm_lvl0, reg_lvl0]],
def test_multiclass_task_with_catboost():
    """Fit a one-level multiclass AutoML with ``BoostCB`` and ``LinearLBFGS`` pipelines.

    The binary ``TARGET`` column is randomly relabelled so that a third class
    (``2``) exists. A ``PipelineTimer`` is shared by all models, and the two
    pipelines are combined with ``WeightedBlender``. Log-loss and one-vs-rest
    AUC per class are only logged; nothing is asserted.
    """
    # Seed the global RNG so the random relabelling below is reproducible,
    # as the sibling tests in this file do.
    np.random.seed(42)

    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')

    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    ).astype(str)
    data['EMP_DATE'] = (
        np.datetime64('2018-01-01')
        + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    ).astype(str)

    data['report_dt'] = np.datetime64('2018-01-01')

    # Degenerate columns added on purpose: one constant, one all-NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

    # Rows whose random draw exceeds 0.5 get label 2; the rest keep their label.
    data['TARGET'] = np.where(np.random.rand(data.shape[0]) > .5, 2, data['TARGET'].values)

    train, test = train_test_split(data, test_size=2000, random_state=42)

    # ======================================================================================
    logging.debug('Create timer...')
    timer = PipelineTimer(600, mode=2)
    logging.debug('Timer created...')

    # ======================================================================================
    logging.debug('Create selector...')
    timer_gbm = timer.get_task_timer('gbm')
    feat_sel_0 = LGBSimpleFeatures()
    mod_sel_0 = BoostCB(timer=timer_gbm)
    imp_sel_0 = ModelBasedImportanceEstimator()
    selector_0 = ImportanceCutoffSelector(
        feat_sel_0,
        mod_sel_0,
        imp_sel_0,
        cutoff=0,
    )
    logging.debug('Selector created...')

    # ======================================================================================
    logging.debug('Create gbms...')
    feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, feats_imp=imp_sel_0)
    timer_gbm_0 = timer.get_task_timer('gbm')
    timer_gbm_1 = timer.get_task_timer('gbm')

    gbm_0 = BoostCB(timer=timer_gbm_0, default_params={"devices": "0"})
    gbm_1 = BoostCB(timer=timer_gbm_1, default_params={"devices": "0"})

    tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True)
    gbm_lvl0 = MLPipeline(
        [(gbm_0, tuner_0), gbm_1],
        pre_selection=selector_0,
        features_pipeline=feats_gbm_0,
        post_selection=None,
    )
    logging.debug('Gbms created...')

    # ======================================================================================
    logging.debug('Create linear...')
    feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe='auto')

    timer_reg = timer.get_task_timer('reg')
    reg_0 = LinearLBFGS(timer=timer_reg)

    reg_lvl0 = MLPipeline([reg_0], pre_selection=None, features_pipeline=feats_reg_0, post_selection=None)
    logging.debug('Linear created...')

    # ======================================================================================
    logging.debug('Create reader...')
    reader = PandasToPandasReader(
        Task(
            'multiclass',
            metric='crossentropy',
        ),
        samples=None,
        max_nan_rate=1,
        max_constant_rate=1,
        advanced_roles=True,
        drop_score_co=-1,
        n_jobs=1,
    )
    logging.debug('Reader created...')

    # ======================================================================================
    logging.debug('Create blender...')
    blender = WeightedBlender()
    logging.debug('Blender created...')

    # ======================================================================================
    logging.debug('Create AutoML...')
    automl = AutoML(
        reader=reader,
        levels=[[gbm_lvl0, reg_lvl0]],
        timer=timer,
        blender=blender,
        skip_conn=False,
    )
    logging.debug('AutoML created...')

    # ======================================================================================
    logging.debug('Fit predict...')
    oof_pred = automl.fit_predict(train, roles={'target': "TARGET"})
    logging.debug('Finished fitting...')

    test_pred = automl.predict(test)
    logging.debug('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

    # ======================================================================================
    logging.debug('Check scores...')
    # use only not nan
    not_nan = np.any(~np.isnan(oof_pred.data), axis=1)

    logging.debug('OOF score: {}'.format(log_loss(train['TARGET'].values[not_nan], oof_pred.data[not_nan])))
    logging.debug('TEST score: {}'.format(log_loss(test['TARGET'].values, test_pred.data)))

    # ======================================================================================
    for dat, df, name in zip([oof_pred, test_pred], [train, test], ['train', 'test']):
        logging.debug('Check aucs {0}...'.format(name))

        # Apply the same row filter as the log-loss check above, so rows
        # without any prediction do not reach roc_auc_score as NaN.
        has_pred = np.any(~np.isnan(dat.data), axis=1)
        labels = df['TARGET'].values[has_pred]
        preds = dat.data[has_pred]

        for c in range(3):
            # One-vs-rest AUC for class c against its predicted column.
            _sc = roc_auc_score((labels == c).astype(np.float32), preds[:, c])
            logging.debug('Cl {0} auc score: {1}'.format(c, _sc))
def test_different_losses_and_metrics():
    """Fit a single-``BoostLGBM`` AutoML once per task configuration.

    Five ``Task`` configurations are tried in turn (two binary, three
    regression with different losses/metrics), each against its own target
    column. For every configuration the AutoML is fitted, scored with the
    task's own metric, pickled, reloaded and scored again. Scores are only
    logged; nothing is asserted.
    """
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    # Turn the day offsets into date strings relative to 2018-01-01.
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
    ).astype(str)
    data['EMP_DATE'] = (
        np.datetime64('2018-01-01')
        + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
    ).astype(str)

    # Degenerate columns added on purpose: one constant, one all-NaN.
    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data, test_size=2000, stratify=data['TARGET'], random_state=13)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(train_data.shape, test_data.shape)
    )

    # Task keyword arguments, paired position-by-position with target columns.
    task_configs = [
        {'name': 'binary'},
        {'name': 'binary', 'metric': roc_auc_score},
        {'name': 'reg', 'loss': 'mse', 'metric': 'r2'},
        {'name': 'reg', 'loss': 'rmsle', 'metric': 'rmsle'},
        {
            'name': 'reg',
            'loss': 'quantile',
            'loss_params': {'q': .9},
            'metric': 'quantile',
            'metric_params': {'q': .9},
        },
    ]
    targets = ['TARGET', 'TARGET', 'AMT_CREDIT', 'AMT_CREDIT', 'AMT_CREDIT']

    pickle_path = 'automl.pickle'

    for task_params, target in zip(task_configs, targets):
        logging.debug('Create task..')
        task = Task(**task_params)
        logging.debug('Task created')

        logging.debug('Create reader...')
        reader = PandasToPandasReader(task, cv=5, random_state=1)
        logging.debug('Reader created')

        # pipeline 1 level parts
        logging.debug('Start creation pipeline_1...')
        pipe = LGBSimpleFeatures()

        logging.debug('\t ParamsTuner2 and Model2...')
        model2 = BoostLGBM(
            default_params={
                'learning_rate': 0.025,
                'num_leaves': 64,
                'seed': 2,
                'num_threads': 5,
            }
        )
        logging.debug('\t Tuner2 and model2 created')

        logging.debug('\t Pipeline1...')
        pipeline_lvl1 = MLPipeline(
            [model2],
            pre_selection=None,
            features_pipeline=pipe,
            post_selection=None,
        )
        logging.debug('Pipeline1 created')

        logging.debug('Create AutoML pipeline...')
        automl = AutoML(
            reader,
            [
                [pipeline_lvl1],
            ],
            skip_conn=False,
        )
        logging.debug('AutoML pipeline created...')

        logging.debug('Start AutoML pipeline fit_predict...')
        start_time = time.time()
        oof_pred = automl.fit_predict(train_data, roles={'target': target})
        logging.debug('AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(time.time() - start_time))

        test_pred = automl.predict(test_data)
        logging.debug('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

        logging.debug('Check scores...')
        logging.debug('OOF score: {}'.format(task.metric_func(train_data[target].values, oof_pred.data[:, 0])))
        logging.debug('TEST score: {}'.format(task.metric_func(test_data[target].values, test_pred.data[:, 0])))

        logging.debug('Pickle automl')
        try:
            with open(pickle_path, 'wb') as f:
                pickle.dump(automl, f)

            logging.debug('Load pickled automl')
            with open(pickle_path, 'rb') as f:
                automl = pickle.load(f)

            logging.debug('Predict loaded automl')
            test_pred = automl.predict(test_data)
            # Score with the task's own metric on the task's own target, so
            # this value is comparable with the "TEST score" logged above
            # for regression tasks as well as binary ones.
            logging.debug(
                'TEST score, loaded: {}'.format(task.metric_func(test_data[target].values, test_pred.data[:, 0]))
            )
        finally:
            # Remove the scratch file even when the round-trip above fails,
            # so a broken run does not leave it in the working directory.
            if os.path.exists(pickle_path):
                os.remove(pickle_path)