Exemple #1
0
        post_selection=None,
    )
    print("Pipeline1 created")

    print("Create AutoML pipeline...")
    automl = AutoML(
        reader,
        [
            [pipeline_lvl1],
        ],
        skip_conn=False,
    )

    print("AutoML pipeline created...")

    print("Start AutoML pipeline fit_predict...")
    start_time = time.time()
    oof_pred = automl.fit_predict(train_data, roles={"target": target})
    print("AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(
        time.time() - start_time))

    test_pred = automl.predict(test_data)
    print("Prediction for test data:\n{}\nShape = {}".format(
        test_pred, test_pred.shape))

    print("Check scores...")
    print("OOF score: {}".format(
        task.metric_func(train_data[target].values, oof_pred.data[:, 0])))
    print("TEST score: {}".format(
        task.metric_func(test_data[target].values, test_pred.data[:, 0])))
Exemple #2
0
def test_different_losses_and_metrics():
    np.random.seed(42)
    logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s',
                        level=logging.DEBUG)

    logging.debug('Load data...')
    data = pd.read_csv('../example_data/test_data_files/sampled_app_train.csv')
    logging.debug('Data loaded')

    logging.debug('Features modification from user side...')
    data['BIRTH_DATE'] = (
        np.datetime64('2018-01-01') +
        data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
    data['EMP_DATE'] = (np.datetime64('2018-01-01') +
                        np.clip(data['DAYS_EMPLOYED'], None, 0).astype(
                            np.dtype('timedelta64[D]'))).astype(str)

    data['constant'] = 1
    data['allnan'] = np.nan

    data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)
    logging.debug('Features modification finished')

    logging.debug('Split data...')
    train_data, test_data = train_test_split(data,
                                             test_size=2000,
                                             stratify=data['TARGET'],
                                             random_state=13)

    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    logging.debug(
        'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
            train_data.shape, test_data.shape))

    for task_params, target in zip([{
            'name': 'binary'
    }, {
            'name': 'binary',
            'metric': roc_auc_score
    }, {
            'name': 'reg',
            'loss': 'mse',
            'metric': 'r2'
    }, {
            'name': 'reg',
            'loss': 'rmsle',
            'metric': 'rmsle'
    }, {
            'name': 'reg',
            'loss': 'quantile',
            'loss_params': {
                'q': .9
            },
            'metric': 'quantile',
            'metric_params': {
                'q': .9
            }
    }], ['TARGET', 'TARGET', 'AMT_CREDIT', 'AMT_CREDIT', 'AMT_CREDIT']):
        logging.debug('Create task..')
        task = Task(**task_params)
        logging.debug('Task created')

        logging.debug('Create reader...')
        reader = PandasToPandasReader(task, cv=5, random_state=1)
        logging.debug('Reader created')

        # pipeline 1 level parts
        logging.debug('Start creation pipeline_1...')
        pipe = LGBSimpleFeatures()

        logging.debug('\t ParamsTuner2 and Model2...')
        model2 = BoostLGBM(default_params={
            'learning_rate': 0.025,
            'num_leaves': 64,
            'seed': 2,
            'num_threads': 5
        })
        logging.debug('\t Tuner2 and model2 created')

        logging.debug('\t Pipeline1...')
        pipeline_lvl1 = MLPipeline(
            [model2],
            pre_selection=None,  # selector,
            features_pipeline=pipe,
            post_selection=None)
        logging.debug('Pipeline1 created')

        logging.debug('Create AutoML pipeline...')
        automl = AutoML(reader, [
            [pipeline_lvl1],
        ], skip_conn=False)

        logging.debug('AutoML pipeline created...')

        logging.debug('Start AutoML pipeline fit_predict...')
        start_time = time.time()
        oof_pred = automl.fit_predict(train_data, roles={'target': target})
        logging.debug(
            'AutoML pipeline fitted and predicted. Time = {:.3f} sec'.format(
                time.time() - start_time))

        test_pred = automl.predict(test_data)
        logging.debug('Prediction for test data:\n{}\nShape = {}'.format(
            test_pred, test_pred.shape))

        logging.debug('Check scores...')
        logging.debug('OOF score: {}'.format(
            task.metric_func(train_data[target].values, oof_pred.data[:, 0])))
        logging.debug('TEST score: {}'.format(
            task.metric_func(test_data[target].values, test_pred.data[:, 0])))
        logging.debug('Pickle automl')
        with open('automl.pickle', 'wb') as f:
            pickle.dump(automl, f)

        logging.debug('Load pickled automl')
        with open('automl.pickle', 'rb') as f:
            automl = pickle.load(f)

        logging.debug('Predict loaded automl')
        test_pred = automl.predict(test_data)
        logging.debug('TEST score, loaded: {}'.format(
            roc_auc_score(test_data['TARGET'].values, test_pred.data[:, 0])))

        os.remove('automl.pickle')