Example 1
 def test_regressor(self):
     reg = RGFRegressor()
     reg.fit(self.X_train, self.y_train)
     y_pred = reg.predict(self.X_test)
     mse = mean_squared_error(self.y_test, y_pred)
     print("MSE: {0:.5f}".format(mse))
     self.assertLess(mse, 6.0)
Example 2
    def __fit(self):
        """"""
        start = time.time()
        ##
        id_cols = [
            col for col in self.TrainData.columns
            if (col.startswith('Item_Identifier'))
        ]
        self._l_drop_cols.extend(id_cols)
        X = self.TrainData.drop(self._l_drop_cols, axis=1)
        Y = self.TrainData['Item_Outlet_Sales']
        ##
        self._l_train_columns = X.columns
        print('Size of feature space: %s' % len(self._l_train_columns))
        ##
        self._model = RGFRegressor(
            algorithm=self.parameters['algorithm'],
            loss=self.parameters['loss'],
            learning_rate=self.parameters['learning_rate'],
            n_iter=self.parameters['n_iter'],
            reg_depth=self.parameters['reg_depth'],
            l2=self.parameters['l2'],
            sl2=self.parameters['sl2'],
            #min_samples_leaf= self.parameters['min_samples_leaf'],
            max_leaf=self.parameters['max_leaf'],
            verbose=True)
        self._model.fit(X, Y)
        end = time.time()
        print('\nTraining is done. Time elapsed %ds' % (end - start))

        return
Example 3
    def test_attributes(self):
        reg = RGFRegressor()
        attributes = ('n_features_', 'fitted_', 'sl2_', 'min_samples_leaf_',
                      'n_iter_')

        for attr in attributes:
            self.assertRaises(NotFittedError, getattr, reg, attr)
        reg.fit(self.X_train, self.y_train)
        self.assertEqual(reg.n_features_, self.X_train.shape[-1])
        self.assertTrue(reg.fitted_)
        if reg.sl2 is None:
            self.assertEqual(reg.sl2_, reg.l2)
        else:
            self.assertEqual(reg.sl2_, reg.sl2)
        if reg.min_samples_leaf < 1:
            self.assertLessEqual(reg.min_samples_leaf_,
                                 0.5 * self.X_train.shape[0])
        else:
            self.assertEqual(reg.min_samples_leaf_, reg.min_samples_leaf)
        if reg.n_iter is None:
            if reg.loss == "LS":
                self.assertEqual(reg.n_iter_, 10)
            else:
                self.assertEqual(reg.n_iter_, 5)
        else:
            self.assertEqual(reg.n_iter_, reg.n_iter)
Example 4
 def test_regressor(self):
     reg = RGFRegressor()
     reg.fit(self.X_train, self.y_train)
     y_pred = reg.predict(self.X_test)
     mse = mean_squared_error(self.y_test, y_pred)
     print("MSE: {0:.5f}".format(mse))
     self.assertLess(mse, 6.0)
Example 5
 def test_regressor_sparse_input(self):
     reg = RGFRegressor(prefix='reg')
     for sparse_format in (csr_matrix, csc_matrix, coo_matrix):
         X_sparse = sparse_format(self.X)
         reg.fit(X_sparse, self.y)
         y_pred = reg.predict(X_sparse)
         mse = mean_squared_error(self.y, y_pred)
         self.assertLess(mse, 6.0)
Example 6
 def test_regressor_sparse_input(self):
     reg = RGFRegressor()
     for sparse_format in (sparse.bsr_matrix, sparse.coo_matrix, sparse.csc_matrix,
                           sparse.csr_matrix, sparse.dia_matrix, sparse.dok_matrix, sparse.lil_matrix):
         X_sparse = sparse_format(self.X)
         reg.fit(X_sparse, self.y)
         y_pred = reg.predict(X_sparse)
         mse = mean_squared_error(self.y, y_pred)
         self.assertLess(mse, 6.0)
Example 7
    def test_joblib_pickle(self):
        reg = RGFRegressor()
        reg.fit(self.X_train, self.y_train)
        y_pred1 = reg.predict(self.X_test)
        joblib.dump(reg, 'test_reg.pkl')

        # Remove model file
        _cleanup()

        reg2 = joblib.load('test_reg.pkl')
        y_pred2 = reg2.predict(self.X_test)

        np.testing.assert_allclose(y_pred1, y_pred2)
Example 8
    def test_pickle(self):
        reg = RGFRegressor()
        reg.fit(self.X_train, self.y_train)
        y_pred1 = reg.predict(self.X_test)
        s = pickle.dumps(reg)

        # Remove model file
        _cleanup()

        reg2 = pickle.loads(s)
        y_pred2 = reg2.predict(self.X_test)

        np.testing.assert_allclose(y_pred1, y_pred2)
Example 9
    def test_cleanup(self):
        reg1 = RGFRegressor()
        reg1.fit(self.X_train, self.y_train)

        reg2 = RGFRegressor()
        reg2.fit(self.X_train, self.y_train)

        self.assertNotEqual(reg1.cleanup(), 0)
        self.assertEqual(reg1.cleanup(), 0)

        glob_file = os.path.join(_get_temp_path(), reg1._file_prefix + "*")
        self.assertFalse(glob.glob(glob_file))

        self.assertRaises(NotFittedError, reg1.predict, self.X_test)
        reg2.predict(self.X_test)
Example 10
 def test_parallel_gridsearch(self):
     param_grid = dict(max_leaf=[100, 300])
     grid = GridSearchCV(RGFRegressor(),
                         param_grid=param_grid, refit=True, cv=2, verbose=0, n_jobs=-1)
     grid.fit(self.X_train, self.y_train)
     y_pred = grid.best_estimator_.predict(self.X_test)
     mse = mean_squared_error(self.y_test, y_pred)
     self.assertLess(mse, 6.0)
Example 11
    def test_params(self):
        reg = RGFRegressor()

        valid_params = dict(max_leaf=300,
                            test_interval=100,
                            algorithm='RGF_Sib',
                            loss='Log',
                            reg_depth=1.1,
                            l2=0.1,
                            sl2=None,
                            normalize=False,
                            min_samples_leaf=9,
                            n_iter=None,
                            n_tree_search=2,
                            opt_interval=100,
                            learning_rate=0.4,
                            verbose=True,
                            prefix='rgf_regressor',
                            inc_prefix=True,
                            clean=True)
        reg.set_params(**valid_params)
        reg.fit(self.X_train, self.y_train)

        non_valid_params = dict(max_leaf=0,
                                test_interval=0,
                                algorithm='RGF_Test',
                                loss=True,
                                reg_depth=0.1,
                                l2=11,
                                sl2=-1.1,
                                normalize='False',
                                min_samples_leaf=0.7,
                                n_iter=11.1,
                                n_tree_search=0,
                                opt_interval=100.1,
                                learning_rate=-0.5,
                                verbose=-1,
                                prefix='',
                                inc_prefix=1,
                                clean=0)
        for key in non_valid_params:
            reg.set_params(**valid_params)  # Reset to valid params
            reg.set_params(**{key: non_valid_params[key]})  # Pick and set one invalid parameter
            self.assertRaises(ValueError, reg.fit, self.X_train, self.y_train)
Example 12
    def test_input_arrays_shape(self):
        reg = RGFRegressor()

        n_samples = self.y_train.shape[0]
        self.assertRaises(ValueError, reg.fit, self.X_train,
                          self.y_train[:(n_samples - 1)])
        self.assertRaises(ValueError, reg.fit, self.X_train, self.y_train,
                          np.ones(n_samples - 1))
        self.assertRaises(ValueError, reg.fit, self.X_train, self.y_train,
                          np.ones((n_samples, 2)))
Example 13
    def test(self):
        df = pd.read_csv('MorganMACCS.csv')
        baseDf = df
        extractDf = df['CAS'].isin(ejectCAS)
        df = df[~df['CAS'].isin(ejectCAS)]
        y = df['logTox']
        dropList = ['CAS', 'toxValue', 'logTox', 'HDonor', 'HAcceptors',
                    'AromaticHeterocycles', 'AromaticCarbocycles', 'FractionCSP3']
        # dropList = ['CAS','toxValue','logTox']
        X = df.drop(columns=dropList)
        # Normalize: drop all-zero fingerprint bit columns, standardize the rest
        for name in X.columns:
            if str(name).isdecimal():
                if X[str(name)].sum() == 0:
                    print(name)
                    X = X.drop(columns=name)
            else:
                std = X[name].std()
                mean = X[name].mean()
                X[name] = X[name].apply(lambda x: (x - mean) / std)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

        cols = np.arange(1,550,1).tolist()
        cols = X.columns.tolist()
        cols = [1,2,3]
        # Initializing Classifiers
        reg1 = Ridge(random_state=1)
        #reg2 = ExtraTreesRegressor()
        reg2 = ExtraTreesRegressor(n_estimators=50,max_features= 50,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
        reg3 = SVR(gamma='auto',kernel='linear')
        reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
        pls = PLSRegression(n_components=3)
        pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
        #linear =SGDRegressor(max_iter=1000)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        nbrs = KNeighborsRegressor(2)
        pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

        meta = ExtraTreesRegressor(n_estimators=50,max_features= 7,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)

        stackReg = StackingRegressor(regressors=[reg1,reg2, reg3,pipe1,pls,nbrs,rgf], meta_regressor=meta,verbose=1)
        stackReg.fit(X_train, y_train)
        y_pred = stackReg.predict(X_train)
        y_val = stackReg.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))

        reg4.fit(X_train, y_train)
        y_pred = reg4.predict(X_train)
        y_val = reg4.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
Example 14
    def test_sample_weight(self):
        reg = RGFRegressor()

        y_pred = reg.fit(self.X_train, self.y_train).predict(self.X_test)
        y_pred_weighted = reg.fit(self.X_train,
                                  self.y_train,
                                  np.ones(self.y_train.shape[0])
                                  ).predict(self.X_test)
        np.testing.assert_allclose(y_pred, y_pred_weighted)

        np.random.seed(42)
        idx = np.random.choice(400, 80, replace=False)
        self.X_train[idx] = -99999  # Add some outliers
        y_pred_corrupt = reg.fit(self.X_train, self.y_train).predict(self.X_test)
        mse_corrupt = mean_squared_error(self.y_test, y_pred_corrupt)
        weights = np.ones(self.y_train.shape[0])
        weights[idx] = np.nextafter(np.float32(0), np.float32(1))  # Eliminate outliers
        y_pred_weighted = reg.fit(self.X_train, self.y_train, weights).predict(self.X_test)
        mse_fixed = mean_squared_error(self.y_test, y_pred_weighted)
        self.assertLess(mse_fixed, mse_corrupt)
Example 15
    def test_attributes(self):
        reg = RGFRegressor()
        attributes = ('n_features_', 'fitted_', 'sl2_', 'min_samples_leaf_', 'n_iter_')

        for attr in attributes:
            self.assertRaises(NotFittedError, getattr, reg, attr)
        reg.fit(self.X_train, self.y_train)
        self.assertEqual(reg.n_features_, self.X_train.shape[-1])
        self.assertTrue(reg.fitted_)
        if reg.sl2 is None:
            self.assertEqual(reg.sl2_, reg.l2)
        else:
            self.assertEqual(reg.sl2_, reg.sl2)
        if reg.min_samples_leaf < 1:
            self.assertLessEqual(reg.min_samples_leaf_, 0.5 * self.X_train.shape[0])
        else:
            self.assertEqual(reg.min_samples_leaf_, reg.min_samples_leaf)
        if reg.n_iter is None:
            if reg.loss == "LS":
                self.assertEqual(reg.n_iter_, 10)
            else:
                self.assertEqual(reg.n_iter_, 5)
        else:
            self.assertEqual(reg.n_iter_, reg.n_iter)
Example 16
    def test_params(self):
        reg = RGFRegressor()

        valid_params = dict(max_leaf=300,
                            test_interval=100,
                            algorithm='RGF_Sib',
                            loss='Log',
                            reg_depth=1.1,
                            l2=0.1,
                            sl2=None,
                            normalize=False,
                            min_samples_leaf=9,
                            n_iter=None,
                            n_tree_search=2,
                            opt_interval=100,
                            learning_rate=0.4,
                            memory_policy='conservative',
                            verbose=True)
        reg.set_params(**valid_params)
        reg.fit(self.X_train, self.y_train)

        non_valid_params = dict(max_leaf=0,
                                test_interval=0,
                                algorithm='RGF_Test',
                                loss=True,
                                reg_depth=0.1,
                                l2=11,
                                sl2=-1.1,
                                normalize='False',
                                min_samples_leaf=0.7,
                                n_iter=11.1,
                                n_tree_search=0,
                                opt_interval=100.1,
                                learning_rate=-0.5,
                                memory_policy='Generos',
                                verbose=-1)
        for key in non_valid_params:
            reg.set_params(**valid_params)  # Reset to valid params
            reg.set_params(**{key: non_valid_params[key]})  # Pick and set one invalid parameter
            self.assertRaises(ValueError, reg.fit, self.X_train, self.y_train)
Example 17
 def RGF_cv(max_leaf, l2, min_samples_leaf):
     score = cross_validate(
         RGFRegressor(
             max_leaf=int(max_leaf),
             algorithm="RGF",
             test_interval=100,
             loss="LS",
             verbose=False,
             l2=l2,
             min_samples_leaf=int(min_samples_leaf)
         ),
         X, y,
         scoring='neg_mean_squared_error',
         cv=cv, n_jobs=-1)
     val = score['test_score'].mean()
     return val
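RGF_cv above is shaped like an objective for Bayesian hyperparameter search: it takes candidate values, cross-validates an RGFRegressor on the module-level X, y and cv, and returns the mean negative MSE (so maximizing it is correct). A minimal sketch of how such a function could be wired up, assuming the bayes_opt package is available and those globals are defined; the bounds below are illustrative, not taken from the original:

from bayes_opt import BayesianOptimization  # assumed dependency

# Illustrative search bounds; the original snippet does not specify them.
optimizer = BayesianOptimization(
    f=RGF_cv,
    pbounds={'max_leaf': (500, 3000), 'l2': (0.01, 1.0), 'min_samples_leaf': (5, 50)},
    random_state=0)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best score and the parameters that produced it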
def select_model(model_type, seed=1208):
    params = {}  # collect keyword arguments for the selected model
    if model_type == 'ridge':
        params['solver'] = 'auto'
        params['fit_intercept'] = True
        params['alpha'] = 0.4
        params['max_iter'] = 1000
        params['normalize'] = False
        params['tol'] = 0.01
        model = Ridge(**params)
    elif model_type == 'rmf':
        params['max_depth'] = 10
        params['n_estimators'] = 3000
        params['criterion'] = 'mse'
        params['max_features'] = 0.3
        params['min_samples_leaf'] = 30
        params['min_samples_split'] = 30
        params['n_jobs'] = num_threads
        params['random_state'] = seed
        model = RandomForestRegressor(**params)
    elif model_type == 'ext':
        params['max_depth'] = 10
        params['n_estimators'] = 3000
        params['max_features'] = 'auto'
        params['min_samples_leaf'] = 30
        params['min_samples_split'] = 30
        params['n_jobs'] = num_threads
        params['random_state'] = seed
        model = ExtraTreesRegressor(**params)
    elif model_type == 'rgf':
        #  params['reg_depth'] = 10
        params['max_leaf'] = 2000
        params['loss'] = "LS"
        params['n_tree_search'] = 3000
        params['min_samples_leaf'] = 30
        params['learning_rate'] = 0.01
        params['verbose'] = True
        params['algorithm'] = "RGF"
        params['test_interval'] = 100
        model = RGFRegressor(**params)

    return model
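A short usage sketch for select_model, assuming num_threads is defined at module level as the tree-based branches expect; the data names below are placeholders, not from the original:

num_threads = 4  # assumed module-level setting used by the 'rmf' and 'ext' branches

model = select_model('rgf', seed=1208)  # returns a configured RGFRegressor
model.fit(X_train, y_train)             # X_train / y_train are placeholder names
predictions = model.predict(X_test)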
Example 19
 def __init__(self, task, fast=False):
     if task == 'classification':
         self.metric = 'roc_auc'
         self.task = "classification"
         if fast:
             self.model = FastRGFClassifier()
         else:
             self.model = RGFClassifier(loss="Log")
     else:
         self.metric = 'neg_mean_squared_error'
         self.task = "regression"
         if fast:
             self.model = FastRGFRegressor()
         else:
             self.model = RGFRegressor(loss="LS", normalize=True)
     self.X_test = None
     self.X_train = None
     self.y_test = None
     self.y_train = None
     self.grid_search = None
     self.y_predict = None
     self.test_score = None
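Only the constructor of this wrapper is shown, so its class name and remaining methods are unknown; a purely illustrative sketch that treats the class as ModelWrapper (a hypothetical name) and exercises just the attributes set above:

from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score, train_test_split

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
wrapper = ModelWrapper(task='regression', fast=False)  # hypothetical class name
wrapper.X_train, wrapper.X_test, wrapper.y_train, wrapper.y_test = train_test_split(X, y, random_state=0)
scores = cross_val_score(wrapper.model, wrapper.X_train, wrapper.y_train,
                         scoring=wrapper.metric, cv=3)
print(scores.mean())  # mean negative MSE for the regression task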
Example 20
    def test_sample_weight(self):
        reg = RGFRegressor()

        y_pred = reg.fit(self.X_train, self.y_train).predict(self.X_test)
        y_pred_weighted = reg.fit(self.X_train,
                                  self.y_train,
                                  np.ones(self.y_train.shape[0])
                                  ).predict(self.X_test)
        np.testing.assert_allclose(y_pred, y_pred_weighted)

        np.random.seed(42)
        idx = np.random.choice(400, 80, replace=False)
        self.X_train[idx] = -99999  # Add some outliers
        y_pred_corrupt = reg.fit(self.X_train, self.y_train).predict(self.X_test)
        mse_corrupt = mean_squared_error(self.y_test, y_pred_corrupt)
        weights = np.ones(self.y_train.shape[0])
        weights[idx] = np.nextafter(np.float32(0), np.float32(1))  # Eliminate outliers
        y_pred_weighted = reg.fit(self.X_train, self.y_train, weights).predict(self.X_test)
        mse_fixed = mean_squared_error(self.y_test, y_pred_weighted)
        self.assertLess(mse_fixed, mse_corrupt)
Example 21
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(
    lambda x: int(x[0]) * 60 + int(x[1]))
df_train_set['is_weekend'] = np.where(df_train_set['travel_date'] >= 5, 1, 0)

#print(df_train_set.head(5))

# ------ model
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, shuffle=True)

model = RGFRegressor(max_leaf=4500,
                     algorithm="RGF_Sib",
                     test_interval=50,
                     loss="LS",
                     verbose=False)

scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5)
print(sum(scores) / len(scores))  # mean of the negative MAE scores

#print(X.head(5))
#model.fit(X, y)
#preds_train_set = model.predict(X_test)

#print(mean_absolute_error(preds_train_set, y_test))

sys.exit()

# ----------------------
Example 22
def run(seed):

    # create folders for scores models and preds
    folder_models = './models/domain2_var1/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/domain2_var1/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')

    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competition data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    pca0_cols = [c for c in pca_cols if 'k0' in c]

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1

    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['domain2_var1'].copy().reset_index(drop=True)
    d21_index = list(train['domain2_var1'].dropna().index)

    # apply biases
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]

    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]

    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    print('Creating FNC score...')

    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)

    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)

    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # II. Create agg score
    print('Creating AGG score...')

    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)

    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # III. Create pca score
    print('Creating PCA score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)

    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # IV. Create im score
    print('Creating IM score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)

    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # V. Create dl score
    print('Creating DL score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)

    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')

    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d21_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create different datasets
    # linear
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca0_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca0_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca0_cols,
                                                  sc=StandardScaler())

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # learning process on different datasets
    names = ['GP', 'SVM1', 'SVM2', 'Lasso', 'BgR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=3, kernel='rbf'),
        NuSVR(C=3, kernel='rbf'),
        Lasso(alpha=0.1, random_state=0),
        BaggingRegressor(Ridge(alpha=1),
                         n_estimators=100,
                         max_samples=0.2,
                         max_features=0.2,
                         random_state=0)
    ]

    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names,
                        is_blend=True)

    # rewrite folders for models and preds
    folder_models = './models/domain2_var1/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/domain2_var1/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    d21_prediction = pd.DataFrame()
    d21_prediction['Id'] = test['Id'].values
    d21_prediction['pred'] = preds
    d21_prediction.to_csv(folder_preds +
                          'domain2_var1_stack_seed{}.csv'.format(seed),
                          index=False)
    print('domain2_var1 seed pred is saved as',
          folder_preds + 'domain2_var1_stack_seed{}.csv'.format(seed))
Example 23
def rgf_state_prediction(state, lookback, horizon, predictors):
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))

    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            if os.path.isfile('saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                    state, city)):
                print(city, 'done')
                continue

            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]

            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            data_lag = data_lag.dropna()
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]

            X_data = data_lag.drop(casos_est_columns, axis=1)
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)

            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error', 'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)

                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                try:
                    model.fit(X_train, tgt)
                except ValueError as err:
                    print('-----------------------------------------------------')
                    print(city, 'ERROR:', err)
                    print('-----------------------------------------------------')
                    break
                pred = model.predict(X_data[:len(targets[d])])

                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)

            metrics.to_pickle('{}/{}/rgf_metrics_{}.pkl'.format(
                'saved_models/rgf', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
Example 24
        kf = model_selection.KFold(n_splits=nfold,
                                   shuffle=False,
                                   random_state=seed)
        for dev_index, val_index in kf.split(y):
            # for regression, y could be converted to bins and the bins used for the split
            dev_X, val_X = train.iloc[dev_index, :], train.iloc[val_index, :]
            dev_y, val_y = y[dev_index], y[val_index]
            dev_X = dev_X[(dev_y > lbound) & (dev_y < ubound)]
            dev_y = dev_y[(dev_y > lbound) & (dev_y < ubound)]
            val_X2 = val_X[(val_y > lbound) & (val_y < ubound)]
            val_y2 = val_y[(val_y > lbound) & (val_y < ubound)]
            print(dev_X.shape)
            rgf = RGFRegressor(max_leaf=1000,
                               algorithm="RGF_Sib",
                               test_interval=100,
                               loss="LS",
                               learning_rate=0.01,
                               verbose=False)

            model = rgf.fit(dev_X, dev_y)
            print("predicting..")
            preds = model.predict(val_X)
            oobval[val_index] += preds.reshape(-1, 1)
            valerr.append(mean_absolute_error(val_y, preds))
            print(valerr, "mean:", np.mean(valerr), "std:", np.std(valerr))
            oobtest += model.predict(test.values).reshape(-1, 1)
            val_scores.append(mean_absolute_error(model.predict(valid), yval))
            del (rgf, model)
            gc.collect()

            print(val_scores, np.mean(val_scores), "---", np.std(val_scores))
Example 25
# XGBoost
if ml_algorithm == 'XGBoost':
    num_rounds = 10
    params = {'eta': 0.01, 'max_depth': 18, 'colsample_bytree': 0.2, 'subsample': 0.8,
              'colsample_bylevel': 0.3, 'alpha': 2, 'objective': 'reg:linear',
              'eval_metric': 'rmse', 'seed': 99, 'silent': True}
              # 'objective': 'binary:logistic', 'eval_metric': 'auc'
# LightGBM
elif ml_algorithm == 'LightGBM':
    num_rounds = 10
    params = {'learning_rate': 0.01, 'max_depth': 13, 'colsample_bytree': 0.2,
              'num_leaves': 580, 'application': 'regression', 'metric': 'rmse',
              'seed': 99, 'silent': True}
# RGFRegressor
elif ml_algorithm == 'RGF':
    model = RGFRegressor(max_leaf=3500, algorithm='RGF_Opt', loss="LS", l2=0.01)
# FastRGFRegressor
elif ml_algorithm == 'FastRGF':
    model = FastRGFRegressor(n_estimators=1200, sparse_max_features=1500, max_depth=5,
                             max_bin=150, min_samples_leaf=12, sparse_min_occurences=1,
                             opt_algorithm='epsilon-greedy', l2=1.0, min_child_weight=210.0,
                             learning_rate=0.2)
# Ridge Regression
elif ml_algorithm == 'Ridge':
    model = Ridge(alpha=.6, copy_X=True, fit_intercept=True, max_iter=100,
                  normalize=False, random_state=101, solver='auto', tol=0.01)
# Lasso Regression
elif ml_algorithm == 'Lasso':
    model = Lasso(alpha=.6, copy_X=True, fit_intercept=True, max_iter=100,
                  normalize=False, random_state=101, tol=0.01)
	
from sklearn.ensemble import RandomForestRegressor
from rgf.sklearn import FastRGFRegressor, RGFRegressor

boston = load_boston()
rng = check_random_state(42)
perm = rng.permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]

train_x = boston.data[:300]
test_x = boston.data[300:]
train_y = boston.target[:300]
test_y = boston.target[300:]

start = time.time()
reg = RGFRegressor()
reg.fit(train_x, train_y)
score = reg.score(test_x, test_y)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
reg = FastRGFRegressor()
reg.fit(train_x, train_y)
score = reg.score(test_x, test_y)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))

start = time.time()
Example 27
    def stacklearning(self):
        class extAll(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                return self

            def predict(self, X):
                return self

        class extMorgan(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                _,morgan,_=sepTables(X)
                return morgan
        class extMACCS(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs,morgan,_=sepTables(X)
                maccs = pd.concat([morgan,maccs],axis=1)

                return maccs

        class extDescriptor(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs,morgan,descriptor=sepTables(X)
                descriptor = pd.concat([morgan,descriptor],axis=1)
                descriptor = pd.concat([maccs,descriptor],axis=1)
                return descriptor

        class extPCA(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                model = PCA(n_components=64)
                _,morgan,_=sepTables(X)
                morgan = morgan.reset_index().drop('index', axis=1)
                W = pd.DataFrame(model.fit_transform(X))
                W = pd.concat([morgan,W],axis=1)
                return W

        lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf1 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf2 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf3 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf4 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)

        pipe1 = make_pipeline(extMACCS(), rgf)
        pipe2 = make_pipeline(extMorgan(), rgf1)
        pipe3 = make_pipeline(extDescriptor(), rgf2)
        pipe4 = make_pipeline(extPCA(), rgf3)
        pipe7 =make_pipeline(extDescriptor(), rgf4)
        pipe8 =make_pipeline(extDescriptor(), rgf4)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')
        sgd = SGDRegressor(max_iter=1000)
        pls = PLSRegression(n_components=3)
        ext = ExtraTreesRegressor(n_estimators=30,max_features= 20,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)

        pipe5 = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)

        stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rgf, verbose=1)
        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,pipe5,pipe7,pipe1], meta_regressor=rgf,verbose=1)

        scores = cross_val_score(stack2, X, y, cv=10)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'stacking'))
        stack1_score = cross_val_score(stack1,X,y, cv=10)
        rgf_score = cross_val_score(rgf,X,y,cv=10)

        stack2.fit(X_train, y_train)
        y_pred = stack2.predict(X_train)
        y_val = stack2.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        rgf.fit(X_train, y_train)
        y_pred = rgf.predict(X_train)
        y_val = rgf.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        pipe1.fit(X_train, y_train)
        y_pred = pipe1.predict(X_train)
        y_val = pipe1.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))


        cols = np.arange(1,550,1).tolist()
        cols = X.columns.tolist()
        cols = [1,2,3]
        # Initializing Classifiers
        reg1 = Ridge(random_state=1)
        #reg2 = ExtraTreesRegressor()
        reg2 = ExtraTreesRegressor(n_estimators=50,max_features= 50,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
        reg3 = SVR(gamma='auto',kernel='linear')
        reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
        pls = PLSRegression(n_components=3)
        pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
        #linear =SGDRegressor(max_iter=1000)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        nbrs = KNeighborsRegressor(2)
        pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

        meta = ExtraTreesRegressor(n_estimators=50,max_features= 7,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)

        stackReg = StackingRegressor(regressors=[reg1,reg2, reg3,pipe1,pls,nbrs,rgf], meta_regressor=meta,verbose=1)
        stackReg.fit(X_train, y_train)
        y_pred = stackReg.predict(X_train)
        y_val = stackReg.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))

        rgf.fit(X_train, y_train)
        y_pred = rgf.predict(X_train)
        y_val = rgf.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
elif ml_algorithm == 'LightGBM':
    num_rounds = 10
    params = {
        'learning_rate': 0.01,
        'max_depth': 13,
        'colsample_bytree': 0.2,
        'num_leaves': 580,
        'objective': 'binary',
        'metric': 'auc',
        'seed': 99,
        'silent': True
    }
# RGFRegressor
elif ml_algorithm == 'RGF':
    model = RGFRegressor(max_leaf=3500,
                         algorithm='RGF_Opt',
                         loss="LS",
                         l2=0.01)
# FastRGFRegressor
elif ml_algorithm == 'FastRGF':
    model = FastRGFRegressor(n_estimators=1200,
                             sparse_max_features=1500,
                             max_depth=5,
                             max_bin=150,
                             min_samples_leaf=12,
                             sparse_min_occurences=1,
                             opt_algorithm='epsilon-greedy',
                             l2=1.0,
                             min_child_weight=210.0,
                             learning_rate=0.2)
# Ridge Regression
elif ml_algorithm == 'Ridge':
Example 29
def run(seed):

    # create folders for scores models and preds
    folder_models = './models/age/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/age/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')

    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competition data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1

    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['age'].copy().reset_index(drop=True)

    # apply biases
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]

    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]

    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    print('Creating FNC score...')

    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge', 'Huber', 'OMP']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge(),
        HuberRegressor(epsilon=2.5, alpha=1),
        OrthogonalMatchingPursuit(n_nonzero_coefs=300)
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 5, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 5, names)

    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # II. Create agg score
    print('Creating AGG score...')

    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)

    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # III. Create pca score
    print('Creating PCA score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)

    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # IV. Create im score
    print('Creating IM score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)

    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # V. Create dl score
    print('Creating DL score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)

    # define models
    names = ['RGF', 'ENet', 'BRidge']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)

    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train[prefix +
              '_score'] = np.load(folder_preds +
                                  '{}_score_seed{}.npy'.format(prefix, seed))
        test[prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create different datasets
    # linear
    linear_cols = sorted(
        list(
            set(ic_cols + fnc_cols + pca_cols + agg_cols + im_cols) -
            set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_cols=pca_cols)

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # dl
    dict_cols = sorted(
        list(
            set(ic_cols + fnc_cols + dl_cols + im_cols + agg_cols) -
            set(['IC_20'])))
    train_dl, test_dl = scale_select_data(train, test, df_scale, dict_cols)

    # learning process on different datasets
    names = ['MLP', 'RGF', 'SVM', 'BR', 'OMP', 'EN', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        MLPRegressor(activation='tanh', random_state=0),
        RGFRegressor(max_leaf=1500, loss='Abs'),
        NuSVR(C=10, nu=0.4, kernel='rbf'),
        BayesianRidge(),
        OrthogonalMatchingPursuitCV(),
        ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=0),
        KernelRidge(kernel='poly', alpha=0.5)
    ]

    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2 +
            [train_dl] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2 +
                        [test_dl] * 2,
                        names,
                        is_blend=False)

    # rewrite folders for models and preds
    folder_models = './models/age/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/age/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    stack = pd.DataFrame(zoo.oof_preds).T
    stack.columns = names

    model_stacker_rgf = RGFRegressor(max_leaf=1000,
                                     reg_depth=25,
                                     verbose=False)
    rgf_pred = cross_val_predict(model_stacker_rgf,
                                 stack,
                                 y.dropna(),
                                 cv=folds,
                                 n_jobs=-1)

    model_stacker_br = BayesianRidge()
    br_pred = cross_val_predict(model_stacker_br,
                                stack,
                                y.dropna(),
                                cv=folds,
                                n_jobs=-1)

    model_stacker_rgf.fit(stack, y.dropna())
    model_stacker_br.fit(stack, y.dropna())

    # save models
    save_pickle(model_stacker_br,
                folder_models + 'BRidge_stack_seed{}'.format(seed))
    save_pickle(model_stacker_rgf,
                folder_models + 'RGF_stack_seed{}'.format(seed))
    print('Final age NMAE: {:.5f}'.format(
        NMAE(y.dropna(), 0.75 * br_pred + 0.25 * rgf_pred)))

    test_preds = pd.DataFrame(preds).T
    test_preds.columns = names

    age_prediction = pd.DataFrame()
    age_prediction['Id'] = test['Id'].values
    age_prediction['pred'] = 0.25 * model_stacker_rgf.predict(
        test_preds) + 0.75 * model_stacker_br.predict(test_preds)
    age_prediction.to_csv(folder_preds + 'age_stack_seed{}.csv'.format(seed),
                          index=False)
    print('age seed pred is saved as',
          folder_preds + 'age_stack_seed{}.csv'.format(seed))
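The blend above is scored with NMAE, which is not defined in this snippet. A minimal sketch of the metric, assuming it is the normalized mean absolute error commonly used for this target (sum of absolute errors divided by the sum of the true values):

import numpy as np

def NMAE(y_true, y_pred):
    # assumed definition: normalized mean absolute error
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sum(np.abs(y_true - y_pred)) / np.sum(y_true)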
Example No. 30
0
class RGF(ModelBase):
    """"""

    _l_drop_cols = ['Item_Outlet_Sales', 'index']

    ## training, parameter tuning for single L1
    def train(self, importance=False):
        """"""
        print('\n parameters %s \n' % self.parameters)
        d_fold_val = {}
        for fold in range(self.kfold):
            print('\n---- fold %s begins.\n' % fold)

            ## load data
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)

            ## train and predict on valid
            self.__fit()
            fold_rmse = self.__predict()
            d_fold_val[fold] = fold_rmse

            ## save
            OutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            if not os.path.exists(OutputDir):
                os.makedirs(OutputDir)
            DataUtil.save(self.TrainData,
                          '%s/train.%s' % (OutputDir, self.data_format),
                          format=self.data_format)
            DataUtil.save(self.TestData,
                          '%s/test.%s' % (OutputDir, self.data_format),
                          format=self.data_format)

            print('\n---- Fold %d done. ----\n' % fold)

        return d_fold_val

    ## inferring for fold data and holdout data
    def infer(self, head, HoldoutData, SubmitData, metric_pk=False):
        """"""
        ##
        l_pred_fold = []
        PredHoldout = pd.DataFrame(index=HoldoutData.index)
        PredHoldout['index'] = HoldoutData['index']
        PredHoldout['Item_Outlet_Sales'] = HoldoutData['Item_Outlet_Sales']
        PredSubmit = pd.DataFrame(index=SubmitData.index)
        for fold in range(self.kfold):
            ## load
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)

            ## fit
            PredFold = pd.DataFrame(index=self.TestData.index)
            PredFold['index'] = self.TestData['index']
            PredFold['Item_Outlet_Sales'] = self.TestData['Item_Outlet_Sales']
            PredFold['fold'] = fold
            self.__fit()

            ## inferring
            PredFold[head] = self._model.predict(
                self.TestData[self._l_train_columns])
            PredHoldout['fold%s' % (fold)] = self._model.predict(
                HoldoutData[self._l_train_columns])
            PredSubmit['fold%s' % fold] = self._model.predict(
                SubmitData[self._l_train_columns])
            l_pred_fold.append(PredFold)
        ## aggregate folds data
        PredKFold = pd.concat(l_pred_fold, axis=0, ignore_index=True)
        ## save for folds data
        for fold in range(self.kfold):
            FoldOutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            if not os.path.exists(FoldOutputDir):
                os.makedirs(FoldOutputDir)
            TrainFile = '%s/train.%s' % (FoldOutputDir, self.data_format)
            TestFile = '%s/test.%s' % (FoldOutputDir, self.data_format)

            TrainData = PredKFold[PredKFold['fold'] != fold]
            TestData = PredKFold[PredKFold['fold'] == fold]
            DataUtil.save(TrainData, TrainFile, format=self.data_format)
            DataUtil.save(TestData, TestFile, format=self.data_format)

        HoldCols = [
            col for col in PredHoldout.columns if col.startswith('fold')
        ]
        ## save for holdout data
        PredHoldout[head] = PredHoldout[HoldCols].mean(axis=1)
        HoldoutOutputDir = '%s/holdout' % self.OutputDir
        if not os.path.exists(HoldoutOutputDir):
            os.makedirs(HoldoutOutputDir)
        DataUtil.save(PredHoldout,
                      '%s/test.%s' % (HoldoutOutputDir, self.data_format),
                      format=self.data_format)
        ## save for submit data
        PredSubmit[head] = PredSubmit[HoldCols].mean(axis=1)
        SubmitOutputDir = '%s/submit' % self.OutputDir
        if not os.path.exists(SubmitOutputDir):
            os.makedirs(SubmitOutputDir)
        DataUtil.save(PredSubmit,
                      '%s/test.%s' % (SubmitOutputDir, self.data_format),
                      format=self.data_format)

        ## metric PK
        if metric_pk:
            d_metric = {}
            for col in self._l_train_columns:
                diff = (HoldoutData[col] - HoldoutData['Item_Outlet_Sales'])
                rmse = np.sqrt(np.sum(diff * diff) / len(diff))
                d_metric[col] = rmse
            diff = PredHoldout[head] - PredHoldout['Item_Outlet_Sales']
            ensemble_metric = np.sqrt(np.sum(diff * diff) / len(diff))
            print('\n===== metric pk result ====\n')
            print('single-model RMSEs: %s; ensemble model %s RMSE: %s' %
                  (d_metric, head, ensemble_metric))
            print('\n===== metric pk result ====\n')

        return

    ## L1 fitting
    def __fit(self):
        """"""
        start = time.time()
        ##
        id_cols = [
            col for col in self.TrainData.columns
            if (col.startswith('Item_Identifier'))
        ]
        self._l_drop_cols.extend(id_cols)
        X = self.TrainData.drop(self._l_drop_cols, axis=1)
        Y = self.TrainData['Item_Outlet_Sales']
        ##
        self._l_train_columns = X.columns
        print('Size of feature space: %s' % len(self._l_train_columns))
        ##
        self._model = RGFRegressor(
            algorithm=self.parameters['algorithm'],
            loss=self.parameters['loss'],
            learning_rate=self.parameters['learning_rate'],
            n_iter=self.parameters['n_iter'],
            reg_depth=self.parameters['reg_depth'],
            l2=self.parameters['l2'],
            sl2=self.parameters['sl2'],
            #min_samples_leaf= self.parameters['min_samples_leaf'],
            max_leaf=self.parameters['max_leaf'],
            verbose=True)
        self._model.fit(X, Y)
        end = time.time()
        print('\nTraining is done. Time elapsed %ds' % (end - start))

        return

    ## predict
    def __predict(self):
        """"""
        start = time.time()
        ##
        x_test = self.TestData[self._l_train_columns]
        pred_test = self._model.predict(x_test)
        truth_test = self.TestData['Item_Outlet_Sales']
        ## RMSE
        diff = (pred_test - truth_test)
        rmse = np.sqrt(np.sum(diff * diff) / len(diff))

        ##
        end = time.time()
        print('\n Prediction done. Time consumed %ds' % (end - start))

        return rmse
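For reference, a minimal, self-contained sketch of the per-fold averaging that infer() performs, on synthetic data and assuming the rgf_python scikit-learn wrapper (from rgf.sklearn import RGFRegressor); the hyperparameters here are placeholders, not the tuned values used above:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from rgf.sklearn import RGFRegressor

X, y = make_regression(n_samples=300, n_features=10, noise=0.5, random_state=0)
X_holdout, y_holdout = X[250:], y[250:]
X, y = X[:250], y[:250]

holdout_preds = []
for train_idx, _ in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    # one RGF model per fold, each predicting the same holdout set
    model = RGFRegressor(max_leaf=500, algorithm="RGF", loss="LS", l2=0.1)
    model.fit(X[train_idx], y[train_idx])
    holdout_preds.append(model.predict(X_holdout))

# the holdout prediction is the mean over folds, as in PredHoldout[head] above
pred_holdout = np.mean(holdout_preds, axis=0)
rmse = np.sqrt(np.mean((pred_holdout - y_holdout) ** 2))
print('holdout RMSE: %.4f' % rmse)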
Example No. 31
0
    def stacklearning(self):
        class sparseNorm(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                # L2-normalize each row after converting to a sparse CSC matrix
                # (assumes X is a pandas DataFrame and scipy is imported as sp)
                from sklearn import preprocessing
                Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
                return Y
        fm = sgd.FMRegression(
            n_iter=4743,
            init_stdev=0.1,
            rank=100,
            l2_reg_w=0,
            l2_reg_V=0,
            step_size=0.1,
        )
        # this second parameter set replaces the FMRegression defined just above
        fm = sgd.FMRegression(
            n_iter=9943,
            init_stdev=0.1,
            rank=219,
            l2_reg_w=0,
            l2_reg_V=0.06454,
            step_size=0.1,
        )
        pipe = make_pipeline(sparseNorm(), fm)
        calcACC(pipe, X=X2)

        xgb = xgboost.XGBRegressor(
                    n_estimators=100,
                    max_depth=7,
                    gamma=0,
                    colsample_bytree=0.1
                )
        lgbm = LGBMRegressor(
            boosting_type='gbdt', num_leaves=367,
            learning_rate=0.06,feature_fraction=0.14,
            max_depth=28, min_data_in_leaf=8
        )
        rgf = RGFRegressor(
            max_leaf=1211, algorithm="RGF", test_interval=100,
            loss="LS", verbose=False, l2=0.93,
            min_samples_leaf=2
        )
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=56,min_samples_split=2,
            max_features=0.21
        )
        # note: this default RandomForestRegressor replaces the tuned rf above
        rf = RandomForestRegressor()
        ext = ExtraTreesRegressor(
            n_estimators=384,max_features= 2228,
            min_samples_split= 0.01,max_depth= 856,
            min_samples_leaf= 1
        )
        svr = SVR(
            gamma=9.5367431640625e-07,
            epsilon=0.0009765625,
            C= 2048.0
        )

        #test combination
        desNew = make_pipeline(extdescriptorNew(),rf)
        morNew = make_pipeline(extMorganNew(),rf)
        kotNew = make_pipeline(extklekotaTothNew(),rf)
        macNew = make_pipeline(extMACCSNew(),rf)

        desMac = make_pipeline(extDescriptorMACCS(),rf)
        morMac = make_pipeline(extMorganMACCS(),rf)
        kotMac = make_pipeline(extKlekotaTothMACCS(),rf)

        morKotNew = make_pipeline(extMorganKlekotaTothNew(),rf)
        des = make_pipeline(extOnlyDescriptor(),rf)
        mor = make_pipeline(extOnlyMorgan(),rf)
        kot = make_pipeline(extOnlyklekotaToth(),rf)
        mac = make_pipeline(extOnlyMACCS(),rf)
        all = make_pipeline(extAll(),rf)
        allwithoutNew = make_pipeline(extAllwithoutNew(),rf)
        allwithoutMaccs = make_pipeline(extAllwithoutMaccs(),rf)
        allwithoutDes = make_pipeline(extAllwithoutDescriptor(),rf)

        testDic = {"Desc+New":desNew,"Mor+New":morNew,"kot+New":kotNew,"MACCS+New":macNew,"Des+MAC":desMac,"Morgan+Maccs":morMac,"Kot+MACCS":kotMac,"mor+kot+New":morKotNew,
        "descriptor":des,"morgan":mor,"kot":kot,"MACCS":mac,"All":all,"All without "
                                                                      "new":allwithoutNew,
                   "All without MACCS":allwithoutMaccs,"All without Des":allwithoutDes}

        #10fold
        cv = KFold(n_splits=10, shuffle=True, random_state=0)

        #Fingerprinttest
        resultDic={}
        resultDic2={}
        for name,model in testDic.items():
            #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
            #calcACC(model,X=X,y=y2,name=name)

            Scores = cross_validate(model, X2, y2, cv=cv,scoring=myScoreFunc)
            RMSETmp = Scores['test_RMSE'].mean()
            CORRTmP = Scores['test_Correlation coefficient'].mean()
            resultDic.update({name:[RMSETmp,CORRTmP]})
            print(name,RMSETmp,CORRTmP)

        #stacking
        alldata = make_pipeline(extAll())
        # random forest
        #1.1546 0.70905
        stack = StackingRegressor(regressors=[alldata], meta_regressor=rf,verbose=1)

        # Light Gradient boosting
        # 1.160732 0.703776
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm,verbose=1)

        # XGboost
        # 1.1839805 0.689571
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb,verbose=1)

        # Regularized greedily forest
        # 1.17050 0.6992
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf,verbose=1)

        #pls 22.808047774809697 0.6410026452910016 i=4
        for i in np.arange(3,11,1):
            pls = PLSRegression(n_components=i)
            testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls,verbose=0)
            calcACC(testmodel)
        pls = PLSRegression(n_components=4)

        #SVR
        svr = SVR(gamma=9.5367431640625/10000000,C=1559.4918100725592,
                  epsilon=0.0009765625,)
        svr = SVR(kernel='rbf',gamma=9.5367431640625e-07,epsilon=0.0009765625,C=2048.0)

        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
        calcACC(svr)

        #Extratree  1.157420824123527 0.7061010221224269
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
        calcACC(testmodel)

        #k-NN
        nbrs = KNeighborsRegressor(3)

        ##Linear regressions
        #Stochastic Gradient Descent
        sgd = SGDRegressor(max_iter=1000)
        # Ridge
        for i in [1,10,100,1000]:
            ridge = Ridge(alpha=i)
            calcACC(ridge)
        ridge = Ridge(alpha=45.50940042350705)
        calcACC(ridge)
        # multiple linear
        lin = make_pipeline(forlinear(),LinearRegression(n_jobs=-1))
        calcACC(lin)



        #stacking
        #0.69
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all], meta_regressor=rf,verbose=1)
        #1.1532 0.70926
        testmodel = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,
                              verbose=1)
        #1.16420 0.7041
        testmodel = StackingRegressor(regressors=[alldata,alldata,all], meta_regressor=rf,verbose=1)
        #1.16379 0.7044
        stack1 = StackingRegressor(regressors=[alldata,nbrs,all,xgb,lgbm,rgf], meta_regressor=rf,verbose=1)
        testmodel  = StackingRegressor(regressors=[alldata,stack1,stack1], meta_regressor=rf,verbose=1)
        #1.1535496740699531 0.7108839199109559
        pcaFeature = make_pipeline(extPCA())
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=rf,verbose=1)
        #1.181801005432221 0.6889745579620922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf]
                                      ,meta_regressor=lgbm,verbose=1)
        #0.70613
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=xgb,verbose=1)
        #0.71641717
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)
        #0.7146922
        testmodel = StackingRegressor(regressors=[pcaFeature,alldata,nbrs,ridge,rf,xgb,lgbm,rgf,ext]
                                      ,meta_regressor=rf,verbose=1)

        #new features
        pcaFeature = make_pipeline(extPCA())

        #old
        pipe1 = make_pipeline(extMACCS(), rf)
        pipe2 = make_pipeline(extMorgan(), rf)
        pipe3 = make_pipeline(extDescriptor(), rf)

        pipe4 = make_pipeline(extPCA(), rgf)
        pipe7 =make_pipeline(extDescriptor(), rgf)
        pipe8 =make_pipeline(extDescriptor(), rgf)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')

        pls = PLSRegression(n_components=4)

        extMACCSdata = make_pipeline(extMACCS())

        nbrsPipe = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())
        ave = extAverage()
        withoutdesc =  make_pipeline(extMACCS())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
        #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)

        #0.70
        stack = StackingRegressor(regressors=[pipe1,pipe2,pipe3,xgb,lgbm,rgf,rf], meta_regressor=ave, verbose=1)

        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)

        #0.69######################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        #0.70
        stack2 = StackingRegressor(regressors=[stack1,alldata,rgf,lgbm,xgb], meta_regressor=rf,verbose=1)

        #0.71
        stack3 = StackingRegressor(regressors=[stack2,pipe1], meta_regressor=ave, verbose=1)
        ###########################
        ###########################
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,withoutdesc,lgbm,rgf], meta_regressor=rf,verbose=1)
        stack3 = StackingRegressor(regressors=[stack2,pipe1,xgb], meta_regressor=ave, verbose=1)
        ###########################

        #stackingwithknn
        stack1 = StackingRegressor(regressors=[pipe1,pipe2,pipe3], meta_regressor=rf, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,nbrs,pipe1], meta_regressor=rf, verbose=1)


        #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)

        cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        cv = KFold(n_splits=10, shuffle=True, random_state=0)
        St1Scores = cross_validate(stack1,X,y,cv=cv)
        St1Scores['test_score'].mean()**(1/2)

        St2Scores = cross_validate(stack2,X,y,cv=cv)
        St2Scores['test_score'].mean()**(1/2)

        St3Scores = cross_validate(stack3,X,y,cv=cv)
        St3Scores['test_score'].mean()**(1/2)

        stackScore = cross_validate(stack, X, y, cv=cv)
        stackScore['test_score'].mean()**(1/2)

        lgbmScores =cross_validate(lgbm,X,y,cv=cv)
        lgbmScores['test_score'].mean()**(1/2)

        rgfScores = cross_validate(rgf,X,y,cv=cv)
        rgfScores['test_score'].mean()**(1/2)

        RFScores = cross_validate(rf,X,y,cv=cv)
        RFScores['test_score'].mean()**(1/2)

        scores = cross_validate(stack2,X,y,cv=cv)
        scores['test_score'].mean()**(1/2)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))

        stack3.fit(X, y)
        y_pred = stack3.predict(X_train)
        y_val = stack3.predict(X_test)
        #stack3.score(X_train, y_train)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(stack3.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        stack1.fit(X, y)
        valy =  (10 **(stack1.predict(exX))).tolist()

        sgd.fit(X,y)
        valy =  (10 **(sgd.predict(exX))).tolist()

        rgfpipe = make_pipeline(extMACCS(), rf)
        rgf.fit(X,y)
        valy =  (10 **(rgf.predict(exX))).tolist()

        nbrs.fit(X,y)
        valy =  (10 **(nbrs.predict(exX))).tolist()

        pipe = make_pipeline(extMACCS(), rf)
        pipe.fit(X,y)
        valy =  (10 **(pipe.predict(exX))).tolist()


        rf.fit(X, y)
        y_pred = rf.predict(X_train)
        y_val = rf.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(rf.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        lgbm.fit(X, y)
        y_pred = lgbm.predict(X_train)
        y_val = lgbm.predict(X_test)
        exX = preprocess(extractDf, changeList)
        valy =  (10 **(lgbm.predict(exX))).tolist()
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
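The method above tries many stacking combinations; the constructor signature (regressors, meta_regressor, verbose) looks like mlxtend's StackingRegressor. A minimal, self-contained sketch of that pattern with an RGF meta-regressor, mirroring the meta_regressor=rgf variant above, on synthetic data and with placeholder hyperparameters:

from mlxtend.regressor import StackingRegressor
from rgf.sklearn import RGFRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_validate

X, y = make_regression(n_samples=200, n_features=20, noise=1.0, random_state=0)
base_rf = RandomForestRegressor(n_estimators=50, random_state=0)
meta_rgf = RGFRegressor(max_leaf=500, algorithm="RGF", test_interval=100, loss="LS")
stack = StackingRegressor(regressors=[base_rf], meta_regressor=meta_rgf, verbose=0)

# 10-fold CV, default scoring is the estimator's R^2
cv = KFold(n_splits=10, shuffle=True, random_state=0)
scores = cross_validate(stack, X, y, cv=cv)
print('mean R^2: %.3f' % scores['test_score'].mean())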
Example No. 32
0
    def metalPredictotherML(self):
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=134,min_samples_split=9,
            max_features=0.33
        )
        xg=xgboost.XGBRegressor(
            n_estimators=9196,
            max_depth=497,
            gamma=0.0,
            colsample_bytree= 1
        )
        xg=xgboost.XGBRegressor(
            n_estimators=7534,
            max_depth=1000,
            gamma=2.0,
            colsample_bytree= 0.1
        )
        lgbm = LGBMRegressor(
            boosting_type='gbdt',
            num_leaves=662,
            feature_fraction= 0.36875625810601625,
            bagging_fraction=0.39668072810414723,
            learning_rate=0.06,
            min_data_in_leaf=35,
            max_depth=27
        )

        rgf = RGFRegressor(
            max_leaf=1998,
            algorithm="RGF",
            test_interval=100,
            loss="LS",
            verbose=False,
            l2=0.187,
            min_samples_leaf=1
        )
        from sklearn import linear_model
        clf = linear_model.LogisticRegression(random_state=0)
        ml = LinearRegression(n_jobs=-1)
        # candidate models: rf, xg, lgbm, rgf, ml (evaluated in the loop below)

        #test
        #RMSE 10**1.048481123342563
        #Coor 0.6585531363438192
        #train
        #RMSE 10**0.7063184454700483
        #Coor 0.8793733707424819
        rf = RandomForestRegressor()
        #test
        #RMSE 10**1.109618572731106
        #Coor 0.6198791156669657
        #train
        #0.5647634395040055
        #0.9179571547567322

        os.chdir(r'G:\マイドライブ\Data\tox_predict')
        df = pd.read_csv('metalMACCS.csv').set_index('CAS')
        dftemp = df.drop(['daphnia_tox','Algae_tox'], axis=1)
        dftemp = dftemp.dropna()
        y = np.log10(dftemp['fish_tox'])
        X = dftemp.iloc[:,0:-2]
        for i in [rf,xg,lgbm,rgf,ml]:
            calcACC(i, X=X, name=i, y=y)
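calcACC is not defined in this example. A minimal sketch of a cross-validated scorer in the same spirit, assuming it reports RMSE and the Pearson correlation coefficient; the helper name calc_acc_sketch is illustrative, not the project's own calcACC:

import numpy as np
from scipy.stats import pearsonr
from sklearn.model_selection import KFold, cross_val_predict

def calc_acc_sketch(model, X, y, name=None):
    # 10-fold out-of-fold predictions, then RMSE and Pearson correlation
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
    y = np.asarray(y, dtype=float)
    rmse = np.sqrt(np.mean((y - pred) ** 2))
    corr = pearsonr(y, pred)[0]
    print(name, 'RMSE: %.4f, Corr: %.4f' % (rmse, corr))
    return rmse, corr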
Example No. 33
0
# Building the model dictionary

models_reg = {}
models_reg["LinearR"] = LinearRegression()
models_reg["Lasso"] = Lasso(random_state=seed_reg)
models_reg["LassoLarsIC"] = LassoLarsIC()
models_reg["Ridge"] = Ridge(random_state=seed_reg)
models_reg["KernelRidge"] = KernelRidge()
models_reg["BayesianRidge"] = BayesianRidge()
models_reg["ElasticNet"] = ElasticNet(random_state=seed_reg)
models_reg["KNN"] = KNeighborsRegressor()
models_reg["SVR"] = SVR()
models_reg["DecisionTree"] = DecisionTreeRegressor(random_state=seed_reg)
models_reg["ExtraTrees"] = ExtraTreesRegressor(random_state=seed_reg)
models_reg["Earth"] = Earth()
models_reg["RGFRegressor"] = RGFRegressor()
#models_reg["FastRGFRegressor"] = FastRGFRegressor()
models_reg["RandomForest"] = RandomForestRegressor(random_state=seed_reg)
models_reg["AdaBoost"] = AdaBoostRegressor(random_state=seed_reg)
models_reg["GradientBoost"] = GradientBoostingRegressor(random_state=seed_reg)
models_reg["XGBoost"] = XGBRegressor(random_state=seed_reg)
models_reg["LightGBM"] = LGBMRegressor(random_state=seed_reg)
models_reg["CatBoost"] = CatBoostRegressor(random_state=seed_reg)
models_reg["MLPRegressor"] = MLPRegressor(random_state=seed_reg)

#TESTE03-1: Training and testing the models with cross_val_score + 10-fold KFold(shuffle=False) + R2 score - X_reg, Y_reg

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score
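A sketch of the evaluation loop that the TESTE03-1 comment describes, assuming X_reg and Y_reg are the feature matrix and target prepared earlier in the notebook:

kfold_reg = KFold(n_splits=10, shuffle=False)
for name, model in models_reg.items():
    scores = cross_val_score(model, X_reg, Y_reg, cv=kfold_reg, scoring="r2")
    print("%s: R2 = %.4f (+/- %.4f)" % (name, scores.mean(), scores.std()))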
Example No. 34
0
if __name__ == '__main__':
    models=[ 
            ######## First level ########
            [
                #AdaBoostRegressor(RandomForestRegressor(n_estimators=100,criterion='mse'),n_estimators=8, learning_rate= 0.05),
                #LinearRegression(fit_intercept=True, normalize= True),
                #Ridge(alpha=0.1, fit_intercept=True, normalize=False),
                #BaggingRegressor(RandomForestRegressor(n_estimators=100, criterion="mae"), oob_score= False, n_estimators= 20, max_samples= 0.5, max_features= 0.7),
                #SVR(max_iter=-1, degree=5, kernel='rbf'),
                #KNeighborsRegressor(n_neighbors=5),
                #MLPRegressor(early_stopping=True, hidden_layer_sizes=(2), learning_rate_init= 0.01, max_iter= 1000),
                XGBRegressor(n_estimators=100, criterion="mae", max_depth=12, subsample=0.5, learning_rate=0.05, colsample_bytree=0.9),
                #ExtraTreesRegressor(n_estimators=120, criterion="mae", max_depth=10, max_features=0.5, random_state=1),
                GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=50, max_features=0.5, random_state=1),
                RandomForestRegressor(n_estimators=100, criterion="mae", random_state=1),
                RGFRegressor(max_leaf=4500, algorithm="RGF_Sib", test_interval=50, loss="LS", verbose=False),
            ],
            ######## Second level ########
            [
            #    RandomForestRegressor(n_estimators=100, criterion="mae", random_state=1),
                RandomForestRegressor(n_estimators=100, criterion="mse",max_depth=10,min_samples_split=9,min_samples_leaf=1,min_weight_fraction_leaf=0,max_leaf_nodes=None,min_impurity_decrease=0.0005,oob_score=True)
                # RGFRegressor(max_leaf=4500, algorithm="RGF_Sib", test_interval=50, loss="LS", verbose=False),
            #    ExtraTreesRegressor(n_estimators=100, criterion="mae", random_state=1)
            ],
            #[
            #    RandomForestRegressor(n_estimators=100, criterion="mse",max_depth=10,min_samples_split=9,min_samples_leaf=1,min_weight_fraction_leaf=0,max_leaf_nodes=None,min_impurity_decrease=0.0005,oob_score=True),
            #]
        ]

    model = StackNetRegressor(models, metric="mae", folds=5, restacking=True, use_retraining=False, random_state=12345, verbose=1)
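    # The original snippet stops after building the StackNet. A minimal continuation
    # on synthetic data, assuming pystacknet's scikit-learn-style fit/predict
    # interface; swap in the real feature matrices when available.
    from sklearn.datasets import make_regression

    X_train, y_train = make_regression(n_samples=300, n_features=20, noise=1.0, random_state=0)
    X_test, _ = make_regression(n_samples=60, n_features=20, noise=1.0, random_state=1)

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(preds[:5])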
Example No. 35
0
    def metalPredict(self):
        rf = RandomForestRegressor(
            max_depth=20, random_state=0,
            n_estimators=134,min_samples_split=9,
            max_features=0.33
        )
        xg=xgboost.XGBRegressor(
            n_estimators=100,
            max_depth=7,
            gamma=0.0,
            colsample_bytree=1,
        )

        lgbm = LGBMRegressor(
            boosting_type='gbdt',
            num_leaves=662,
            feature_fraction= 0.36875625810601625,
            bagging_fraction=0.39668072810414723,
            learning_rate=0.06,
            min_data_in_leaf=35,
            max_depth=27
        )

        rgf = RGFRegressor(
            max_leaf=1998,
            algorithm="RGF",
            test_interval=100,
            loss="LS",
            verbose=False,
            l2=0.187,
            min_samples_leaf=1
        )

        os.chdir(r'G:\マイドライブ\Data\tox_predict')
        #test
        #RMSE 10**1.048481123342563
        #Coor 0.6585531363438192
        #train
        #RMSE 10**0.7063184454700483
        #Coor 0.8793733707424819
        rf = RandomForestRegressor()
        #test
        #RMSE 10**1.109618572731106
        #Coor 0.6198791156669657
        #train
        #0.5647634395040055
        #0.9179571547567322


        df1 = pd.read_csv('metalMACCS.csv').set_index('CAS')
        df2 = pd.read_csv('metalECFP4.csv').set_index('CAS')
        for i, df in enumerate([df1, df2]):
            names = ['fish_tox','daphnia_tox','Algae_tox']
            if i == 0:
                print('MACCS')
            else:
                print('ECFP4')
            for name in names:
                if name == 'fish_tox':
                    dftemp = df.drop(['daphnia_tox','Algae_tox'], axis=1)
                elif name== 'daphnia_tox':
                    dftemp = df.drop(['fish_tox','Algae_tox'], axis=1)
                elif name== 'Algae_tox':
                    dftemp = df.drop(['fish_tox','daphnia_tox'], axis=1)
                print(name)
                dftemp = dftemp.dropna()
                y = np.log10(dftemp[name])
                X = dftemp.iloc[:,0:-1]
                calcACC(rf, X=X, name=None, y=y)