Ejemplo n.º 1
0
    def fit(self):
        """
        perform model fitting        
        """

        # initialize
        y_vals = np.zeros((self.train_df.shape[0], ))
        if self.task == "multiclass":
            n_class = len(np.unique(self.train_df[self.target].values))
            oof_pred = np.zeros((self.train_df.shape[0], n_class))
            y_pred = np.zeros((self.test_df.shape[0], n_class))
        else:
            oof_pred = np.zeros((self.train_df.shape[0], ))
            y_pred = np.zeros((self.test_df.shape[0], ))

        # group does not kick in when group k fold is used
        if self.group is not None:
            if self.group in self.features:
                self.features.remove(self.group)
            if self.group in self.categoricals:
                self.categoricals.remove(self.group)
        fi = np.zeros((self.n_splits, len(self.features)))

        # target encoding
        numerical_features = [f for f in self.features if f not in self.categoricals]
        if self.target_encoding:            
            # perform target encoding
            k = 0
            f = 1
            overall_mean = self.train_df[self.target].mean()
            for c in self.categoricals:
                data_tmp = pd.DataFrame({c: self.train_df[c].values, 'target': self.train_df[self.target].values})
                tmp = np.nan * np.ones(self.train_df.shape[0])
                
                cv = self.get_cv()
                for fold, (train_idx, val_idx) in enumerate(cv):
                    # target mean
                    target_mean = data_tmp.iloc[train_idx].groupby(c)['target'].mean().reset_index() 
                    
                    # smoothing
                    target_count = data_tmp.iloc[train_idx].groupby(c)['target'].count().reset_index() 
                    target_count['target'] = target_count['target'].apply(lambda x : 1 / (1 + np.exp((-x -k) / f))
                    target_mean = L * target_mean + (1 - L) * overall_mean

                    # allocate
                    tmp[val_idx] = self.train_df[c].iloc[val_idx].map(target_mean).values
                self.train_df[c] = tmp
                
                # replace categorical variable in test
                target_mean = data_tmp.groupby(c)['target'].mean()
                self.test_df.loc[:, c] = self.test_df[c].map(target_mean).values
            
            # no categoricals any more
            numerical_features = self.features.copy()
            self.categoricals = []
        
        # fill nan
        if self.model not in ['lgb', 'catb', 'xgb']:
            # fill NaN (numerical features -> median, categorical features -> mode)
            self.train_df[numerical_features] = self.train_df[numerical_features].replace([np.inf, -np.inf], np.nan)
            self.test_df[numerical_features] = self.test_df[numerical_features].replace([np.inf, -np.inf], np.nan)
            self.train_df[numerical_features] = self.train_df[numerical_features].fillna(self.train_df[numerical_features].median())
            self.test_df[numerical_features] = self.test_df[numerical_features].fillna(self.test_df[numerical_features].median())
            self.train_df[self.categoricals] = self.train_df[self.categoricals].fillna(self.train_df[self.categoricals].mode().iloc[0])
            self.test_df[self.categoricals] = self.test_df[self.categoricals].fillna(self.test_df[self.categoricals].mode().iloc[0])
      
        # scaling, if necessary
        if self.scaler is not None:
            # to normal
            pt = QuantileTransformer(n_quantiles=100, random_state=self.seed, output_distribution="normal")
            self.train_df[numerical_features] = pt.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = pt.transform(self.test_df[numerical_features])

            # starndardize
            if self.scaler == "MinMax":
                scaler = MinMaxScaler()
            elif self.scaler == "Standard":
                scaler = StandardScaler()
            self.train_df[numerical_features] = scaler.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = scaler.transform(self.test_df[numerical_features])

            x_test = self.test_df.copy()
            if self.model == "nn":
                x_test = [np.absolute(x_test[i]) for i in self.categoricals] + [x_test[numerical_features]]
            else:
                x_test = x_test[self.features]
        else:
            x_test = self.test_df[self.features]
        
        # fitting with out of fold
        cv = self.get_cv()
        for fold, (train_idx, val_idx) in enumerate(cv):
            # train test split
            x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
            y_train, y_val = self.train_df[self.target].iloc[train_idx], self.train_df[self.target].iloc[val_idx]

            if self.model == "nn":
                x_train = [np.absolute(x_train[i]) for i in self.categoricals] + [x_train[numerical_features]]
                x_val = [np.absolute(x_val[i]) for i in self.categoricals] + [x_val[numerical_features]]

            # model fitting
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model, importance = self.train_model(train_set, val_set)
            fi[fold, :] = importance
            y_vals[val_idx] = y_val

            # predictions and check cv score
            oofs, ypred = get_oof_ypred(model, x_val, x_test, self.model, self.task)
            y_pred += ypred.reshape(y_pred.shape) / self.n_splits
            if self.task == "multiclass":
                oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape)
                print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], 
                    np.argmax(oof_pred[val_idx, :], axis=1))))
            else:
                oof_pred[val_idx] = oofs.reshape(oof_pred[val_idx].shape)
                print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], 
                    oof_pred[val_idx])))

        # feature importance data frame
        fi_df = pd.DataFrame()
        for n in np.arange(self.n_splits):
            tmp = pd.DataFrame()
            tmp["features"] = self.features
            tmp["importance"] = fi[n, :]
            tmp["fold"] = n
            fi_df = pd.concat([fi_df, tmp], ignore_index=True)
        gfi = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index()
        fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean'))

        # outputs
        if self.task == "multiclass":
            loss_score = self.calc_metric(y_vals, np.argmax(oof_pred, axis=1))
        else:
            loss_score = self.calc_metric(y_vals, oof_pred)

        if self.verbose:
            print('Our oof loss score is: ', loss_score)
        return y_pred, loss_score, model, oof_pred, y_vals, fi_df

    def plot_feature_importance(self, rank_range=[1, 50]):
        """
        function for plotting feature importance (nothing is returned when the model is NN)

        :EXAMPLE:
        # fit LGB regression model
        model = RunModel(train_df, test_df, target, features, categoricals=categoricals,
                model="lgb", task="regression", n_splits=4, cv_method="KFold", 
                group=None, seed=1220, scaler=None)
        
        # plot 
        fi_df = model.plot_feature_importance(rank_range=[1, 100])
        
        """
        # plot feature importance
        _, ax = plt.subplots(1, 1, figsize=(10, 20))
        sorted_df = self.fi_df.sort_values(by = "importance_mean", ascending=False).reset_index().iloc[self.n_splits * (rank_range[0]-1) : self.n_splits * rank_range[1]]
        sns.barplot(data=sorted_df, x ="importance", y ="features", orient='h')
        ax.set_xlabel("feature importance")
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        return sorted_df
Ejemplo n.º 2
0
    def fit(self):
        """
        perform model fitting        
        """

        # initialize
        y_vals = np.zeros((self.train_df.shape[0], ))
        if self.task == "multiclass":
            n_class = len(np.unique(self.train_df[self.target].values))
            oof_pred = np.zeros((self.train_df.shape[0], n_class))
            y_pred = np.zeros((self.test_df.shape[0], n_class))
        else:
            oof_pred = np.zeros((self.train_df.shape[0], ))
            y_pred = np.zeros((self.test_df.shape[0], ))

        # group does not kick in when group k fold is used
        if self.group is not None:
            if self.group in self.features:
                self.features.remove(self.group)
            if self.group in self.categoricals:
                self.categoricals.remove(self.group)
        fi = np.zeros((self.n_splits, len(self.features)))

        # target encoding
        numerical_features = [f for f in self.features if f not in self.categoricals]
        if self.target_encoding:            
            # perform target encoding
            overall_mean = self.train_df[self.target].mean()
            for c in self.categoricals:
                data_tmp = pd.DataFrame({c: self.train_df[c].values, 'target': self.train_df[self.target].values})
                tmp = np.nan * np.ones(self.train_df.shape[0])
                
                cv = self.get_cv()
                for fold, (train_idx, val_idx) in enumerate(cv):
                    # target mean
                    target_mean = data_tmp.iloc[train_idx].groupby(c)['target'].mean() 
                    
                    # smoothing
                    target_count = data_tmp.iloc[train_idx].groupby(c)['target'].count() 
                    n_sigmoid = (1 / 1 + np.exp(-))
                    target_mean = L * target_mean + (1 - L) * overall_mean

                    # allocate
                    tmp[val_idx] = self.train_df[c].iloc[val_idx].map(target_mean).values
                self.train_df[c] = tmp
                
                # replace categorical variable in test
                target_mean = data_tmp.groupby(c)['target'].mean()
                self.test_df.loc[:, c] = self.test_df[c].map(target_mean).values
            
            # no categoricals any more
            numerical_features = self.features.copy()
            self.categoricals = []
        
        # fill nan
        if self.model not in ['lgb', 'catb', 'xgb']:
            # fill NaN (numerical features -> median, categorical features -> mode)
            self.train_df[numerical_features] = self.train_df[numerical_features].replace([np.inf, -np.inf], np.nan)
            self.test_df[numerical_features] = self.test_df[numerical_features].replace([np.inf, -np.inf], np.nan)
            self.train_df[numerical_features] = self.train_df[numerical_features].fillna(self.train_df[numerical_features].median())
            self.test_df[numerical_features] = self.test_df[numerical_features].fillna(self.test_df[numerical_features].median())
            self.train_df[self.categoricals] = self.train_df[self.categoricals].fillna(self.train_df[self.categoricals].mode().iloc[0])
            self.test_df[self.categoricals] = self.test_df[self.categoricals].fillna(self.test_df[self.categoricals].mode().iloc[0])
      
        # scaling, if necessary
        if self.scaler is not None:
            # to normal
            pt = QuantileTransformer(n_quantiles=100, random_state=self.seed, output_distribution="normal")
            self.train_df[numerical_features] = pt.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = pt.transform(self.test_df[numerical_features])

            # starndardize
            if self.scaler == "MinMax":
                scaler = MinMaxScaler()
            elif self.scaler == "Standard":
                scaler = StandardScaler()
            self.train_df[numerical_features] = scaler.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = scaler.transform(self.test_df[numerical_features])

            x_test = self.test_df.copy()
            if self.model == "nn":
                x_test = [np.absolute(x_test[i]) for i in self.categoricals] + [x_test[numerical_features]]
            else:
                x_test = x_test[self.features]
        else:
            x_test = self.test_df[self.features]
        
        # fitting with out of fold
        cv = self.get_cv()
        for fold, (train_idx, val_idx) in enumerate(cv):
            # train test split
            x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
            y_train, y_val = self.train_df[self.target].iloc[train_idx], self.train_df[self.target].iloc[val_idx]

            if self.model == "nn":
                x_train = [np.absolute(x_train[i]) for i in self.categoricals] + [x_train[numerical_features]]
                x_val = [np.absolute(x_val[i]) for i in self.categoricals] + [x_val[numerical_features]]

            # model fitting
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model, importance = self.train_model(train_set, val_set)
            fi[fold, :] = importance
            y_vals[val_idx] = y_val

            # predictions and check cv score
            oofs, ypred = get_oof_ypred(model, x_val, x_test, self.model, self.task)
            y_pred += ypred.reshape(y_pred.shape) / self.n_splits
            if self.task == "multiclass":
                oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape)
                print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], 
                    np.argmax(oof_pred[val_idx, :], axis=1))))
            else:
                oof_pred[val_idx] = oofs.reshape(oof_pred[val_idx].shape)
                print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], 
                    oof_pred[val_idx])))

        # feature importance data frame
        fi_df = pd.DataFrame()
        for n in np.arange(self.n_splits):
            tmp = pd.DataFrame()
            tmp["features"] = self.features
            tmp["importance"] = fi[n, :]
            tmp["fold"] = n
            fi_df = pd.concat([fi_df, tmp], ignore_index=True)
        gfi = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index()
        fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean'))

        # outputs
        if self.task == "multiclass":
            loss_score = self.calc_metric(y_vals, np.argmax(oof_pred, axis=1))
        else:
            loss_score = self.calc_metric(y_vals, oof_pred)

        if self.verbose:
            print('Our oof loss score is: ', loss_score)
        return y_pred, loss_score, model, oof_pred, y_vals, fi_df