Example #1
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n))

    yield

    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))
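The `yield` indicates this variant is meant to be used as a context manager. A minimal usage sketch, assuming the function is wrapped with `contextlib.contextmanager` and that `forest` is the `sklearn.ensemble.forest` module (scikit-learn < 0.22, where `_generate_sample_indices(random_state, n_samples)` is the two-argument private helper being patched); the data here is synthetic:

import numpy as np
from contextlib import contextmanager
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import forest          # scikit-learn < 0.22 only

X = np.random.rand(50_000, 10)
y = X.sum(axis=1)

# Wrapping set_rf_samples from Example #1 turns the generator into a context
# manager, so the default bootstrap is restored automatically on exit.
with contextmanager(set_rf_samples)(5_000):
    m = RandomForestRegressor(n_estimators=20, n_jobs=-1, oob_score=False)
    m.fit(X, y)                              # each tree is fit on 5,000 rows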
Example #2
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).choice(
            n_samples, n, replace=False))
Example #3
def jeremy_trick_RF_sample_size(n):
    # Jeremy's trick. The commented-out named-function version below would not
    # work as written because it never returns the sampled indices:
    # def batch_size_for_node_splitting(rs, n_samples):
    #     forest.check_random_state(rs).randint(0, n_samples, 20000)
    # forest._generate_sample_indices = batch_size_for_node_splitting
    forest._generate_sample_indices = \
        (lambda rs, n_samples: forest.check_random_state(rs).randint(0, n_samples, n))
Example #4
def set_rf_samples(n):
    """ Changes Scikit-learn's random forests to give each tree a random sample of
    n random rows. 
    Set oob_error in RandomForestRegressor to False if using this. 
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))
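These snippets assume `from sklearn.ensemble import forest` (scikit-learn < 0.22; later releases rename the module to `sklearn.ensemble._forest` and change the sampler's signature, so the patch no longer applies). A minimal usage sketch on synthetic data, pairing set_rf_samples with reset_rf_samples from Example #9:

import numpy as np
from sklearn.ensemble import forest          # scikit-learn < 0.22 only
from sklearn.ensemble import RandomForestRegressor

X = np.random.rand(200_000, 10)
y = X.sum(axis=1)

set_rf_samples(20_000)                       # each tree now sees 20,000 rows
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=False)
m.fit(X, y)

reset_rf_samples()                           # restore the default bootstrap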
Example #5
def jeremy_trick_RF_sample_size(n):
    # Jeremy's trick. The commented-out named-function version below would not
    # work as written because it never returns the sampled indices:
    # def batch_size_for_node_splitting(rs, n_samples):
    #     forest.check_random_state(rs).randint(0, n_samples, 20000)
    # forest._generate_sample_indices = batch_size_for_node_splitting
    forest._generate_sample_indices = \
        (lambda rs, n_samples: forest.check_random_state(rs).randint(0, n_samples, n))
Example #6
def set_random_forest_sample_size(nobs):
    """
    Override default behavior of scikit-learn random forest models to
    sample the same number of rows from the original data with replacement.
    Instead, this forces random forest to fit each tree on a sample of size
    nobs, greatly speeding up the algorithm on large datasets.
    """
    forest.generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, nobs))
Example #7
def reset_random_forest_sample_size():
    """
    Restore the default behavior of scikit-learn random forest models to
    sample the same number of rows from the original data with replacement.
    This should be used if `set_random_forest_sample_size()` had been 
    previously run. 
    """
    forest.generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))
Example #8
def _generate_sample_indices_Peter(random_state, n_samples, n_bootstrap=None, y=None):
    """Private function used by _parallel_build_trees.

    Requires `numpy as np`, `random`, and `check_random_state` from
    sklearn.utils.
    """
    random_instance = check_random_state(random_state)
    if n_bootstrap is None:
        # Default behaviour: bootstrap n_samples rows with replacement.
        sample_indices = random_instance.randint(0, n_samples, n_samples)
    else:
        if y is None:
            # Draw a smaller bootstrap of n_bootstrap rows with replacement.
            sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
        else:
            # Class-balanced draw: take n_bootstrap // 2 rows from each class,
            # without replacement, then shuffle the combined indices.
            sample_indices_y_0 = np.where(y == 0)[0]
            sample_indices_y_1 = np.where(y != 0)[0]
            xxx = list(sample_indices_y_0)
            yyy = list(sample_indices_y_1)
            random.shuffle(xxx)
            random.shuffle(yyy)
            # Integer division: slice bounds must be ints in Python 3.
            sample_indices_y_0 = np.array(xxx)[0:(n_bootstrap // 2)]
            sample_indices_y_1 = np.array(yyy)[0:(n_bootstrap // 2)]

            sample_indices = np.array(list(sample_indices_y_0) + list(sample_indices_y_1))
            random.shuffle(sample_indices)
    return sample_indices
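A quick, self-contained check of the class-balanced branch (assuming the imports listed in the docstring above); with n_bootstrap=40 it draws 20 indices from each class, so the sampled labels are split exactly 50/50:

import random
import numpy as np
from sklearn.utils import check_random_state

y_toy = np.array([0] * 80 + [1] * 20)
idx = _generate_sample_indices_Peter(0, len(y_toy), n_bootstrap=40, y=y_toy)

print(len(idx))                      # 40
print(float(np.mean(y_toy[idx])))    # 0.5 -- half the draws come from each class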
Example #9
def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))
Example #10
def jeremy_trick_reset_RF_sample_size():
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))
Example #11
def reset_rf_samples():
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))
Example #12
def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))
Example #13
def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))
Example #14
def jeremy_trick_reset_RF_sample_size():
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))
Example #15
def rf_reg_crossval(df, y, df_ts, n_months, n_days, n_est, min_sam_leaf):
    """Sequential, date-based cross-validation for a RandomForestRegressor.

    Assumes pandas as pd, numpy as np, relativedelta from dateutil, the
    sklearn.ensemble `forest` module, and the project helpers `split_vals`
    and `prediction_score` are available.
    """

    n_folds = int(n_months * 30.0 / n_days)
    errors = []
    fis = []
    # First date available in the training set (train.csv)
    first_dt_train = df.date.min()

    # Set the random seed
    np.random.seed(9001)

    # Sequentially split the available training data into train and validation sets
    for i in range(1, n_folds):
        print('*____________________________________*')
        print('running fold # ', i, ' of ', n_folds)

        # Patch the bootstrap so that each tree is fit on 1,000,000 sampled rows
        forest._generate_sample_indices = (
            lambda rs, n_samples: forest.check_random_state(rs).randint(
                0, n_samples, 1000000))

        # Initialize random forest
        m_kcv = RandomForestRegressor(n_estimators=n_est,
                                      max_features=0.5,
                                      min_samples_leaf=min_sam_leaf,
                                      n_jobs=-1,
                                      oob_score=False)

        # Getting dates for training and validation sets
        train_sub_startdt = first_dt_train + relativedelta(days=i * n_days)
        valid_sub_startdt = first_dt_train + relativedelta(days=(i + 1) *
                                                           n_days)

        # Create indices for training and validation sets
        index_train = sorted(
            df.index[df['date'] == train_sub_startdt].tolist())[0]
        index_valid = sorted(
            df.index[df['date'] == valid_sub_startdt].tolist())[0]

        # Create subsetted dataframes of X,y, and w for training and validation sets
        X_train, X_valid = split_vals(df.loc[:, df.columns != 'date'],
                                      index_train, index_valid)
        y_train, y_valid = split_vals(y, index_train, index_valid)

        print('X train shape:      ', X_train.shape)
        print('Y train shape:      ', y_train.shape)
        print('X validation shape: ', X_valid.shape)
        print('Y validation shape: ', y_valid.shape)

        # Compute arrays of item score weights for the items in the validation
        # set (for which we will make predictions)
        item_weight_train = 1 + X_train['perishable'] * 0.25
        item_weight_valid = 1 + X_valid['perishable'] * 0.25

        # Convert to a float32 array up front to speed up model fitting
        X_train = np.array(X_train, dtype=np.float32)

        # Fit the random forest model on training set w/ cross validation
        m_kcv.fit(X_train, y_train)

        # Print the NWRMSLE score and R-squared values for training and validation sets
        train_score = prediction_score(m_kcv,
                                       X_train,
                                       y_train,
                                       item_weight_train,
                                       plot_pre=False)
        val_score = prediction_score(m_kcv,
                                     X_valid,
                                     y_valid,
                                     item_weight_valid,
                                     plot_pre=False)
        print('For training set:   [nwrmsle, rsquared]: ', train_score)
        print('For validation set: [nwrmsle, rsquared]: ', val_score)
        print('\n')

        # Add errors to the errors list
        errors.append([train_score, val_score])

        # Feature importance
        fi = pd.DataFrame({
            'col_names': df.loc[:, df.columns != 'date'].columns,
            'feature_imp': m_kcv.feature_importances_
        }).sort_values('feature_imp', ascending=False)
        print(fi[:10])
        fis.append(fi[:10])

        # Reducing number of features
        to_keep = fi[fi.feature_imp > 0.005].col_names
        # we need to keep perishable for scoring
        df_keep = df[to_keep].copy()

        # Training on training set again
        # Create subsetted dataframes of X,y, and w for training and validation sets
        X_train, X_valid = split_vals(
            df_keep.loc[:, df_keep.columns != 'date'], index_train,
            index_valid)
        y_train, y_valid = split_vals(y, index_train, index_valid)

        print('\nTraining again on selected features')
        print('X train shape:      ', X_train.shape)
        print('Y train shape:      ', y_train.shape)
        print('X validation shape: ', X_valid.shape)
        print('Y validation shape: ', y_valid.shape)

        # Compute arrays of item score weights for the items in the validation set (for which we will make predictions)
        item_weight_train = 1 + X_train['perishable'] * 0.25
        item_weight_valid = 1 + X_valid['perishable'] * 0.25

        # Convert to a float32 array up front to speed up model fitting
        X_train = np.array(X_train, dtype=np.float32)

        # Fit the random forest model on training set w/ cross validation
        m_kcv.fit(X_train, y_train)

        # Print the NWRMSLE score and R-squared values for training and validation sets
        train_score = prediction_score(m_kcv,
                                       X_train,
                                       y_train,
                                       item_weight_train,
                                       plot_pre=False)
        val_score = prediction_score(m_kcv,
                                     X_valid,
                                     y_valid,
                                     item_weight_valid,
                                     plot_pre=True)
        print(
            'For training set after feature selection  : [nwrmsle, rsquared]: ',
            train_score)
        print(
            'For validation set after feature selection: [nwrmsle, rsquared]: ',
            val_score)

        # Add errors to the errors list
        errors.append([train_score, val_score])

        # Train on entire training set
        # Initialize random forest
        m_kcv_new = RandomForestRegressor(n_estimators=n_est,
                                          max_features=0.5,
                                          min_samples_leaf=min_sam_leaf,
                                          n_jobs=-1,
                                          oob_score=False)

        X_train_new = df_keep.loc[:, df_keep.columns != 'date'][:index_valid]
        y_train_new = y[:index_valid]

        # Compute arrays of item score weights for the items in the training
        # set (for which we will make predictions)
        item_weight_train_new = 1 + X_train_new['perishable'] * 0.25

        # Convert to a float32 array up front to speed up model fitting
        X_train_new = np.array(X_train_new, dtype=np.float32)

        print('\nTraining again on whole training set with selected features')
        print('X train shape:      ', X_train_new.shape)
        print('Y train shape:      ', y_train_new.shape)

        # Fit the random forest model on training set w/ cross validation
        m_kcv_new.fit(X_train_new, y_train_new)

        # Print the NWRMSLE score and R-squared values for training and validation sets
        train_score = prediction_score(m_kcv_new,
                                       X_train_new,
                                       y_train_new,
                                       item_weight_train_new,
                                       plot_pre=False)
        print(
            'For whole training set after feature selection  : [nwrmsle, rsquared]: ',
            train_score)

#        #  test set
#        test_keep     = df_ts[to_keep].copy()
#
#        print('\nTesting on test data')
#        #Predict for test set
#        pred_test_log = m_kcv_new.predict(test_keep)
#        pred_test     = np.round(np.expm1(pred_test_log), decimals=0)
#        output        = pd.concat([test_keep['id'], pd.DataFrame(pred_test)], axis=1)
#        output.columns = ['id', 'unit_sales']
#        name           = 'prediction' + str(i) + '.csv'
#        output.to_csv(name, index=False)

    # Write errors to file
    with open('errors_rf_reg.txt', 'w') as file:
        file.write(str(errors))

    # Write feature importances to file
    with open('feature_importance_rf_reg.txt', 'w') as file:
        file.write(str(fis))
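`split_vals` and `prediction_score` are project helpers that are not shown here. One plausible reconstruction of `split_vals`, modeled on the fast.ai helper of the same name and on how it is called above (rows before index_train as the training slice, rows from index_train up to index_valid as the validation slice); this is an assumption, not the author's actual code:

def split_vals(a, n_trn, n_val):
    # Hypothetical helper: sequential split by positional index.
    return a[:n_trn].copy(), a[n_trn:n_val].copy()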
Example #16
    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers in
            regression).

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        Returns
        -------
        self : object
            Returns self.
        """
        # Validate or convert input data
        X = check_array(X, dtype=DTYPE, accept_sparse="csc")
        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        # Remap output
        n_samples, self.n_features_ = X.shape

        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn("A column-vector y was passed when a 1d array was"
                 " expected. Please change the shape of y to "
                 "(n_samples,), for example using ravel().",
                 DataConversionWarning, stacklevel=2)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        y, expanded_class_weight = self._validate_y_class_weight(y)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Check parameters
        self._validate_estimator()

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        random_state = check_random_state(self.random_state)

        if not self.warm_start:
            # Free allocated memory, if any
            self.estimators_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True'
                             % (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
        else:
            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            trees = []
            for i in range(n_more_estimators):
                tree = self._make_estimator(append=False)
                tree.set_params(random_state=random_state.randint(MAX_INT))
                trees.append(tree)

            # Parallel loop: we use the threading backend as the Cython code
            # for fitting the trees is internally releasing the Python GIL
            # making threading always more efficient than multiprocessing in
            # that case.
            trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                             backend="threading")(
                delayed(_parallel_build_trees_Peter)(
                    t, self, X, y, sample_weight, i, len(trees),
                    verbose=self.verbose, class_weight=self.class_weight, n_bootstrap=self.n_bootstrap)
                for i, t in enumerate(trees))
            # Collect newly grown trees
            self.estimators_.extend(trees)

        if self.oob_score:
            self._set_oob_score(X, y)

        # Decapsulate classes_ attributes
        if hasattr(self, "classes_") and self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
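The warm_start branch above mirrors stock scikit-learn behaviour: refitting with a larger n_estimators grows only the additional trees instead of rebuilding the whole ensemble. A minimal illustration with the unmodified RandomForestRegressor:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

X = np.random.rand(1_000, 5)
y = X.sum(axis=1)

m = RandomForestRegressor(n_estimators=10, warm_start=True, n_jobs=-1)
m.fit(X, y)
print(len(m.estimators_))    # 10

m.n_estimators = 25          # only the 15 new trees are fit on the next call
m.fit(X, y)
print(len(m.estimators_))    # 25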