def set_rf_samples(n): """ Changes Scikit learn's random forests to give each tree a random sample of n random rows. """ forest._generate_sample_indices = ( lambda rs, n_samples: forest.check_random_state(rs).randint( 0, n_samples, n)) yield forest._generate_sample_indices = ( lambda rs, n_samples: forest.check_random_state(rs).randint( 0, n_samples, n_samples))
def set_rf_samples(n): """ Changes Scikit learn's random forests to give each tree a random sample of n random rows. """ forest._generate_sample_indices = ( lambda rs, n_samples: forest.check_random_state(rs).choice( n_samples, n, replace=False))
def jeremy_trick_RF_sample_size(n):
    # Jeremy's trick; hmm.. this won't work as a separate function?
    # def batch_size_for_node_splitting(rs, n_samples):
    #     forest.check_random_state(rs).randint(0, n_samples, 20000)
    # forest._generate_sample_indices = batch_size_for_node_splitting
    forest._generate_sample_indices = \
        (lambda rs, n_samples: forest.check_random_state(rs).randint(0, n_samples, n))

def set_rf_samples(n): """ Changes Scikit-learn's random forests to give each tree a random sample of n random rows. Set oob_error in RandomForestRegressor to False if using this. """ forest._generate_sample_indices = (lambda rs, n_samples: forest.check_random_state(rs).randint(0, n_samples, n))
def set_random_forest_sample_size(nobs):
    """ Overrides the default behavior of scikit-learn random forest models, which
    bootstrap as many rows as the original data (with replacement). Instead, this
    forces the random forest to fit each tree on a sample of size nobs, greatly
    speeding up the algorithm on large datasets.
    """
    # The patched attribute must be the private _generate_sample_indices;
    # scikit-learn never reads a public generate_sample_indices name.
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, nobs))

def reset_random_forest_sample_size():
    """ Restores the default behavior of scikit-learn random forest models, which
    bootstrap as many rows as the original data (with replacement). This should be
    used if `set_random_forest_sample_size()` had been previously run.
    """
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))

import random

import numpy as np
from sklearn.utils import check_random_state


def _generate_sample_indices_Peter(random_state, n_samples, n_bootstrap=None, y=None):
    """Private function used by the _parallel_build_trees function."""
    random_instance = check_random_state(random_state)
    if n_bootstrap is None:
        # Default scikit-learn behavior: bootstrap n_samples rows with replacement.
        sample_indices = random_instance.randint(0, n_samples, n_samples)
    else:
        if y is None:
            # Plain subsample of n_bootstrap rows (with replacement).
            sample_indices = random_instance.randint(0, n_samples, n_bootstrap)
        else:
            # Class-balanced subsample: half the rows from y == 0, half from y != 0.
            sample_indices_y_0 = np.where(y == 0)[0]
            sample_indices_y_1 = np.where(y != 0)[0]
            xxx = list(sample_indices_y_0)
            yyy = list(sample_indices_y_1)
            random.shuffle(xxx)
            random.shuffle(yyy)
            # Integer division so the slice bound is an int under Python 3.
            sample_indices_y_0 = np.array(xxx)[0:(n_bootstrap // 2)]
            sample_indices_y_1 = np.array(yyy)[0:(n_bootstrap // 2)]
            sample_indices = np.array(
                list(sample_indices_y_0) + list(sample_indices_y_1))
            random.shuffle(sample_indices)
    return sample_indices

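# Illustrative check (not part of the original code): with a toy, imbalanced label
# vector, the class-balanced branch returns half of its indices from each class.
import numpy as np

y_toy = np.array([0] * 90 + [1] * 10)
idx = _generate_sample_indices_Peter(random_state=0, n_samples=len(y_toy),
                                     n_bootstrap=20, y=y_toy)
print(len(idx), y_toy[idx].mean())   # -> 20 0.5
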
def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))

def jeremy_trick_reset_RF_sample_size():
    # Companion to jeremy_trick_RF_sample_size: restores the default bootstrap size.
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))

def reset_rf_samples():
    forest._generate_sample_indices = (
        lambda rs, n_samples: forest.check_random_state(rs).randint(
            0, n_samples, n_samples))

def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

def set_rf_samples(n): """ Changes Scikit learn's random forests to give each tree a random sample of n random rows. """ forest._generate_sample_indices = (lambda rs, n_samples: forest.check_random_state(rs).randint(0, n_samples, n))
def jeremy_trick_reset_RF_sample_size():
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

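# Quick sanity check of the patch/reset pair above (illustrative; assumes
# scikit-learn < 0.22 so `from sklearn.ensemble import forest` is importable):
from sklearn.ensemble import forest

jeremy_trick_RF_sample_size(100)
assert len(forest._generate_sample_indices(0, 10_000)) == 100
jeremy_trick_reset_RF_sample_size()
assert len(forest._generate_sample_indices(0, 10_000)) == 10_000
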
def rf_reg_crossval(df, y, df_ts, n_months, n_days, n_est, min_sam_leaf):
    n_folds = int(n_months * 30.0 / n_days)
    errors = []
    fis = []
    first_dt_train = df.date.min()  # First date available in the training set (train.csv)

    # Setting seed
    np.random.seed(9001)

    # Sequentially split available training dataset into train and validation sets
    for i in range(1, n_folds):
        print('*____________________________________*')
        print('running fold # ', i, ' of ', n_folds)

        forest._generate_sample_indices = (
            lambda rs, n_samples: forest.check_random_state(rs).randint(
                0, n_samples, 1000000))

        # Initialize random forest
        m_kcv = RandomForestRegressor(n_estimators=n_est, max_features=0.5,
                                      min_samples_leaf=min_sam_leaf,
                                      n_jobs=-1, oob_score=False)

        # Getting dates for training and validation sets
        train_sub_startdt = first_dt_train + relativedelta(days=i * n_days)
        valid_sub_startdt = first_dt_train + relativedelta(days=(i + 1) * n_days)

        # Create indices for training and validation sets
        index_train = sorted(df.index[df['date'] == train_sub_startdt].tolist())[0]
        index_valid = sorted(df.index[df['date'] == valid_sub_startdt].tolist())[0]

        # Create subsetted dataframes of X, y, and w for training and validation sets
        X_train, X_valid = split_vals(df.loc[:, df.columns != 'date'],
                                      index_train, index_valid)
        y_train, y_valid = split_vals(y, index_train, index_valid)
        print('X train shape: ', X_train.shape)
        print('Y train shape: ', y_train.shape)
        print('X validation shape: ', X_valid.shape)
        print('Y validation shape: ', y_valid.shape)

        # Compute arrays of item score weights for the items in the validation
        # set (for which we will make predictions)
        item_weight_train = 1 + X_train['perishable'] * 0.25
        item_weight_valid = 1 + X_valid['perishable'] * 0.25

        # Optimizing the model fit by converting it to float array outside
        X_train = np.array(X_train, dtype=np.float32)

        # Fit the random forest model on training set w/ cross validation
        m_kcv.fit(X_train, y_train)

        # Print the NWRMSLE score and R-squared values for training and validation sets
        train_score = prediction_score(m_kcv, X_train, y_train,
                                       item_weight_train, plot_pre=False)
        val_score = prediction_score(m_kcv, X_valid, y_valid,
                                     item_weight_valid, plot_pre=False)
        print('For training set: [nwrmsle, rsquared]: ', train_score)
        print('For validation set: [nwrmsle, rsquared]: ', val_score)
        print('\n')

        # Add errors to the errors list
        errors.append([train_score, val_score])

        # Feature importance
        fi = pd.DataFrame({
            'col_names': df.loc[:, df.columns != 'date'].columns,
            'feature_imp': m_kcv.feature_importances_
        }).sort_values('feature_imp', ascending=False)
        print(fi[:10])
        fis.append(fi[:10])

        # Reducing number of features
        to_keep = fi[fi.feature_imp > 0.005].col_names  # we need to keep perishable for scoring
        df_keep = df[to_keep].copy()

        # Training on training set again
        # Create subsetted dataframes of X, y, and w for training and validation sets
        X_train, X_valid = split_vals(df_keep.loc[:, df_keep.columns != 'date'],
                                      index_train, index_valid)
        y_train, y_valid = split_vals(y, index_train, index_valid)
        print('\nTraining again on selected features')
        print('X train shape: ', X_train.shape)
        print('Y train shape: ', y_train.shape)
        print('X validation shape: ', X_valid.shape)
        print('Y validation shape: ', y_valid.shape)

        # Compute arrays of item score weights for the items in the validation set
        # (for which we will make predictions)
        item_weight_train = 1 + X_train['perishable'] * 0.25
        item_weight_valid = 1 + X_valid['perishable'] * 0.25

        # Optimizing the model fit by converting it to float array outside
        X_train = np.array(X_train, dtype=np.float32)

        # Fit the random forest model on training set w/ cross validation
        m_kcv.fit(X_train, y_train)

        # Print the NWRMSLE score and R-squared values for training and validation sets
        train_score = prediction_score(m_kcv, X_train, y_train,
                                       item_weight_train, plot_pre=False)
        val_score = prediction_score(m_kcv, X_valid, y_valid,
                                     item_weight_valid, plot_pre=True)
        print('For training set after feature selection : [nwrmsle, rsquared]: ',
              train_score)
        print('For validation set after feature selection: [nwrmsle, rsquared]: ',
              val_score)

        # Add errors to the errors list
        errors.append([train_score, val_score])

        # Train on entire training set
        # Initialize random forest
        m_kcv_new = RandomForestRegressor(n_estimators=n_est, max_features=0.5,
                                          min_samples_leaf=min_sam_leaf,
                                          n_jobs=-1, oob_score=False)
        X_train_new = df_keep.loc[:, df_keep.columns != 'date'][:index_valid]
        y_train_new = y[:index_valid]

        # Compute arrays of item score weights for the items in the training
        # set (for which we will make predictions)
        item_weight_train_new = 1 + X_train_new['perishable'] * 0.25

        # Optimizing the model fit by converting it to float array outside
        X_train_new = np.array(X_train_new, dtype=np.float32)
        print('\nTraining again on whole training set with selected features')
        print('X train shape: ', X_train_new.shape)
        print('Y train shape: ', y_train_new.shape)

        # Fit the random forest model on training set w/ cross validation
        m_kcv_new.fit(X_train_new, y_train_new)

        # Print the NWRMSLE score and R-squared values for training and validation sets
        train_score = prediction_score(m_kcv_new, X_train_new, y_train_new,
                                       item_weight_train_new, plot_pre=False)
        print('For whole training set after feature selection : [nwrmsle, rsquared]: ',
              train_score)

        # # test set
        # test_keep = df_ts[to_keep].copy()
        #
        # print('\nTesting on test data')
        # # Predict for test set
        # pred_test_log = m_kcv_new.predict(test_keep)
        # pred_test = np.round(np.expm1(pred_test_log), decimals=0)
        # output = pd.concat([test_keep['id'], pd.DataFrame(pred_test)], axis=1)
        # output.columns = ['id', 'unit_sales']
        # name = 'prediction' + str(i) + '.csv'
        # output.to_csv(name, index=False)

    # Write errors to file
    with open('errors_rf_reg.txt', 'w') as file:
        file.write(str(errors))

    # Write feature importances to file
    with open('feature_importance_rf_reg.txt', 'w') as file:
        file.write(str(fis))

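# Hypothetical invocation (parameter values and variable names are placeholders, not
# from the original project); the frames are expected to carry 'date' and 'perishable'
# columns and y to be the log-transformed target, per the code above:
# rf_reg_crossval(df_train, y_log, df_test,
#                 n_months=3,        # total span of training data to walk through
#                 n_days=16,         # step / validation-window length per fold
#                 n_est=40,          # n_estimators for the forest
#                 min_sam_leaf=3)    # min_samples_leaf for the forest
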
def fit(self, X, y, sample_weight=None):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples. Internally, it will be converted to
        ``dtype=np.float32`` and if a sparse matrix is provided
        to a sparse ``csc_matrix``.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (class labels in classification, real numbers in
        regression).

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification, splits are also ignored if they would result in any
        single class carrying a negative weight in either child node.

    Returns
    -------
    self : object
        Returns self.
    """
    # Validate or convert input data
    X = check_array(X, dtype=DTYPE, accept_sparse="csc")
    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    # Remap output
    n_samples, self.n_features_ = X.shape

    y = np.atleast_1d(y)
    if y.ndim == 2 and y.shape[1] == 1:
        warn("A column-vector y was passed when a 1d array was"
             " expected. Please change the shape of y to "
             "(n_samples,), for example using ravel().",
             DataConversionWarning, stacklevel=2)

    if y.ndim == 1:
        # reshape is necessary to preserve the data contiguity against vs
        # [:, np.newaxis] that does not.
        y = np.reshape(y, (-1, 1))

    self.n_outputs_ = y.shape[1]

    y, expanded_class_weight = self._validate_y_class_weight(y)

    if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    if expanded_class_weight is not None:
        if sample_weight is not None:
            sample_weight = sample_weight * expanded_class_weight
        else:
            sample_weight = expanded_class_weight

    # Check parameters
    self._validate_estimator()

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    random_state = check_random_state(self.random_state)

    if not self.warm_start:
        # Free allocated memory, if any
        self.estimators_ = []

    n_more_estimators = self.n_estimators - len(self.estimators_)

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, len(self.estimators_)))

    elif n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
    else:
        if self.warm_start and len(self.estimators_) > 0:
            # We draw from the random state to get the random state we
            # would have got if we hadn't used a warm_start.
            random_state.randint(MAX_INT, size=len(self.estimators_))

        trees = []
        for i in range(n_more_estimators):
            tree = self._make_estimator(append=False)
            tree.set_params(random_state=random_state.randint(MAX_INT))
            trees.append(tree)

        # Parallel loop: we use the threading backend as the Cython code
        # for fitting the trees is internally releasing the Python GIL
        # making threading always more efficient than multiprocessing in
        # that case.
        # Modified from stock scikit-learn: trees are built by the custom
        # _parallel_build_trees_Peter, which receives the extra n_bootstrap
        # argument used by _generate_sample_indices_Peter.
        trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                         backend="threading")(
            delayed(_parallel_build_trees_Peter)(
                t, self, X, y, sample_weight, i, len(trees),
                verbose=self.verbose, class_weight=self.class_weight,
                n_bootstrap=self.n_bootstrap)
            for i, t in enumerate(trees))

        # Collect newly grown trees
        self.estimators_.extend(trees)

    if self.oob_score:
        self._set_oob_score(X, y)

    # Decapsulate classes_ attributes
    if hasattr(self, "classes_") and self.n_outputs_ == 1:
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

    return self