def _transform(self, X):
    """Squish the values of X using the fitted median and mad.

    The fitted ``self.median`` and ``self.mad`` are appended to the data so
    that ``self._squish`` is applied along axis 0 to the values together
    with their statistics.

    Returns
    -------
    X : same type as X, with squished values
    """
    self._type_check(X)

    # HACK: append median and mad to the data before applying _squish
    pieces = [force_array(X), self.median, self.mad]
    try:
        stacked = np.vstack(pieces)
    except ValueError:
        # the pieces cannot be stacked row-wise; concatenate them instead
        stacked = np.hstack(pieces)

    squished = np.apply_along_axis(func1d=self._squish, axis=0, arr=stacked)

    if not self.is_dataframe:
        return squished
    return pd.DataFrame(squished, columns=self.df_cols, index=self.df_idx)
def predict_proba(self, X):
    """Predict probabilities by routing each row to its group's model.

    The dispatcher assigns every row of X to a group, the group labels are
    encoded with the fitted label encoder, and each non-empty group is
    scored by the matching model in ``self.model_dict``.

    Parameters
    ----------
    X : array-like or dataframe. Anything that is not a pandas
        DataFrame/Series is wrapped into a DataFrame first.

    Returns
    -------
    probabilities for every row of X, in the original row order
    (converted with ``force_array``)
    """
    check_is_fitted(self, 'model_dict')

    # NOTE: let's say we respect dataframe
    if not isinstance(X, (pd.DataFrame, pd.Series)):
        X = pd.DataFrame(force_array(X))

    # predict on dispatcher and get (encoded) group of every row
    group_new = self.dispatcher.predict(X)
    group_new = self.le.transform(group_new)
    index_dict = {
        group: np.where(group_new == group)[0]
        for group in self.unique_groups
    }

    # predict by group; positional row numbers are kept as the index so the
    # original order can be restored afterwards
    proba_dfs = []
    for group, index in index_dict.items():
        if len(index):
            # fixed: the method is `predict_proba` (was misspelled
            # `prodict_proba`, which raises AttributeError)
            df_proba = pd.DataFrame(
                self.model_dict[group].predict_proba(X.iloc[index]),
                index=index)
            proba_dfs.append(df_proba)

    # concat all predictions into one dataframe and restore row order
    df_proba = pd.concat(proba_dfs)
    return force_array(df_proba.sort_index())
def ts_train_test_split(X=None, y=None, groups=None,
                        train_window=None, buff_window=380, test_window=365):
    """Build a single time-based train/test split anchored at the last date.

    The test window covers the last ``test_window`` days, the buffer window
    precedes it, and the training window ends where the buffer starts.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data. Only used to check that its length matches groups.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups (dates) : array-like, index, shape (n_samples, )
        with datetime type. Positions are located with ``np.searchsorted``,
        so the dates are presumably expected to be sorted ascending.

    train_window : int or None, default = None
        The number of days in the training window. If None, training
        starts at the earliest date.

    buff_window : int, default = 380 (days)
        The number of days to skip between the end of the training window
        and the start of the testing window.

    test_window : int, default = 365 (days)
        The number of days in the testing window

    Returns
    -------
    A one-element list ``[(train, test)]`` where

    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If groups is not given.
    """
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))

    dates = groups
    last_date = max(dates)

    # walk backwards from the last date: test window, buffer, train window
    test_start = last_date - timedelta(days=test_window)
    train_end = test_start - timedelta(days=buff_window)
    if train_window is None:
        train_start = min(dates)
    else:
        train_start = train_end - timedelta(days=train_window)

    # translate boundary dates into positions
    train_lo = np.searchsorted(a=dates, v=train_start, side='left')
    train_hi = np.searchsorted(a=dates, v=train_end, side='right')
    test_lo = np.searchsorted(a=dates, v=test_start, side='left')
    test_hi = np.searchsorted(a=dates, v=last_date, side='right')

    train = np.arange(train_lo, train_hi, 1)
    test = np.arange(test_lo, test_hi, 1)
    return [(train, test)]
def mean_absolute_percentage_error(y_true, y_pred, robust=False):
    """mean_absolute_percentage_error

    Use case:
        y is expressed in percent and we want to take pct into account

    Formula:
        mean_absolute_percentage_error =
            mean(abs((y_true - y_pred) / y_true)) * 100
        (the median replaces the mean when robust is True)

    Parameters
    ----------
    y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.

    robust : bool, if True, use median, otherwise, mean
        Default is False

    Returns
    -------
    loss : float or ndarray of floats
        A non-negative floating point value (the best value is 0.0)
    """
    y_true = force_array(y_true)
    y_pred = force_array(y_pred)

    abs_pct_error = np.abs((y_true - y_pred) / y_true)
    aggregate = np.median if robust else np.mean
    return aggregate(abs_pct_error) * 100
def _transform(self, X):
    """Replace outliers in X using the fitted statistics.

    The fitted median, outlier threshold, high replacement and low
    replacement are appended to the data so that ``self._replace`` is
    applied along axis 0 to the values together with their statistics.

    Returns
    -------
    X : array-like, type and shape are preserved after truncating
        outliers and replacing them with quantile values
    """
    self._type_check(X)

    # HACK: append median, threshold, hi and lo to the data
    pieces = [
        force_array(X),
        self.median,
        self.outliers_thres,
        self.hi_replacement,
        self.lo_replacement,
    ]
    try:
        stacked = np.vstack(pieces)
    except ValueError:
        # the pieces cannot be stacked row-wise; concatenate them instead
        stacked = np.hstack(pieces)

    replaced = np.apply_along_axis(func1d=self._replace, axis=0, arr=stacked)

    if not self.is_dataframe:
        return replaced
    return pd.DataFrame(replaced, columns=self.df_cols, index=self.df_idx)
def scatterplot(self, x, y, group=None, legend_name=None):
    """Draw a scatter plot of y against x, with one trace per group.

    Parameters
    ----------
    x : a vector, x is one column (feature) of a data

    y : a vector, y is another column (feature) of a data

    group : a vector, group is the label of data, it should have
        the same length as x, y. If None, a single trace is drawn.

    legend_name : a name, or a dictionary of names, match the number of
        unique values in group. It is looked up by group label and is
        only used when group is given.
        eg. {
                label_one: 'type one',
                label_two: 'type two'
            }

    Returns
    -------
    None. The figure is displayed with ``plotly.offline.iplot``.
    """
    # check length
    x = force_array(x)
    y = force_array(y)
    check_consistent_length(x, y)

    if group is None:
        data = [go.Scattergl(x=x, y=y, mode='markers')]
    else:
        group = force_array(group)
        check_consistent_length(x, y, group)
        unique_groups = np.unique(group)

        # default legend: every group is labelled with its own value
        if legend_name is None:
            legend_name = {v: v for v in unique_groups}
        else:
            check_consistent_length(unique_groups, list(legend_name))

        # one trace per group
        data = []
        for grp in unique_groups:
            in_grp = group == grp
            data.append(
                go.Scattergl(x=x[in_grp],
                             y=y[in_grp],
                             name=legend_name[grp],
                             mode='markers'))

    layout = go.Layout(title=self.title,
                       yaxis=self.yaxis,
                       xaxis=self.xaxis,
                       width=self.width,
                       height=self.height,
                       margin=go.layout.Margin(t=65, b=60, pad=4))
    plotly.offline.iplot(go.Figure(data=data, layout=layout))
def _select_top_and_bottom(y_true, y_score,
                           percentile=10, interpolation='midpoint'):
    """
    Select truth values, predictions, scores of the top and bottom
    observations, ranked by the second column of y_score

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators.

    y_score : array, indexed as ``y_score[:, 1]``
        Target scores; column 1 is used for ranking and thresholding.

    percentile : float, default 10 (10% quantile)
        0 <= percentile <= 100, the top and bottom quantile(s) to select

    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        Passed to ``np.percentile``; it specifies the interpolation method
        to use when the desired quantile lies between two data points
        i and j:
            linear: i + (j - i) * fraction, where fraction is the
                fractional part of the index surrounded by i and j.
            lower: i.
            higher: j.
            nearest: i or j whichever is nearest.
            midpoint: (i + j) / 2.

    Returns
    -------
    y_true_ext : array
        True labels of the top and bottom observations

    y_score_ext : array
        Rows of y_score for the top and bottom observations

    y_pred_ext : array of bool
        ``y_score_ext[:, 1] >= 0.5``
    """
    y_true = force_array(y_true)
    y_score = force_array(y_score)
    ranking_score = y_score[:, 1]

    upper_cut = np.percentile(ranking_score, q=(100 - percentile),
                              interpolation=interpolation)
    lower_cut = np.percentile(ranking_score, q=percentile,
                              interpolation=interpolation)

    # keep rows at or above the upper cut, or at or below the lower cut
    keep = (ranking_score >= upper_cut) | (ranking_score <= lower_cut)

    y_true_ext = y_true[keep]
    y_score_ext = y_score[keep]
    y_pred_ext = y_score_ext[:, 1] >= 0.5
    return y_true_ext, y_score_ext, y_pred_ext
def fit(self, X, y):
    """Fit the estimator by delegating to ``self._fit`` on array inputs.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        Copied first when ``self.copy`` is set.

    y : array, shape [n_samples,]
        Copied first when ``self.copy`` is set.

    Returns
    -------
    self
    """
    if self.copy:
        X = X.copy()
        y = y.copy()
    self._fit(force_array(X), force_array(y))
    return self
def split(self, X=None, y=None, groups=None):
    """Generate indices to split data into training and test set.

    ``self.n_splits`` window start dates are spread evenly between the
    first date and the last date that still leaves room for a whole
    train + buffer + test window.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data. Only used to check that its length matches groups.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups (dates) : array-like, index, shape (n_samples, )
        with datetime type
        NOTE: dates have to be sorted (ascending)

    Yields
    ------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If groups is not given.
    """
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))

    dates = groups
    first_date = min(dates)
    last_date = max(dates)
    window_days = self.train_window + self.buff_window + self.test_window

    # NOTE(review): n_splits == 1 makes the divisor zero -- confirm that
    # n_splits >= 2 is enforced elsewhere
    spare_days = (last_date - first_date).days - window_days
    day_shift = spare_days / (self.n_splits - 1)
    starts = [
        first_date + timedelta(days=day_shift * k)
        for k in range(self.n_splits)
    ]

    for start in starts:
        end_train, start_test, end = self._return_winds(start)
        # translate boundary dates into positions
        train_lo = np.searchsorted(a=dates, v=start, side='left')
        train_hi = np.searchsorted(a=dates, v=end_train, side='right')
        test_lo = np.searchsorted(a=dates, v=start_test, side='left')
        test_hi = np.searchsorted(a=dates, v=end, side='right')
        yield (np.arange(train_lo, train_hi, 1),
               np.arange(test_lo, test_hi, 1))
def transform(self, X):
    """Merge the RFECV-reserved columns with the PCA components.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]

    Returns
    -------
    X : new array with dimension reduction. It holds the columns kept by
        ``self.rfecv.support_``, followed by the PCA components of the
        ``self.cols_for_pca`` columns when more than one such column exists.
    """
    check_is_fitted(self, 'pca')
    if self.copy:
        X = X.copy()
    X = force_array(X)

    # implement RFECV transform method
    X_reserved = X[:, self.rfecv.support_]

    # special case: fewer than two columns go to PCA, nothing to merge
    if sum(self.cols_for_pca) <= 1:
        return X_reserved

    # NOTE(review): the standardizer is re-fitted on the data being
    # transformed (fit_transform) -- confirm this is intended rather than
    # a plain transform()
    X_scaled = self.standardizer.fit_transform(X[:, self.cols_for_pca])
    X_pca = self.pca.transform(X_scaled)
    return np.hstack((X_reserved, X_pca))
def _fit(self, X, y, *args, **kwargs):
    """
    private method to train n base models for n folds of cv

    One deep copy of ``self.model`` is fitted on the training part of every
    fold (stored in ``self.fitted_models``). When ``self.full_train`` is set,
    one more copy is fitted on all the data (``self.full_fitted_model``).
    The folds themselves are stored in ``self.folds``.
    """
    # get list of folds of indices
    self.folds = list(check_cv(self.cv).split(X, y))

    # Parallelization
    parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
    fit_one = delayed(fit_model)

    if isinstance(X, pd.DataFrame):
        # dataframe input: rows are selected by position with .iloc
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            y = pd.DataFrame(y)
        jobs = (
            fit_one(model=deepcopy(self.model),
                    X=X.iloc[train_idx],
                    y=y.iloc[train_idx],
                    *args, **kwargs)
            for train_idx, _ in self.folds
        )
    else:
        # X is not a dataframe: plain indexing
        jobs = (
            fit_one(model=deepcopy(self.model),
                    X=X[train_idx],
                    y=force_array(y)[train_idx],
                    *args, **kwargs)
            for train_idx, _ in self.folds
        )
    self.fitted_models = parallel(jobs)

    # train model with full 100% data
    if self.full_train:
        self.full_fitted_model = fit_model(model=deepcopy(self.model),
                                           X=X, y=y, *args, **kwargs)
def transform(self, X):
    """Replace infinite and missing values of X with ``self.fill``.

    Returns
    -------
    X : same type as X, with all finite values
    """
    self._type_check(X)

    # remove inf and fillna: +/-inf become NaN, then every NaN is filled
    frame = pd.DataFrame(force_array(X))
    frame = frame.replace([-np.inf, np.inf], np.nan)
    frame = frame.fillna(np.float32(self.fill))
    X = force_array(frame)

    if not self.is_dataframe:
        return X
    return pd.DataFrame(X, columns=self.df_cols, index=self.df_idx)
def fit(self, X, y=None):
    """
    Compute median, outlier threshold, and the top and bottom quantile
    replacement values for each column

    Returns
    -------
    self
    """
    # Reset internal state before fitting
    self._reset()

    values = force_array(X)

    # per-column statistics
    self.median = np.median(a=values, axis=0)
    self.outliers_thres = np.apply_along_axis(
        func1d=self._get_outliers_thres, axis=0, arr=values)
    self.hi_replacement = np.percentile(a=values, q=self.hi, axis=0)
    self.lo_replacement = np.percentile(a=values, q=self.lo, axis=0)
    return self
def transform(self, X, y=None):
    """Apply ``self._transform_per_feature`` along axis 0 of X.

    Returns
    -------
    X : transformed array, wrapped back into a DataFrame (with the stored
        columns and index) when ``self.is_dataframe`` is set
    """
    self._type_check(X, y)
    transformed = np.apply_along_axis(
        func1d=self._transform_per_feature, axis=0, arr=force_array(X))
    if not self.is_dataframe:
        return transformed
    return pd.DataFrame(
        transformed, columns=self.df_cols, index=self.df_idx)
def transform(self, X, *args, **kwargs):
    """Return the wrapped model's output for X as a single column.

    When ``self.proba`` is set, column 1 of ``predict_proba`` is used,
    otherwise the output of ``predict``. Extra arguments are forwarded to
    the model.

    Returns
    -------
    y_hat : array of shape (n_samples, 1)
    """
    if self.proba:
        y_hat = self.model.predict_proba(X, *args, **kwargs)[:, 1]
    else:
        y_hat = self.model.predict(X, *args, **kwargs)

    # reshape 1d array for horizontal merge in feature union
    y_hat = force_array(y_hat)
    n_samples = y_hat.shape[0]
    return np.reshape(y_hat, (n_samples, 1))
def transform(self, X, y=None):
    """Keep only the selected feature columns of X.

    Columns are picked with ``self.features_selected``. For dataframe
    input the result is wrapped back into a DataFrame whose column labels
    are ``self.df_cols[self.support_]``.
    """
    check_is_fitted(self, 'features_selected')
    self._type_check(X, y)

    selected = force_array(X)[:, self.features_selected]
    if not self.is_dataframe:
        return selected
    return pd.DataFrame(
        selected,
        columns=self.df_cols[self.support_],
        index=self.df_idx)
def fit(self, X, y=None):
    """
    Compute median and mad for each column.
    Assuming data is inf-free

    Parameters
    ----------
    X : array-like, the data to compute the statistics from

    y : ignored

    Returns
    -------
    self
    """
    # Reset internal state before fitting
    self._reset()

    # compute median and mad in a single pass: the same per-column
    # computation used to be run twice, once for each statistic
    stats = np.apply_along_axis(
        func1d=self._calc_median_abs_deviation,
        axis=0,
        arr=force_array(X)
    )
    # row 0 holds the medians, row 1 holds the mads
    self.median = stats[0]
    self.mad = stats[1]
    return self
def fit(self, X, y=None, *args, **kwargs):
    """Fit the dispatcher, then one downstream model per dispatched group.

    Parameters
    ----------
    X, y : array-like or dataframe. Anything that is not a pandas
        DataFrame/Series is wrapped into a DataFrame first.

    *args, **kwargs : forwarded to the dispatcher fitting method

    Returns
    -------
    self
    """
    # NOTE: let's say we respect dataframe
    pandas_types = (pd.DataFrame, pd.Series)
    if not isinstance(X, pandas_types):
        X = pd.DataFrame(force_array(X))
    if not isinstance(y, pandas_types):
        y = pd.DataFrame(force_array(y))

    # First, fit dispatcher and get group
    if self.supervise_cutoff is None:
        self._fit_unsupervise(X, *args, **kwargs)
    else:
        self._fit_supervise(X, y, *args, **kwargs)

    # Second, fit label encoder and encode the groups
    self.le = LabelEncoder().fit(self.group)
    self.group = self.le.transform(self.group)
    self.unique_groups = np.unique(self.group)

    # Third, pair every group with the last element of its model_list entry
    self.model_dict = {
        group: self.model_list[pos][-1]
        for pos, group in enumerate(self.unique_groups)
    }

    # Next, get the row positions belonging to each group
    rows_by_group = {
        group: np.where(self.group == group)[0]
        for group in self.unique_groups
    }

    # Parallelization and fit downstream models
    parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
    fit_one = delayed(fit_model)
    fitted = parallel(
        fit_one(self.model_dict[group], X.iloc[rows], y.iloc[rows])
        for group, rows in rows_by_group.items())

    # update models: results come back in the order of unique_groups
    self.model_dict = dict(zip(self.unique_groups, fitted))
    return self
def transform(self, X, y=None):
    """Label-encode the configured columns of X using their thresholds.

    ``self.col_thres_dict`` maps a column to its threshold. For dataframe
    input its keys are column names, which are translated to positional
    indices here. The translation is kept local: it used to overwrite
    ``self.col_thres_dict``, so a second call on a dataframe looked up
    indices as if they were column names and failed.

    Parameters
    ----------
    X : array-like or dataframe, 1d or 2d

    y : passed to ``self._type_check`` only

    Returns
    -------
    X : encoded data, wrapped back into a DataFrame (with the stored
        columns and index) when ``self.is_dataframe`` is set
    """
    self._type_check(X, y)
    X = force_array(X)

    # map col name to index if X is dataframe (without touching the
    # fitted configuration)
    if self.is_dataframe:
        col_thres = {
            np.where(self.df_cols == key)[0][0]: value
            for key, value in self.col_thres_dict.items()
        }
    else:
        col_thres = self.col_thres_dict

    if X.ndim == 1:
        # handle single vector X
        X = self._label_encode(X, col_thres[0])
    else:
        # for 2d array, iter thru configured columns
        for key, threshold in col_thres.items():
            X[:, key] = self._label_encode(X[:, key], threshold)

    # finalized
    if self.is_dataframe:
        X = pd.DataFrame(X, columns=self.df_cols, index=self.df_idx)
    return X
def _fit(self, X, y, *args, **kwargs):
    """
    private method to train n base models for last fold of cv

    Every model in ``self.base_models`` is deep-copied and fitted on the
    training part of the last cv fold (stored in ``self.fitted_models``).
    When ``self.full_train`` is set, fresh copies are also fitted on all
    the data (``self.full_fitted_models``).
    """
    # keep only the last fold of indices
    self.last_fold = list(check_cv(self.cv).split(X, y))[-1]
    self.in_fold = self.last_fold[0]
    self.out_of_fold = self.last_fold[-1]

    # Parallelization
    parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
    fit_one = delayed(fit_model)

    if isinstance(X, pd.DataFrame):
        # dataframe input: rows are selected by position with .iloc
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            y = pd.DataFrame(y)
        jobs = (
            fit_one(model=deepcopy(base),
                    X=X.iloc[self.in_fold],
                    y=y.iloc[self.in_fold],
                    *args, **kwargs)
            for _, base in self.base_models
        )
    else:
        # X is not a dataframe: plain indexing
        jobs = (
            fit_one(model=deepcopy(base),
                    X=X[self.in_fold],
                    y=force_array(y)[self.in_fold],
                    *args, **kwargs)
            for _, base in self.base_models
        )
    self.fitted_models = parallel(jobs)

    # train model with full 100% data
    if self.full_train:
        self.full_fitted_models = parallel(
            fit_one(model=deepcopy(base), X=X, y=y, *args, **kwargs)
            for _, base in self.base_models
        )
def fit(self, X, y=None):
    """Recursively drop the feature with the highest VIF.

    Features are eliminated one at a time until no remaining feature has
    a variance inflation factor above ``self.vif_threshold``.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]. When
        ``self.transformation == 'auto'`` it is first passed through an
        inf-handling, imputing, normalizing and standard-scaling pipeline.

    y : passed to ``self._type_check`` only

    Returns
    -------
    self, with ``support_`` (boolean mask of kept features),
    ``n_features_``, ``features`` (indices of kept features) and ``vifs``
    (VIFs of the kept features) set.
    """
    self._type_check(X, y)
    X = force_array(X)

    # data transformation and standardization
    if self.transformation == 'auto':
        transformer_pipe = Pipeline([
            ('inf_handler', InfHandler(strategy='max', refit=True)),
            ('imputer', Imputer(missing_values="NaN", strategy='mean')),
            ('normalize_transformer', NormalizeTransformer(
                skewness_threshold=self.skewness_threshold,
                abs_deviations_threshold=self.abs_deviations_threshold,
                k=self.k)),
            ('standard_scaler', StandardScaler())
        ])
        X = transformer_pipe.fit_transform(X)

    # Initialization
    n_features = X.shape[1]
    # builtin bool: the `np.bool` alias was removed in NumPy 1.24
    support_ = np.ones(n_features, dtype=bool)
    features = np.arange(n_features)[support_]

    # Elimination
    # first pass
    vifs = np.array(
        [variance_inflation_factor(X, i) for i in range(n_features)])

    # recursive pruning
    while vifs.max() > self.vif_threshold:
        # drop the remaining feature with the largest vif
        support_[features[np.argsort(vifs)[-1]]] = False
        features = np.arange(n_features)[support_]
        # get vifs for the remaining features (slice taken once per pass)
        X_remaining = X[:, features]
        vifs = np.array([
            variance_inflation_factor(X_remaining, i)
            for i in range(X_remaining.shape[1])
        ])

    # set final attributes
    self.support_ = support_
    self.n_features_ = support_.sum()
    self.features = np.arange(n_features)[support_]
    self.vifs = vifs
    return self
def transform(self, X):
    """Return the final transformed X: PCA first, then RFECV pruning.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]

    Returns
    -------
    X : new array with dimension reduction. The PCA components are pruned
        by ``self.rfecv`` when PCA kept more than one component, otherwise
        they are returned as is.
    """
    check_is_fitted(self, 'rfecv')
    if self.copy:
        X = X.copy()
    X = force_array(X)

    # first step - PCA transformation
    # NOTE(review): the standardizer is re-fitted on the data being
    # transformed (fit_transform) -- confirm this is intended rather than
    # a plain transform()
    X_scaled = self.standardizer.fit_transform(X)
    X_pca = self.pca.transform(X_scaled)

    # a single component is returned without pruning
    if self.pca.n_components_ <= 1:
        return X_pca
    return self.rfecv.transform(X_pca)
def fit_transform(self, X, y, *args, **kwargs):
    """
    fit_transform method gets called when the ensemble is fitted to data

    It implements _fit to fit base models for different folds and
    output out-of-sample predictions

    Returns
    -------
    y_out_of_sample : array of shape (n_predictions, 1), ordered by sample
        position. It holds column 1 of ``predict_proba``, or the boolean
        ``proba > 0.5`` when ``self.proba`` is falsy.
    """
    # call _fit
    self._fit(X, y, *args, **kwargs)

    # dataframes are sliced by position, everything else by plain indexing
    if isinstance(X, pd.DataFrame):
        def take(rows):
            return X.iloc[rows]
    else:
        def take(rows):
            return X[rows]

    # generate out-of-sample predictions; the test positions are kept as
    # the index so the original order can be restored
    proba_dfs = [
        pd.DataFrame(
            {'proba': fold_model.predict_proba(take(test_idx))[:, 1]},
            index=test_idx)
        for fold_model, (_, test_idx) in zip(self.fitted_models, self.folds)
    ]

    # concat dfs and revert to origin order
    df_pred = pd.concat(proba_dfs).sort_index()

    # get y_out_of_sample as a single column
    y_out_of_sample = force_array(df_pred).reshape((len(df_pred), 1))

    # if need to convert to predict
    if not self.proba:
        y_out_of_sample = y_out_of_sample > 0.5
    return y_out_of_sample
def fit(self, X, y=None):
    """
    Compute inf-free, nan-free, max, min, mean, and median for each column

    Parameters
    ----------
    X : array-like, it only allows int, float, and bool

    Returns
    -------
    self
    """
    # Reset internal state before fitting
    self._reset()

    # make X inf-free: +/-inf become NaN so the nan-aware reducers skip them
    inf_free = pd.DataFrame(force_array(X)).replace(
        [-np.inf, np.inf], np.nan)

    # HACK: for dealing with bool
    values = force_array(np.asarray(force_array(inf_free), dtype='float32'))

    # per-column statistics ignoring NaN
    self.max = np.nanmax(a=values, axis=0)
    self.min = np.nanmin(a=values, axis=0)
    self.mean = np.nanmean(a=values, axis=0)
    self.median = np.nanmedian(a=values, axis=0)
    return self
def predict(self, X, *args, **kwargs):
    """Predict by thresholding column 1 of ``predict_proba`` at 0.5.

    Returns
    -------
    boolean predictions (True where the probability is above 0.5),
    converted with ``force_array``
    """
    positive_proba = self.predict_proba(X, *args, **kwargs)[:, 1]
    return force_array(positive_proba > 0.5)
def _base_model_cross_val(model, X, y, cv=None, proba=True, *args, **kwargs):
    """
    A private function that trains each base model for each fold
    and outputs fitted base models, its out-of-fold predictions,
    and array of y (in same order of out-of-fold predictions)
    for fitting ensembler

    Parameters
    ----------
    model : object, base model

    X : array-like, or dataframe

    y : array-like, or dataframe

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy; it is passed
        to ``check_cv``. Possible inputs for cv are:
        - None, to use the default of ``check_cv``,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

    proba : bool, if True, the out-of-fold output is the probability
        (column 1 of predict_proba); otherwise it is thresholded at 0.5

    *args, **kwargs : forwarded to ``fit_model``

    Returns
    -------
    list of fitted model for each fold, Xt(out-of-fold pred), y(matched with Xt)
    """
    # get list of folds of indices
    all_folds = list(check_cv(cv).split(X, y))

    # check data type: work on pandas objects from here on
    pandas_types = (pd.DataFrame, pd.Series)
    if not isinstance(X, pandas_types):
        X = pd.DataFrame(force_array(X))
    if not isinstance(y, pandas_types):
        y = pd.DataFrame(force_array(y))

    # iterate each train-fold and fit a fresh copy of the base model
    fitted_models = []
    for train_idx, _ in all_folds:
        fitted_models.append(
            fit_model(model=deepcopy(model),
                      X=X.iloc[train_idx],
                      y=y.iloc[train_idx],
                      *args, **kwargs))

    # generate out-of-sample predictions; the test positions are kept as
    # the index so the original order can be restored
    proba_dfs = [
        pd.DataFrame(
            {'proba': fold_model.predict_proba(X.iloc[test_idx])[:, 1]},
            index=test_idx)
        for fold_model, (_, test_idx) in zip(fitted_models, all_folds)
    ]

    # concat dfs, sort index, and record index
    df_out_of_sample = pd.concat(proba_dfs).sort_index()
    idx = df_out_of_sample.index.values

    # get pred_out_of_sample as a single column
    n_rows = len(df_out_of_sample)
    pred_out_of_sample = force_array(df_out_of_sample).reshape((n_rows, 1))

    # if need to convert to predict
    if not proba:
        pred_out_of_sample = pred_out_of_sample > 0.5

    # get y matched with pred_out_of_sample
    y_out_of_sample = y.iloc[idx]
    return fitted_models, pred_out_of_sample, y_out_of_sample
def evaluate(self, X=None, y=None, kind='prediction',
             scoring=None, aggregator=None, **score_kwargs):
    """
    This is a convenient method for quick evaluating out-of-sample scores

    Parameters
    ----------
    X : X is NOT required

    y : y has to be the same y passed in its train method

    kind : str, one of ['prediction', 'proba'].
        If 'prediction' is chosen, then it will score prediction against
        out of sample targets
        If 'proba' is chosen, then it will score proba against
        out of sample targets (the last column of 2d probabilities)

    scoring : dictionary with {metrics name: metrics callable}
        eg. {'accuracy': sklearn.metrics.accuracy_score}
        Default is accuracy

    aggregator : a function or a callable, to aggregate a vector

    **score_kwargs : this is passed to metrics callable

    Returns
    -------
    score_dict : a dictionary of score (one score per fold, or the
        aggregated value when aggregator is given)
        eg. {
                'accuracy': [0.84, 0.92, 0.86, 0.78],
                'roc_auc': [0.72, 0.77, 0.73, 0.69]
            }

    Raises
    ------
    ValueError
        If kind is not allowed, or y is not given.
    """
    allowed_kind = ['prediction', 'proba']
    if kind not in allowed_kind:
        raise ValueError('kind must be one of {}'.format(allowed_kind))

    if kind == 'prediction':
        check_has_set_attr(self, 'preds_dict')
        y_hat_dict = self.preds_dict
    else:  # kind == 'proba'
        check_has_set_attr(self, 'probas_dict')
        # keep the last column of 2d probabilities. Fixed: `np.dim` does
        # not exist (np.ndim is the function). A new dict is built so the
        # stored probas_dict is left untouched.
        y_hat_dict = {
            i: (y_probas[:, -1] if np.ndim(y_probas) == 2 else y_probas)
            for i, y_probas in self.probas_dict.items()
        }

    # check y
    if y is None:
        raise ValueError('You must pass in y')
    y = force_array(y)

    # check scoring
    if scoring is None:
        scoring = {'accuracy': accuracy_score}

    # score out of sample: one score per fold for every metric;
    # self.cv[i][1] holds the test indices of fold i
    score_dict = {}
    for name, score in scoring.items():
        score_dict[name] = [
            score(y[self.cv[i][1]], y_hat_dict[i], **score_kwargs)
            for i in range(len(self.cv))
        ]

    # aggregator
    if aggregator:
        score_dict = {
            name: aggregator(scores)
            for name, scores in score_dict.items()
        }
    return score_dict
def ts_predefined_split(X=None, y=None, groups=None, test_fold=None,
                        train_window=20 * 52, buff_window=1 * 52,
                        test_window=1 * 52):
    """Generate rolling-window train/test indices for given test starts.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data. Only used to check that its length matches groups.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups (dates) : array-like, index, shape (n_samples, )
        with datetime type
        NOTE: for the time being, it will convert to 'datetime64[D]'
              we could support 'datetime64[ns]' in the future
        The dates are sorted before positions are computed.

    test_fold : array-like, a list of timestamps of the beginning of
        each rolling test fold. A single timestamp is also accepted.
        Default is None, it will take the latest available date,
        which is end_date - test_window

    train_window : int, default = 20 * 52 (weeks)
        The number of weeks in the training window

    buff_window : int, default = 1 * 52 (weeks)
        The number of weeks to skip between the end of the training window
        and the start of the testing window.

    test_window : int, default = 1 * 52 (weeks)
        The number of weeks in the testing window

    Yields
    ------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If groups is not given, if the last test fold starts at or after
        the last date, or if the first training window ends at or before
        the first date.
    """
    def _weeks(n_weeks):
        return np.timedelta64(n_weeks, 'W')

    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))

    # get min/max date
    dates = np.sort(np.array(groups, dtype='datetime64[D]'))
    min_d, max_d = dates[0], dates[-1]

    # get test_fold as a list-like of fold start dates
    if test_fold is None:
        test_fold = [max_d - _weeks(test_window)]
    elif not isinstance(test_fold, (list, tuple, np.ndarray, pd.Index)):
        test_fold = [test_fold]
    test_fold = np.sort(np.array(test_fold, dtype='datetime64[D]'))

    # check the last test fold
    if test_fold[-1] >= max_d:
        raise ValueError('No testing data available for the last fold! '
                         'Please re-enter parameters!')

    # check the first training fold
    if test_fold[0] - _weeks(buff_window) <= min_d:
        raise ValueError('No trainning data available for the first fold! '
                         'Please re-enter parameters!')

    # generate index for rolling window folds
    for test_start in test_fold:
        # NOTE: there will be missing one day if it's in Leap Year
        # missing at 2004-12-31
        # test window, clipped to the last available date
        test_end = np.min([max_d, test_start + _weeks(test_window)])
        # train window, clipped to the first available date
        train_end = test_start - _weeks(buff_window)
        train_start = np.max([min_d, train_end - _weeks(train_window)])

        # translate boundary dates into positions
        train_lo = np.searchsorted(a=dates, v=train_start, side='left')
        train_hi = np.searchsorted(a=dates, v=train_end, side='right')
        test_lo = np.searchsorted(a=dates, v=test_start, side='left')
        test_hi = np.searchsorted(a=dates, v=test_end, side='right')
        yield (np.arange(train_lo, train_hi, 1),
               np.arange(test_lo, test_hi, 1))
def group_train_test_split(X=None, y=None, groups=None,
                           train_size=None, random_state=None):
    """Split sample indices so that whole groups land in train or in test.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data. Only used to check that its length matches groups.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like, index, shape (n_samples, )

    train_size : float, int, or None, default None
        If smaller than 1, it represents the proportion of the groups to
        include in the train split. Otherwise it is the absolute number of
        train groups. If None, the value is set to 0.80.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator
        (0 is a valid seed);
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    A one-element list ``[(train, test)]`` where

    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If groups is not given.
    """
    # check group
    if groups is None:
        raise ValueError('You have to pass in groups')
    if X is not None:
        check_consistent_length(force_array(X), force_array(groups))

    # get unique groups and n_groups
    groups = force_array(groups)
    unique_groups = np.unique(groups)
    n_groups = len(unique_groups)

    # convert train_size to int
    if train_size is None:
        train_size = 0.80
    if train_size < 1:
        train_size = int(train_size * n_groups)

    # pick the sampler. Fixed: `if random_state:` silently ignored seed 0
    # and a RandomState instance could not be used. A dedicated RandomState
    # draws the same sample as seeding the global generator with that seed,
    # without resetting the global generator.
    if random_state is None:
        choose = np.random.choice
    elif isinstance(random_state, np.random.RandomState):
        choose = random_state.choice
    else:
        choose = np.random.RandomState(random_state).choice

    # sample groups_train
    groups_train = choose(a=unique_groups, size=train_size, replace=False)

    # train, test list
    train_filter = force_array(pd.DataFrame(groups).isin(groups_train))
    test_filter = np.logical_not(train_filter)
    train = np.where(train_filter)[0]
    test = np.where(test_filter)[0]
    return [(train, test)]
def _select_top_and_bottom(y_true, y_score, top=50, bottom=50,
                           interpolation='midpoint'):
    """
    Select truth values, predictions, scores of the top and bottom
    observations

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, ]
        True binary labels in binary label indicators.

    y_score : array, shape = [n_samples, ]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or binary decisions.

    top, bottom : float, int, or None, default 50.
        If int, it filters top/bottom n samples
        If float, it should be in (0, 1.0] and it is converted to a count:
        top is a share of the samples scored >= 0.5, bottom is a share of
        the samples scored < 0.5
        If None, the slice is unbounded and every sample is selected

    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        Not used by this implementation.

    Returns
    -------
    y_true_ext : array
        True labels of the top and bottom observations

    y_score_ext : array
        Scores of the top and bottom observations

    y_pred_ext : array of bool
        ``y_score_ext >= 0.5``

    Raises
    ------
    ValueError
        If a float top or bottom is outside (0, 1.0].
    """
    # check input
    check_consistent_length(y_true, y_score)
    y_true = force_array(y_true)
    y_score = force_array(y_score)

    n_samples_class_zero = np.sum(y_score < 0.5)
    n_samples_class_one = np.sum(y_score >= 0.5)

    # convert float to int for top and bottom
    if isinstance(top, float):
        if not (0 < top <= 1.0):
            raise ValueError('Warning! top is out of the range (0, 1.0)')
        top = int(round(top * n_samples_class_one))
    if isinstance(bottom, float):
        if not (0 < bottom <= 1.0):
            raise ValueError('Warning! bottom is out of the range (0, 1.0)')
        bottom = int(round(bottom * n_samples_class_zero))

    # filter top and bottom: rank once, read both ends of the ranking
    ascending = np.argsort(y_score)
    top_idx = ascending[::-1][:top]
    bottom_idx = ascending[:bottom]
    filter_idx = np.sort(np.concatenate([top_idx, bottom_idx]))

    # filtering
    y_true_ext = y_true[filter_idx]
    y_score_ext = y_score[filter_idx]
    y_pred_ext = y_score_ext >= 0.5
    return y_true_ext, y_score_ext, y_pred_ext