def all_low_cardinality_to_categorical(df: pd.DataFrame,
                                       threshold: float = 0.5) -> pd.DataFrame:
    """Casts all low-cardinality 'object' columns to the 'category' dtype.

    A column counts as low-cardinality if its number of unique values,
    divided by the number of rows, falls below `threshold`.
    """
    bounds_check(threshold, 0.0, 1.0)
    df_to_use = df.copy()
    transform_fn = lambda x: x.astype("category")
    n_entries = df_to_use.shape[0]
    # check that the frame actually has object columns to convert.
    if df.select_dtypes(include=["object"]).shape[1] == 0:
        return df_to_use
    else:
        # select object columns whose unique-value ratio is below threshold
        condition = lambda x: (x.select_dtypes(include=["object"]).nunique()[
            lambda y: y.div(n_entries).lt(threshold)]).index
        return _multi_assign(df_to_use, transform_fn, condition)
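
# Example usage (a minimal sketch; the toy frame below is illustrative and
# not part of the original source):
#
#   >>> import pandas as pd
#   >>> raw = pd.DataFrame({"sex": ["M", "F", "M", "F", "F"],
#   ...                     "score": [3.1, 2.2, 5.0, 4.4, 1.9]})
#   >>> out = all_low_cardinality_to_categorical(raw, threshold=0.5)
#   # 'sex' (2 unique values / 5 rows = 0.4 < 0.5) becomes 'category';
#   # the numeric 'score' column is left untouched.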
def learning(
    df: "MetaPanda",
    y: str,
    x: Optional[SelectorType] = None,
    train_n: Optional[Union[int, np.ndarray]] = None,
    permute_n: int = 0,
    cv: Union[int, Tuple[int, int]] = (5, 15),
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    **model_kws
):
    """Fits a basic model to generate cross-validated training/test scores
    for different training set sizes.

    A cross-validation generator splits the whole dataset `k` times into
    training and test data. Subsets of the training set with varying sizes
    are used to train the estimator, and a score for each training subset
    size and for the test set is computed. Afterwards, the scores are
    averaged over all `k` runs for each training subset size.

    Parameters
    ----------
    df : MetaPanda (n_samples, n_features)
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str/selector, optional
        A list of selected column names for x or MetaPanda `selector`.
    train_n : int/array-like, with shape (n_ticks,) dtype float or int, optional
        Relative or absolute numbers of training examples that will be used
        to generate learning curve related data.
        If None: uses `linspace(.1, .9, 8)`
        If int: uses `linspace(.1, .9, n)`
    permute_n : int, default=0
        The number of times to permute y; if > 0, performs a full
        permutation analysis (producing a 4th plot)
    cv : int/tuple, default=(5, 15)
        If int: just reflects number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    model : str/estimator, default="LinearRegression"
        The name of a scikit-learn model, or a model object itself that
        implements `fit` and `predict` methods.
    cache : str, optional
        TODO: Not implemented yet.
        If not None, stores the resulting model parts in JSON and reloads
        if present.
    plot : bool, optional
        If True, produces `.plot.learning_curve` inplace.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    Returns
    -------
    results : MetaPanda (n_ticks, 8)
        The results matrix of mean and std scores
    permute_ : np.ndarray (permute_n,), optional
        The permutation scores associated with the permutation analysis

    Notes
    -----
    Shorthand names for the models, i.e. `lm` for LinearRegression or
    `gauss` for a GaussianProcessRegressor, are accepted.

    By default, `fit_learning` uses the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression
    problem. Classification compatibility will arrive in a later version.

    `permute_n` is set to 0 by default; if you want a permutation
    histogram, this value must be > 0.

    See Also
    --------
    fit_basic : Performs a rudimentary fit model with no parameter searching.
    fit_grid : Performs exhaustive grid search analysis on the models selected.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # perform checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(y, str)
    instance_check(train_n, (type(None), int, list, tuple, np.ndarray))
    instance_check(permute_n, int)
    instance_check(cv, (int, tuple))
    # instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # retrieve x columns if none
    _xcols = select_xcols(_df, x, y)
    # note the parentheses: without them, `k` silently binds to the whole
    # tuple and `repeats` is always 1.
    k, repeats = cv if isinstance(cv, tuple) else (cv, 1)
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm
    lm.set_params(**model_kws)

    if train_n is None:
        train_n = np.linspace(0.1, 0.9, 8)
    elif isinstance(train_n, int):
        train_n = np.linspace(0.1, 0.9, train_n)
    # ml ready
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    rep = RepeatedKFold(n_splits=k, n_repeats=repeats)
    vars_ = learning_curve(
        lm,
        _x,
        _y,
        train_sizes=train_n,
        cv=rep,
        scoring="neg_root_mean_squared_error",
        n_jobs=-2,
        verbose=verbose,
        return_times=True,
    )
    # permutation analysis if permute_n > 0
    if permute_n > 0:
        perm_score_, perm_scorez_, pval = permutation_test_score(
            lm,
            _x,
            _y,
            cv=rep,
            n_permutations=permute_n,
            scoring="neg_root_mean_squared_error",
            n_jobs=-2,
            verbose=verbose,
        )
    # outputs
    output_labels_ = ["train_score", "test_score", "fit_time", "score_time"]
    # format as df: stack the mean and std columns together
    results = pd.DataFrame(
        np.hstack(
            (
                np.stack([np.mean(vars_[i], axis=1) for i in range(1, 5)], axis=1),
                np.stack([np.std(vars_[i], axis=1) for i in range(1, 5)], axis=1),
            )
        ),
        columns=list(
            it.chain(
                map(lambda s: "mean_" + s, output_labels_),
                map(lambda s: "std_" + s, output_labels_),
            )
        ),
    )
    # add N column
    results["N"] = vars_[0]
    R = MetaPanda(results)

    if plot and permute_n > 0:
        lcurve(R, perm_scorez_)
    elif plot:
        lcurve(R)
    # return as MetaPanda
    if permute_n > 0:
        return R, perm_score_, perm_scorez_, pval
    else:
        return R
def grid(df: Union[pd.DataFrame, "MetaPanda"],
         y: str,
         x: Optional[SelectorType] = None,
         models=("Ridge", "Lasso"),
         cv: Union[int, Tuple[int, int]] = 5,
         cache: Optional[str] = None,
         plot: bool = False,
         chunks: bool = False,
         verbose: int = 0,
         **grid_kws) -> "MetaPanda":
    """Performs exhaustive grid search analysis on the models selected.

    This function aims to encapsulate much of the functionality associated
    with the `GridSearchCV` class within scikit-learn, with in-built
    caching options and flexible selection of inputs and outputs via the
    MetaPanda class.

    Parameters
    ----------
    df : pd.DataFrame/MetaPanda
        The main dataset.
    y : str
        A selected y column.
    x : list/tuple of str, optional
        A list of selected column names for x or MetaPanda `selector`.
    models : list/dict, default=("Ridge", "Lasso")
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict:
        key (parameter name), value (list of values)
    cv : int/tuple, default=5
        If int: just reflects number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    cache : str, optional
        If not None, cache is a filename handle for caching the
        `cv_results` as a JSON/csv file.
    plot : bool, optional
        If True, produces an appropriate plot for each parameter.
    chunks : bool, optional
        If True, and if cache is not None: caches the ML gridsearch into
        equal-sized chunks. This saves chunk files, which means that if
        part of the pipeline breaks, you can start from the previous chunk.
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    grid_kws : dict, optional
        Additional keywords to assign to GridSearchCV.

    Raises
    ------
    TypeError
        If one of the parameters has the wrong input type

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result from GridSearchCV detailing iterations and all
        scores.

    Notes
    -----
    From version 0.2.3 the `chunks` argument allows for fitting by parts.
    This means that breaks throughout a large pipeline will result only in
    losses up to the previous chunk. Chunk files are saved as
    '%filename_chunk%i.csv' so beware of clashes. Make sure to set
    `chunks=True` and `cache=str` where the `models` parameter is
    time-expensive.

    By default, `grid` tunes using the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression
    problem. Classification compatibility will arrive in a later version.

    See Also
    --------
    basic : Performs a rudimentary fit model with no parameter searching.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified
        parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(x, (type(None), str, list, tuple, pd.Index))
    instance_check(y, str)
    instance_check(cv, (int, tuple))
    instance_check(cache, (type(None), str))
    instance_check((plot, chunks), bool)
    bounds_check(verbose, 0, 4)

    if is_sklearn_model(models):
        models = [models]
    else:
        if isinstance(models, tuple):
            models = list(models)
        instance_check(models, (list, dict))

    # set dataset if a pandas object
    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    # retrieve x columns if none
    _xcols = select_xcols(_df, x, y)
    # set up cv, repeats; the parentheses matter, otherwise `k` binds to
    # the whole tuple and `repeats` is always 1.
    k, repeats = cv if isinstance(cv, tuple) else (cv, 1)

    # do caching
    def _perform_fit(_df: MetaPanda, _x, _y, _k: int, _repeats: int, _models):
        rep = RepeatedKFold(n_splits=_k, n_repeats=_repeats)
        # the header is 'model_est'
        header = "model"
        # any basic regression model
        pipe = Pipeline([(header, LinearRegression())])
        # get paramgrid - the magic happens here!
        pgrid = make_parameter_grid(_models, header=header)
        # join default grid parameters to given grid_kws
        def_grid_params = {
            "scoring": "neg_root_mean_squared_error",
            "n_jobs": -2,
            "verbose": verbose,
            "return_train_score": True,
        }
        def_grid_params.update(grid_kws)
        # create gridsearch
        gs = GridSearchCV(pipe, param_grid=pgrid, cv=rep, **def_grid_params)
        # make ml ready
        __xnp, __y = preprocess_continuous_X_y(_df, _x, _y)
        # fit the grid - expensive.
        gs.fit(__xnp, __y)
        # generate result
        _result = pd.DataFrame(gs.cv_results_)
        # associate model column to respective results
        _result["model"] = _result["param_model"].apply(
            lambda f: str(f).split("(")[0])
        # set as MetaPanda
        _met_result = MetaPanda(_result)
        # cast down parameter columns to appropriate type
        _met_result.transform(pd.to_numeric, object, errors="ignore")
        return _met_result

    if cache is not None:
        if chunks:
            # if dictionary, we need to split this into 1-sized list/dict blocks.
            values = dictchunk(models, 1) if isinstance(models, dict) else models
            _cv_results = cached_chunk(
                _perform_fit,
                "_models",
                values,
                False,
                cache,
                verbose,
                _df=_df,
                _x=_xcols,
                _y=y,
                _k=k,
                _repeats=repeats,
                _models=models,
            )
        else:
            _cv_results = cache_f(
                cache,
                _perform_fit,
                _df=_df,
                _x=_xcols,
                _y=y,
                _k=k,
                _repeats=repeats,
                _models=models,
            )
    else:
        _cv_results = _perform_fit(_df=_df, _x=_xcols, _y=y, _k=k,
                                   _repeats=repeats, _models=models)

    if plot:
        parameter_tune(_cv_results)

    return _cv_results
def optimize(df: "MetaPanda",
             x: SelectorType,
             y: str,
             models,
             cv: int = 5,
             verbose: int = 0):
    """Performs optimization grid analysis on the models selected.

    This uses the `scipy.optimize` function to minimize continuous
    parameters, for example `alpha` in a Lasso model.

    .. note:: optimization only works on *continuous* parameters with each
        model.

    TODO: complete `.ml.fit.optimize` function

    Parameters
    ----------
    df : MetaPanda
        The main dataset.
    x : list/tuple of str
        A list of selected column names for x or MetaPanda `selector`.
    y : str
        A selected y column.
    models : tuple/dict
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict:
        key (parameter name), value (list of values)
    cv : int, default=5
        The number of cross-validations
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result detailing iterations and all scores.

    Notes
    -----
    By default, `optimize` tunes using the root mean squared error (RMSE).
    There is currently no option to change this.

    By default, this model assumes you are working with a regression
    problem. Classification compatibility will arrive in a later version.

    See Also
    --------
    grid : Performs exhaustive grid search analysis on the models selected.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified
        parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, MetaPanda)
    instance_check(x, (str, list, tuple, pd.Index))
    instance_check(y, str)
    nonnegative((cv, verbose), int)
    instance_check(models, (tuple, list, dict))
    bounds_check(verbose, 0, 4)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _xnp, _y = preprocess_continuous_X_y(_df, _xcols, y)
    # define the parameter sets
    param_sets = make_optimize_grid(models)

    for m, params in zip(models, param_sets):
        model = find_sklearn_model(m)[0]
        inits, bounds = optimize_grid_for_model(params)
        # minimize for every i element
        mins = [
            so.minimize(
                _min_cross_val_scores,
                x0=i,
                args=(_xnp, _y, model, params, cv),
                bounds=bounds,
            )
            for i in inits
        ]
    # TODO: aggregate `mins` into a results frame and return it.
    pass
def widebox(data: Union[List, np.ndarray, pd.DataFrame],
            colors: Optional[_ListLike] = None,
            measured: Optional[str] = None,
            ax: Optional[mpl.axes.Axes] = None,
            vert: bool = True,
            sort: bool = True,
            outliers: bool = True,
            notch: bool = False,
            with_strip: bool = False,
            capsize: float = 1.0,
            width: float = 0.7,
            grid: bool = True,
            title: Optional[str] = None,
            label_rotation: float = 0.0,
            label_max_length: int = 25,
            spines: Optional[_ListLike] = None,
            strip_jitter: float = 0.15,
            theme: str = "white_circle",
            **plot_kwargs):
    """Plots a 2D boxplot with data oriented in wide-form.

    Parameters
    ----------
    data : list, np.ndarray or pd.DataFrame (2d)
        The raw data to plot as a box.
        If data is of type pd.DataFrame: columns represent the X-axis
    colors : list, tuple, optional
        Represents colors for each x-variable
    measured : str, optional
        A name for the measured variable
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    vert : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    sort : bool, default=True
        Determines whether to sort the data by numerical value
    outliers : bool, default=True
        If True, displays fliers as outliers
    notch : bool, default=False
        Determines whether to draw a notched plot
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a
        similar colour. `outliers` is set to False in this case
    capsize : float, default=1.0
        Defines the length of the caps
    width : float, default=0.7
        Determines the width/height of the box
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    title : str, optional
        Sets the title of the axes if a string is passed
    label_rotation : float, default=0
        The degrees of rotation of the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it is truncated
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square',
        'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also
    --------
    matplotlib.pyplot.boxplot
    seaborn.boxplot
    seaborn.boxenplot

    References
    ----------
    Inspiration from
    https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    """
    instance_check(data, (list, np.ndarray, pd.DataFrame))
    instance_check((colors, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((vert, sort, notch, outliers, grid, with_strip), bool)
    instance_check((capsize, width, strip_jitter, label_rotation), (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)
    bounds_check(strip_jitter, 0.0, 1.0)
    bounds_check(label_rotation, 0.0, 360.0)

    if isinstance(data, pd.DataFrame):
        # select float, int subset
        ss = data.select_dtypes(include=[float, int])
        _data = np.asarray(ss)
        _labels = ss.columns
    elif isinstance(data, (list, np.ndarray)):
        _data = np.asarray(data)
        _labels = None
    else:
        raise TypeError("data matrix is not of type np.ndarray")

    _style = _get_flier_style(theme)

    # negative-exponential increase in figure size with more features
    def _figure_spacing(x):
        return np.exp(-0.35 * x) * x

    if with_strip:
        outliers = False
    if ax is None and vert:
        fig, ax = plt.subplots(
            figsize=(2.5 + _figure_spacing(_data.shape[1]), 7))
    elif ax is None and not vert:
        fig, ax = plt.subplots(
            figsize=(7, 2.5 + _figure_spacing(_data.shape[1])))
    if spines is None:
        spines = ("left", "top", "right", "bottom")

    # sort the data by the mean if selected
    if sort:
        _order = np.argsort(np.mean(_data, axis=0))
        _data = _data[:, _order]
        # labels only exist for DataFrame input
        if _labels is not None:
            _labels = _labels[_order]

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(_data,
                           vert=vert,
                           patch_artist=True,
                           widths=width,
                           showfliers=outliers,
                           notch=notch,
                           flierprops=_style,
                           boxprops=dict(alpha=box_alpha),
                           **plot_kwargs)
    # define boxplot extras
    _define_boxplot_arguments(ax, patch_obj, vert, measured, grid, spines,
                              capsize, None)
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors, k=_data.shape[1])
    # label axes
    _label_axes(ax, _labels, vert, label_rotation, label_max_length)
    if title is not None:
        ax.set_title(title)
    # perform stripplots
    if with_strip:
        for n in range(_data.shape[1]):
            # plot x strips
            _overlay_stripplot(_data[:, n], ax, n + 1, width, colors[n],
                               vert, outliers, strip_jitter)
    return ax
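
# Example usage (a minimal sketch; random data for illustration only):
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> wide = pd.DataFrame(np.random.randn(100, 4),
#   ...                     columns=["a", "b", "c", "d"])
#   >>> ax = widebox(wide, with_strip=True, title="wide-form boxes")
#   # one box per column, sorted by column mean, with a strip overlay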
def bibox1d(X: _ArrayLike,
            Y: _ArrayLike,
            colors: Optional[_ListLike] = None,
            labels: Optional[_ListLike] = None,
            measured: Optional[str] = None,
            ax: Optional[mpl.axes.Axes] = None,
            mannwhitney: bool = True,
            with_strip: bool = False,
            vertical: bool = True,
            notch: bool = False,
            capsize: float = 1.0,
            outliers: bool = True,
            grid: bool = True,
            width: Union[float, List[float]] = 0.7,
            label_rotation: float = 0.0,
            label_max_length: int = 25,
            spines: Optional[_ListLike] = None,
            strip_jitter: float = 0.15,
            theme: str = "white_circle",
            **plot_kwargs):
    """Plots two 1-dimensional boxplots using vectors `X`, `Y`.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The first data column to draw. Must be numeric.
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The second data column to draw. Must be numeric.
    colors : str/list of str, optional
        If None, uses a default color
    labels : str/list of str, optional
        If set, draws this on the appropriate axis; if None, does nothing.
        If X/Y is of type pandas.Series, uses its name instead.
    measured : str, optional
        A label to define what the measurement is
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    mannwhitney : bool, default=True
        If True, performs a Mann-Whitney U test between the values
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a
        similar colour. `outliers` is set to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays fliers as outliers
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation of the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it is truncated
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    strip_jitter : float, default=0.15
        With stripplot, defines the amount of jitter in the variables
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square',
        'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot

    See Also
    --------
    matplotlib.pyplot.boxplot

    References
    ----------
    Inspiration from
    https://github.com/jbmouret/matplotlib_for_papers#colored-boxes
    """
    instance_check((X, Y), (list, tuple, np.ndarray, pd.Series))
    instance_check((colors, labels, spines), (type(None), list, pd.Index))
    instance_check(ax, (type(None), mpl.axes.Axes))
    instance_check((mannwhitney, vertical, notch, outliers, grid, with_strip),
                   bool)
    instance_check((capsize, width, strip_jitter, label_rotation), (float, int))
    instance_check(theme, str)
    instance_check(label_max_length, int)
    bounds_check(strip_jitter, 0.0, 1.0)

    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    _style = _get_flier_style(theme)

    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(3.5, 7))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(7, 3.5))
    if with_strip:
        outliers = False
    if spines is None:
        if vertical and mannwhitney:
            spines = ("bottom", "left", "right")
        elif not vertical and mannwhitney:
            spines = ("bottom", "left", "top")
        else:
            spines = ("bottom", "left", "top", "right")

    # sort out labels
    if labels is None:
        labels = [
            X.name if isinstance(X, pd.Series) else "",
            Y.name if isinstance(Y, pd.Series) else "",
        ]

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot([_X, _Y],
                           vert=vertical,
                           patch_artist=True,
                           showfliers=outliers,
                           notch=notch,
                           widths=width,
                           flierprops=_style,
                           boxprops=dict(alpha=box_alpha),
                           **plot_kwargs)
    # define boxplot extras
    _define_boxplot_arguments(ax, patch_obj, vertical, measured, grid,
                              spines, capsize, None)
    # define basic colours - overrides if needs be
    colors = _kcolor_arrangement(patch_obj, colors)
    # label axes
    _label_axes(ax, labels, vertical, label_rotation, label_max_length)
    # if we have stripplot, draw this
    if with_strip:
        # plot x strips
        _overlay_stripplot(_X, ax, 1, width, colors[0], vertical, outliers,
                           strip_jitter)
        _overlay_stripplot(_Y, ax, 2, width, colors[1], vertical, outliers,
                           strip_jitter)
    # if we have mann-whitney, append this info
    if mannwhitney:
        # determine mann-whitney U test
        z, p = mannwhitneyu(_X, _Y)
        # convert one-sided p-value to two-sided
        p *= 2
        star = _get_stars(p)
        # get dimensions to annotate
        joined = np.concatenate((_X, _Y))
        _max, _min = np.max(joined), np.min(joined)
        # annotate on mann-whitney test
        if vertical:
            ax.annotate(
                "",
                xy=(1, _max),
                xycoords="data",
                xytext=(2, _max),
                textcoords="data",
                arrowprops=dict(arrowstyle="-", ec="#666666",
                                connectionstyle="bar,fraction=0.2"),
            )
            # add mw text
            ax.text(
                1.5,
                _max + np.abs(_max - _min) * 0.1,
                star,
                horizontalalignment="center",
                verticalalignment="center",
            )
        else:
            ax.annotate(
                "",
                xy=(_max, 2),
                xycoords="data",
                xytext=(_max, 1),
                textcoords="data",
                arrowprops=dict(arrowstyle="-", ec="#666666",
                                connectionstyle="bar,fraction=0.2"),
            )
            # add mw text
            ax.text(
                _max + np.abs(_max - _min) * 0.1,
                1.5,
                star,
                horizontalalignment="center",
                verticalalignment="center",
            )
    return ax
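
# Example usage (a minimal sketch; random samples for illustration only):
#
#   >>> import numpy as np
#   >>> a = np.random.randn(200)
#   >>> b = np.random.randn(200) + 0.5
#   >>> ax = bibox1d(a, b, labels=["control", "treated"], mannwhitney=True)
#   # draws two boxes plus a significance bar from the Mann-Whitney U test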
def box1d(X: _ArrayLike,
          color: Optional[str] = None,
          label: Optional[str] = None,
          ax: Optional[mpl.axes.Axes] = None,
          with_strip: bool = False,
          vertical: bool = True,
          notch: bool = False,
          capsize: float = 1.0,
          outliers: bool = True,
          axis_scale: Optional[Union[str, Callable]] = None,
          grid: bool = True,
          width: float = 0.7,
          label_rotation: float = 0.0,
          label_max_length: int = 25,
          spines: Optional[_ListLike] = None,
          theme: str = "white_circle",
          **plot_kwargs):
    """Plots a 1-dimensional boxplot using a vector.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    color : str, optional
        If None, uses a default color
    label : str, optional
        If set, draws this on the appropriate axis; if None, does nothing.
        If X is of type pandas.Series, uses its name instead.
    ax : matplotlib.ax object, optional, default=None
        If None, creates a plot.
    with_strip : bool, default=False
        If True, draws a stripplot over the top of the boxplot, in a
        similar colour. `outliers` is set to False in this case
    vertical : bool, default=True
        Determines whether to draw the plot vertically or horizontally
    notch : bool, default=False
        Determines whether to draw a notched plot
    capsize : float, default=1.0
        Defines the length of the caps
    outliers : bool, default=True
        If True, displays fliers as outliers
    axis_scale : str/callable, optional
        Scales the data along the axis.
        If str, use {'log', 'sqrt', 'log2'}
        If callable, must reference a `np.*` function which takes array X
        and returns X'
    grid : bool, default=True
        If True: draws gridlines for the numeric axis
    width : float, default=0.7
        Determines the width/height of the box
    label_rotation : float, default=0
        The degrees of rotation of the ticklabels
    label_max_length : int, default=25
        If any label exceeds this length, it is truncated
    spines : tuple, default=('top', 'left', 'bottom', 'right')
        Defines which spines are to be visible
    theme : str, default="white_circle"
        Choose a 'theme' for the outliers, from {'red_square',
        'green_diamond'}

    Other Parameters
    ----------------
    plot_kwargs : dict
        keyword arguments to pass to `ax.boxplot`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-boxplot
    """
    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((vertical, notch, outliers, grid, with_strip), bool)
    instance_check(spines, (type(None), list))
    instance_check(theme, str)
    instance_check((label, color), (type(None), str))
    instance_check((capsize, width), float)
    instance_check(label_rotation, (int, float))
    instance_check(label_max_length, int)
    bounds_check(width, 0.0, 1.0)

    # convert option to numpy
    _X = as_flattened_numpy(X)
    _style = _get_flier_style(theme)
    # convert X data if we have axis_scale
    if axis_scale:
        _X = _convert_x_scale(_X, axis_scale)
    if with_strip:
        outliers = False
    if ax is None and vertical:
        fig, ax = plt.subplots(figsize=(2.5, 5))
    elif ax is None and not vertical:
        fig, ax = plt.subplots(figsize=(5, 2.5))
    if spines is None:
        spines = ("left", "top", "right", "bottom")

    box_alpha = 1.0 if not with_strip else 0.5

    patch_obj = ax.boxplot(_X,
                           vert=vertical,
                           patch_artist=True,
                           showfliers=outliers,
                           notch=notch,
                           widths=width,
                           boxprops=dict(alpha=box_alpha),
                           flierprops=_style,
                           **plot_kwargs)
    # define basic arguments
    _define_boxplot_arguments(ax, patch_obj, vertical, None, grid, spines,
                              capsize, axis_scale)
    # define colour features
    color = _color_arrangement(ax, patch_obj, color)
    # label the appropriate axes
    _label_axes(
        ax,
        X.name if isinstance(X, pd.Series) else label,
        vertical,
        label_rotation,
        label_max_length,
    )
    # plot the strips
    if with_strip:
        _overlay_stripplot(_X, ax, 1, width, color, vertical, outliers,
                           strip_jitter=0.15)
    return ax
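
# Example usage (a minimal sketch; random data for illustration only):
#
#   >>> import numpy as np
#   >>> x = np.random.exponential(2.0, size=500)
#   >>> ax = box1d(x, label="waiting time", axis_scale="log")
#   # a single box of log-scaled data with default styling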
def histogram(X: _ArrayLike,
              kde: str = "freeform",
              bins: Optional[Union[int, _ListLike]] = None,
              density: bool = True,
              stat: bool = False,
              ax: Optional[mpl.axes.Axes] = None,
              x_label: str = "",
              title: str = "",
              kde_range: float = 1e-3,
              smoothen_kde: bool = True,
              verbose: int = 0,
              *hist_args,
              **hist_kwargs) -> mpl.axes.Axes:
    """Draws pretty histograms using `X`.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw. Must be numeric.
    kde : str/tuple of str, optional, default="freeform"
        If None, does not draw a KDE plot
        If 'freeform': fits the best KDE to the points
        If 'auto': attempts to fit the best `continuous` distribution
        If list/tuple: uses 'auto' to fit the best distribution out of the
        options; else, choose from available distributions in `scipy.stats`
    bins : int, optional
        If None, uses an optimal algorithm to find the best bin count
    density : bool, default=True
        If True, uses density approximation
    stat : bool, default=False
        If True, sets statistical variables in legend
    ax : matplotlib.ax object, optional, default=None
        If None, creates one.
    x_label : str, optional, default=""
        If empty, an x-label is derived from the data.
    title : str, optional, default=""
        The title of the axes.
    kde_range : float, default=1e-3
        Defines the precision of the KDE range; the KDE is evaluated
        between the (kde_range, 1 - kde_range) bounds. Must be > 0.
    smoothen_kde : bool, default=True
        If discrete-distribution, applies smoothing function to KDE if True
    verbose : int, default=0
        If > 0, prints out useful messages

    Other Parameters
    ----------------
    hist_args : list
        Arguments to pass to `ax.hist`
    hist_kwargs : dict
        Keyword arguments to pass to `ax.hist`

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-histogram
    """
    instance_check(X, (np.ndarray, pd.Series, list, tuple))
    instance_check((density, stat, smoothen_kde), bool)
    instance_check((title, x_label), str)
    instance_check(kde, (str, type(None), list, tuple))
    instance_check(kde_range, float)
    bounds_check(verbose, 0, 4)

    # convert to numpy.
    _X = as_flattened_numpy(X)
    # make bins if set to None
    if bins is None:
        # if X is float, use freedman_diaconis_bins determinant,
        # else simply np.arange for integer input.
        bins = get_bins(_X)
    if kde:
        density = True
    # plot histogram
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 5))

    if stat:
        stat_label = "mean: {:0.2f}, sd: {:0.3f},\n skew: {:0.3f} kurt: {:0.3f}".format(
            np.nanmean(_X), np.nanstd(_X), stats.skew(_X), stats.kurtosis(_X))
        # plot the histogram
        _plot_hist(_X, ax, bins=bins, density=density, rwidth=0.9,
                   label=stat_label, *hist_args, **hist_kwargs)
        ax.legend(loc="best")
    else:
        # plot the histogram
        _plot_hist(_X, ax, bins=bins, density=density, rwidth=0.9,
                   *hist_args, **hist_kwargs)

    ax.set_title(title)

    if density:
        ax.set_ylabel("Density")
    else:
        ax.set_ylabel("Counts")

    if kde is not None:
        if kde == "auto" or isinstance(kde, (list, tuple)):
            # uses slim parameters by default
            auto_fitted = auto_fit(_X, kde)
            best_model_ = auto_fitted.loc[auto_fitted["r"].idxmax()]
            # fit the KDE under the best-fitting distribution; pass the
            # user's kde_range rather than a hardcoded 1e-3
            x_kde, y_kde, model = univariate_kde(
                _X,
                bins,
                best_model_.name,
                kde_range=kde_range,
                smoothen_kde=smoothen_kde,
                verbose=verbose,
                return_dist=True,
            )
        elif (kde == "freeform") or hasattr(stats, kde):
            # fetches the kde if possible
            auto_fitted = None
            x_kde, y_kde, model = univariate_kde(
                _X,
                bins,
                kde,
                kde_range=kde_range,
                smoothen_kde=smoothen_kde,
                verbose=verbose,
                return_dist=True,
            )
        else:
            raise ValueError(
                "kde value '{}' not found in scipy.stats".format(kde))
        # plot
        ax.plot(x_kde, y_kde, "-", color="r")
    else:
        auto_fitted = None
        model = None

    if x_label == "":
        x_label = _assign_x_label(
            title,
            X.name if isinstance(X, pd.Series) else "",
            kde is not None,
            auto_fitted,
            model if not kde == "freeform" else None,
        )

    ax.set_xlabel(x_label)
    return ax
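
# Example usage (a minimal sketch; random data for illustration only):
#
#   >>> import numpy as np
#   >>> x = np.random.gamma(2.0, 2.0, size=1000)
#   >>> ax = histogram(x, kde="auto", stat=True)
#   # fits the best scipy.stats distribution and overlays its KDE in red,
#   # with mean/sd/skew/kurtosis shown in the legend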
def test_bounds_check(self):
    assert utils.bounds_check(math.pi, math.pi - 0.00001, math.pi + 0.00001)
    assert utils.bounds_check(5, 5 - 1, 5 + 1)
def basic(
    df: Union[pd.DataFrame, "MetaPanda"],
    y: str,
    x: Optional[SelectorType] = None,
    cv: Union[int, Tuple[int, int]] = 5,
    model: str = "LinearRegression",
    cache: Optional[str] = None,
    plot: bool = False,
    verbose: int = 0,
    **model_kws
):
    """Performs a rudimentary fit model with no parameter searching.

    This function helps to provide a broad overview of how successful a
    given model is on the inputs of x -> y. `cv` returns scoring and timing
    metrics, as well as coefficients if available, whereas `yp` provides
    predicted values for each given `y`.

    Parameters
    ----------
    df : DataFrame / MetaPanda
        The main dataset.
    y : str
        Target/dependent variable (as column)
    x : list / tuple of str, optional
        A list of selected column names for independent variables. If None,
        uses all columns except the `y` column
    cv : int / tuple, default=5
        If int: just reflects number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    model : str / sklearn model, default="LinearRegression"
        The name of a scikit-learn model, or the model object itself.
    cache : str, optional
        If not None, stores the resulting model parts in JSON and reloads
        if present.
    plot : bool, default=False
        If True, produces `overview_plot` inplace.
    verbose : int, default=0
        If > 0, prints out statements depending on level.

    Other Parameters
    ----------------
    model_kws : dict, optional
        Keywords to pass to the sklearn model which are not parameterized.

    Returns
    -------
    cv : MetaPanda
        A dataframe result of cross-validated repeats. Can include w_
        coefficients.
    yp : pd.DataFrame
        The predictions for each of y (a single column named after `y`)

    Notes
    -----
    Shorthand names for the models, i.e. `lm` for LinearRegression or
    `gauss` for a GaussianProcessRegressor, are accepted.

    By default, `fit_basic` uses the root mean squared error (RMSE). There
    is currently no option to change this.

    By default, this model assumes you are working with a regression
    problem. Classification compatibility will arrive in a later version.

    See Also
    --------
    fit_grid : Performs exhaustive grid search analysis on the models
        selected.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
       JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, (pd.DataFrame, MetaPanda))
    instance_check(x, (type(None), str, list, tuple, pd.Index))
    instance_check(y, str)
    instance_check(cv, (int, tuple))
    instance_check(cache, (type(None), str))
    instance_check(plot, bool)
    bounds_check(verbose, 0, 4)
    assert is_sklearn_model(model), \
        "model '{}' is not a valid sklearn model.".format(model)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    rep = _define_regression_kfold_object(cv)
    lm, pkg_name = find_sklearn_model(model, "regression")
    # assign keywords to lm
    lm.set_params(**model_kws)
    # make data set machine learning ready.
    _x, _y = preprocess_continuous_X_y(_df, _xcols, y)

    if verbose > 0:
        print("full dataset: {}x{} -> ML ready: {}x{}".format(
            _df.shape[0], _df.shape[1], _x.shape[0], _x.shape[1]))

    # function 1: performing cross-validated fit.
    def _perform_cv_fit(
        _x: np.ndarray, _columns: pd.Index, _y: np.ndarray, _rep, _lm,
        package_name: str
    ):
        # cv cross-validate and wrap.
        score_mat = pd.DataFrame(
            cross_validate(
                _lm,
                _x,
                _y,
                cv=_rep,
                scoring="neg_root_mean_squared_error",
                return_estimator=True,
                return_train_score=True,
                n_jobs=-2,
            )
        )
        # append fold indices to cv; with RepeatedKFold, the folds cycle
        # n_repeats times, hence np.tile rather than np.repeat
        if isinstance(_rep, RepeatedKFold):
            score_mat["k"] = np.tile(np.arange(_rep.n_splits), _rep.n_repeats)
        else:
            score_mat["k"] = np.arange(_rep.n_splits)
        # extract coefficients
        coef = _extract_coefficients_from_model(score_mat, _columns,
                                                package_name)
        # integrate coefficients
        if not isinstance(coef, (list, tuple)):
            score_mat = score_mat.join(coef.add_prefix("w__"))
        # drop estimator
        score_mat.drop("estimator", axis=1, inplace=True)
        # wrap as metapanda and return
        return MetaPanda(score_mat)

    # function 2: performing cross-validated predictions.
    def _perform_prediction_fit(
        _x: np.ndarray, _y: np.ndarray, _ind: pd.Index, _yn: str, _rep, _lm
    ) -> pd.DataFrame:
        return pd.Series(cross_val_predict(_lm, _x, _y, cv=_rep),
                         index=_ind).to_frame(_yn)

    if cache is not None:
        cache_cv = insert_suffix(cache, "_cv")
        cache_yp = insert_suffix(cache, "_yp")
        _cv = cache_function(
            cache_cv,
            _perform_cv_fit,
            _x=_x,
            _columns=_xcols,
            _y=_y,
            _rep=rep,
            _lm=lm,
            package_name=pkg_name,
        )
        _yp = cache_function(
            cache_yp,
            _perform_prediction_fit,
            _x=_x,
            _y=_y,
            _ind=_df.index,
            _yn=y,
            _rep=rep,
            _lm=lm,
        )
    else:
        _cv = _perform_cv_fit(_x, _xcols, _y, rep, lm, pkg_name)
        # note the argument order: (X, y, index, name, cv, model)
        _yp = _perform_prediction_fit(_x, _y, _df.index, y, rep, lm)

    if plot:
        overview(_df, x, y, _cv, _yp)
    # return both.
    return _cv, _yp
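
# Example usage (a minimal sketch; `mdf` stands in for a dataset with a
# continuous target "y" — illustrative, not from the source):
#
#   >>> cv_scores, y_pred = basic(mdf, y="y", cv=(10, 3), model="lm")
#   # cv_scores holds per-fold RMSE (train/test) and w__ coefficients;
#   # y_pred holds out-of-fold predictions aligned to the original index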
def pca(
    df: Union[np.ndarray, pd.DataFrame, MetaPanda],
    x: Optional[SelectorType] = None,
    preprocess: bool = True,
    refit: bool = False,
    with_transform: bool = False,
    plot: bool = False,
    whiten: bool = False,
    sparsity: float = 0.0,
    variance_threshold: float = 0.9,
    plot_kwargs: Optional[Dict] = None,
):
    """Fits a PCA model to the data set.

    .. note:: Supports vectorization and `Param`. See `turb.vectorize`.

    Parameters
    ----------
    df : np.ndarray / pd.DataFrame / MetaPanda
        The full dataset
    x : selector, optional
        A subset of df to select (if MetaPanda)
    preprocess : bool, default=True
        Preprocesses the data matrix X if set. Only preprocesses if the
        input is a pandas.DataFrame or above. Uses the `.pipe.clean1`
        function, which includes zscore, dropping object columns and NA.
    refit : bool, default=False
        If True, a second PCA model is fitted using the 'best' proportional
        variance/AUC, which is returned.
    with_transform : bool, default=False
        If True, returns transformed `X` as a second argument.
    plot : bool, default=False
        If True, plots an 'overview' of the PCA result
    whiten : bool, default=False
        When True (False by default), the components_ vectors are
        multiplied by the square root of n_samples and then divided by the
        singular values to ensure uncorrelated outputs with unit
        component-wise variances. Whitening will remove some information
        from the transformed signal (the relative variance scales of the
        components) but can sometimes improve the predictive accuracy of
        the downstream estimators by making their data respect some
        hard-wired assumptions.
    sparsity : float, default=0.0
        If `sparsity` > 0, uses the `SparsePCA` algorithm to induce sparse
        components using the L1 norm.
    variance_threshold : float, default=0.9
        Determines the threshold of 'cumulative proportional variance' to
        select a refitted model from. Must be 0 <= `variance_threshold` <= 1.
    plot_kwargs : dict, optional
        optional arguments to pass to `pca_overview`.

    Returns
    -------
    model : sklearn.decomposition.PCA
        A PCA model
    X_t : np.ndarray/pd.DataFrame, optional
        The transformed input matrix `X`. Returned if `with_transform` is
        True.
    """
    instance_check(df, (np.ndarray, pd.DataFrame, MetaPanda))
    instance_check((preprocess, plot, whiten, refit, with_transform), bool)
    instance_check(plot_kwargs, (type(None), dict))
    instance_check(sparsity, float)
    bounds_check(variance_threshold, 0.0, 1.0)

    # define our selected columns
    if x is None:
        if not isinstance(df, np.ndarray):
            x = df.columns
        else:
            x = pd.Index(patproduct("X%d", range(df.shape[1])))
    # extract x columns
    if isinstance(df, MetaPanda):
        cols = df.view(x)
    else:
        cols = x
    # generate ML ready subset
    if preprocess and not isinstance(df, np.ndarray):
        _x = preprocess_continuous_X(df, cols)
    else:
        _x = df

    # determine PCA model
    _model = _create_pca_model(_x.shape[1], sparsity, whiten)
    # fit the model
    _model.fit(_x)

    if plot:
        if plot_kwargs is None:
            plot_kwargs = {}
        overview_pca(
            _model, labels=cols, cutoff_selection=variance_threshold,
            **plot_kwargs
        )
    # if we refit the model, refit it and return
    if refit:
        # calculate best index (N)
        _ycum = np.cumsum(_model.explained_variance_ratio_)
        new_n = np.where(_ycum > variance_threshold)[0][0] + 1
        # fit a new PCA model.
        _pcan = _create_pca_model(new_n, sparsity, whiten)
        _pcan.fit(_x)
        if with_transform:
            return _pcan, pd.DataFrame(_pcan.transform(_x), index=_x.index)
        else:
            return _pcan
    else:
        if with_transform:
            return _model, pd.DataFrame(_model.transform(_x), index=_x.index)
        else:
            return _model
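
# Example usage (a minimal sketch; random data for illustration only):
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> frame = pd.DataFrame(np.random.randn(200, 10))
#   >>> model, X_t = pca(frame, refit=True, with_transform=True,
#   ...                  variance_threshold=0.9)
#   # model keeps the fewest components explaining >= 90% of the variance;
#   # X_t is the projection of the data onto those components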
def correlate(
    data: Union[pd.DataFrame, MetaPanda],
    x: Optional[SelectorType] = None,
    y: Optional[SelectorType] = None,
    covar: Optional[SelectorType] = None,
    cartesian_covar: bool = False,
    output: str = "full",
    method: str = "spearman",
    verbose: int = 0,
) -> pd.DataFrame:
    """Correlates X and Y together to generate a list of correlations.

    If X/Y are MetaPandas, returns a MetaPanda object, else returns a
    pandas.DataFrame.

    Parameters
    ----------
    data : pd.DataFrame / MetaPanda
        The full dataset.
    x : (str, list, tuple, pd.Index), optional
        Subset of input(s) for column names.
        If None, uses the full dataset. Y must be None in this case also.
    y : (str, list, tuple, pd.Index), optional
        Subset of output(s) for column names.
        If None, uses the full dataset (from optional `x` subset)
    covar : (str, list, tuple, pd.Index), optional
        set of covariate(s). Covariates are needed to compute partial
        correlations. If None, uses standard correlation.
    cartesian_covar : bool, default=False
        If True, and if covar is not None, separates every element in covar
        to individually control for, using the cartesian product
    output : str, default="full"
        Choose from {'full', 'score'}. 'score' just returns the `r` number.
    method : str, default="spearman"
        Method to correlate with. Choose from:
            'pearson' : Pearson product-moment correlation
            'spearman' : Spearman rank-order correlation
            'kendall' : Kendall’s tau (ordinal data)
            'biserial' : Biserial correlation (continuous and boolean data only)
            'percbend' : percentage bend correlation (robust)
            'shepherd' : Shepherd's pi correlation (robust Spearman)
            'skipped' : skipped correlation (robust Spearman, requires sklearn)
    verbose : int, default=0
        If > 0, prints out useful debugging messages

    Returns
    -------
    R : pd.DataFrame
        correlation rows (based on pingouin structure)

    Examples
    --------
    >>> import turbopanda as turb
    >>> data = turb.read('example.json')
    >>> R = turb.correlate(data)  # uses full dataset
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'))  # uses subset of dataset
              X         M         Y
    X  1.000000  0.392251  0.059771
    M  0.392251  1.000000  0.545618
    Y  0.059771  0.545618  1.000000
    # correlates X columns against Ybin
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'), y='Ybin')
                 X         M         Y
    Ybin  1.000000  0.392251  0.059771
    # correlates X against Ybin, controlling for Y
    >>> R = turb.correlate(data, x='X', y='Ybin', covar='Y')
                    Y
    X Ybin  -0.149210
    # using a different technique
    >>> R = turb.correlate(data, method="shepherd")
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    """
    # data cannot be NONE
    instance_check(data, (pd.DataFrame, MetaPanda))
    instance_check((x, y, covar), (type(None), str, list, tuple, pd.Index))
    instance_check(cartesian_covar, bool)
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("full", "score"))
    bounds_check(verbose, 0, 4)

    # downcast to dataframe option
    df = data.df_ if not isinstance(data, pd.DataFrame) else data
    # downcast if list/tuple/pd.index is of length 1
    x = x[0] if (isinstance(x, (tuple, list, pd.Index)) and len(x) == 1) else x
    y = y[0] if (isinstance(y, (tuple, list, pd.Index)) and len(y) == 1) else y
    # convert using `view` if we have string instances.
    if isinstance(x, str):
        x = pattern(x, df.columns)
    if isinstance(y, str):
        y = pattern(y, df.columns)
    if isinstance(covar, str):
        covar = pattern(covar, df.columns)

    # perform a check to make sure every column in `covar` is continuous.
    if covar is not None:
        if not is_dataframe_float(data[covar]):
            raise TypeError(
                "`covar` variables in `correlate` all must be of type "
                "`float`/continuous."
            )

    # execute various use cases based on the presence of x, y, and covar,
    # respectively.
    if x is None and y is None:
        # here just perform matrix-based correlation
        comb = it.combinations_with_replacement(df.columns, 2)
        # number of pairs with replacement: n * (n + 1) / 2
        n = df.columns.shape[0]
        niter = n * (n + 1) // 2
    elif isinstance(x, (list, tuple, pd.Index)) and y is None:
        # use a subset of x, in union with covar
        comb = it.combinations_with_replacement(x, 2)
        niter = len(x) * (len(x) + 1) // 2
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(y, str):
        # list of x, y str -> matrix-vector cartesian product
        comb = it.product(x, [y])
        niter = len(x)
    elif isinstance(y, (list, tuple, pd.Index)) and isinstance(x, str):
        # list of y, x str -> matrix-vector cartesian product
        comb = it.product(y, [x])
        niter = len(y)
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(
        y, (list, tuple, pd.Index)
    ):
        # list of x, y -> cartesian product of x: y terms
        comb = it.product(x, y)
        niter = len(x) * len(y)
    else:
        raise ValueError(
            "X: {}; Y: {}; Z: {} combination unknown.".format(x, y, covar))

    # return the combination of these effects.
    return _corr_combination(
        df, comb, niter, covar, cartesian_covar, method, output, verbose
    )
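
# Example usage beyond the docstring (a minimal sketch; the column names
# reuse those from the docstring example data):
#
#   >>> R = correlate(data, x="X", y="Ybin", covar=("Y", "M"),
#   ...               cartesian_covar=True, output="score")
#   # controls for each covariate individually and returns just the r values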