def common_substrings(
    a: Union[str, List[str]],
    b: Optional[Union[str, List[str]]] = None,
    min_length: int = 2,
) -> Union[str, Series]:
    """Given at least one pair of strings, find all the best common substring matches.

    By default, if only `a` is passed, the pairwise combinations between all values
    in the list are used; otherwise with `a` and `b`, the cartesian product of the
    two lists is used.

    Parameters
    ----------
    a : str/list of str
        A word or list of words to find the common substring to
    b : str/list of str, optional
        A word or list of words to find the common substring to.
        If None, pairwise combinations in `a` are used
    min_length : int, default=2
        The minimum accepted length of string for a given pair

    Returns
    -------
    z_up : str/Series
        str returned if (a, b) are strs, else Series of value counts
    """
    instance_check(a, (str, list, tuple, Index))
    instance_check(b, (type(None), str, list, tuple, Index))
    nonnegative(min_length, int)
    # prevent the case where a is a str while b is None
    disallow_instance_pair(a, str, b, type(None))

    filters = ("", "_", "__", "-")

    if isinstance(a, str) and isinstance(b, type(None)):
        return a
    elif isinstance(a, str) and isinstance(b, str):
        return _single_common_substring_match(a, b)
    else:
        if isinstance(a, str):
            a = [a]
        elif isinstance(b, str):
            b = [b]
        # determine pair set.
        if b is None:
            # combination iterator
            pair_groups = it.combinations(a, 2)
        else:
            # cartesian product iterator
            pair_groups = it.product(a, b)
        # generate pairs
        z = [_single_common_substring_match(i, j) for i, j in pair_groups]

        def filter_func(x):
            """Returns True for elements to discard: filtered tokens, strings
            shorter than `min_length`, or matches occurring only once."""
            return (x in filters) or (len(x) < min_length) or (z.count(x) <= 1)

        # filter out unwanted elements
        z_up = list(it.filterfalse(filter_func, z))
        # save as a series of value counts.
        return Series(z_up, dtype=object).value_counts()
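A minimal usage sketch (illustrative only; the column-like names below are assumptions, not from the library's documentation):

names = ["price_mean", "price_std", "weight_mean"]
# pairwise combinations within `names`; returns a Series counting shared substrings
common_substrings(names, min_length=4)
# comparing one list against another uses the cartesian product instead
common_substrings(["price_mean"], ["price_std", "weight_mean"], min_length=4)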
def density(
    X: np.ndarray,
    Y: Optional[np.ndarray] = None,
    Z: Optional[np.ndarray] = None,
    r: Optional[int] = None,
) -> np.ndarray:
    """Estimates the density of X using binning; accepts np.ndarray.

    Parameters
    ----------
    X : np.ndarray (n,)
        The first dimension
    Y : np.ndarray (n,), optional
        The second dimension
    Z : np.ndarray (n,), optional
        The third dimension
    r : int, optional
        The number of bins for each dimension.
        If None, uses the Freedman-Diaconis rule

    Returns
    -------
    d : np.ndarray (r,...)
        The density in binned dimensions
    """
    instance_check(X, np.ndarray)
    instance_check((Y, Z), (type(None), np.ndarray))
    instance_check(r, (type(None), int))

    if r is None:
        r = min(freedman_diaconis_bins(X), 50)
    else:
        nonnegative(r, int)

    if Y is None and Z is None:
        _X = remove_na(X)
        return np.histogram(_X, bins=r, density=True)[0]
    elif Z is None:
        _X, _Y = remove_na(X, Y, paired=True)
        return np.histogram2d(_X, _Y, bins=(r, r), density=True)[0]
    else:
        return np.histogramdd(np.vstack((X, Y, Z)).T, bins=(r, r, r), density=True)[0]
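A short sketch of how `density` might be called (the synthetic data is an assumption for illustration):

import numpy as np

x = np.random.normal(size=1000)
y = np.random.normal(size=1000)
d1 = density(x)           # 1d density, bin count chosen by the Freedman-Diaconis rule
d2 = density(x, y, r=20)  # 2d density on a 20x20 grid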
def head(self, k: int = 5) -> pd.DataFrame:
    """Look at the top k rows of the dataset.

    See `pd.DataFrame.head` documentation for details.

    Parameters
    --------
    k : int, optional
        Must be 0 < k < n.

    Returns
    -------
    ndf : pandas.DataFrame
        First k rows of df_

    See Also
    --------
    pandas.DataFrame.head : Return the first n rows.
    """
    nonnegative(k, int)
    return self.df_.head(k)
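Usage mirrors pandas (a sketch, assuming `mdf` is an existing instance of the class this method belongs to):

mdf.head(10)  # first 10 rows of the underlying df_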
def covariance_matrix(p, corr_ratio=0.5, diag_var=1., random_direction=False):
    """Generates a randomly-generated 'correlated' covariance matrix.

    This is useful in situations where you want to create correlated synthetic
    data to test an algorithm. Bear in mind that `corr_ratio` follows the
    relationship rho in [-1/(p-1), 1], so negative correlations will be clipped
    at higher dimensions to ensure positive semi-definite structures.

    Parameters
    ----------
    p : int
        The number of dimensions. p must be >= 2
    corr_ratio : float [-1..1]
        The proportion of 'correlation' within the matrix; 0 no correlation,
        1 full positive correlation and -1 full negative correlation.
    diag_var : float
        The values on the diagonal, with small error (5e-03)
    random_direction : bool, default=False
        Correlation is randomly positive or negative if True,
        else uses the sign(corr_ratio).

    Returns
    -------
    cov : np.ndarray((p, p))
        Covariance matrix
    """
    nonnegative(p, int)
    # p must be greater than 1 to be multivariate gaussian.
    if p < 2:
        raise ValueError("'p' must be > 1")
    # clip the ratio into the range [1/(p-1), 1)
    _corr_ratio = np.clip(corr_ratio, 1. / (p - 1), 0.999)
    if not np.isclose(corr_ratio, _corr_ratio):
        warnings.warn(
            "`corr_ratio` parameter is clipped from {:0.3f} to [{:0.3f}, 1]".format(
                corr_ratio, _corr_ratio
            )
        )
    return _create_cov_matrix(p, _corr_ratio, diag_var, random_direction)
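A hedged sketch of using the returned matrix to draw correlated synthetic samples (the call to `np.random.multivariate_normal` is an illustration, not part of this function):

import numpy as np

cov = covariance_matrix(5, corr_ratio=0.6)
X = np.random.multivariate_normal(np.zeros(5), cov, size=200)  # (200, 5) correlated data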
def optimize(df: "MetaPanda",
             x: SelectorType,
             y: str,
             models,
             cv: int = 5,
             verbose: int = 0):
    """Performs optimization grid analysis on the models selected.

    This uses `scipy.optimize` to minimize continuous parameters,
    for example `alpha` in a Lasso model.

    .. note:: optimization only works on *continuous* parameters with each model.

    TODO: complete `.ml.fit.optimize` function

    Parameters
    ----------
    df : MetaPanda
        The main dataset.
    x : list/tuple of str
        A list of selected column names for x or MetaPanda `selector`.
    y : str
        A selected y column.
    models : tuple/dict
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict:
            key (parameter name), value (list of values)
    cv : int/tuple, optional (5, 10)
        If int: just reflects the number of cross-validations
        If tuple: (cross_validation, n_repeats) for `RepeatedKFold`
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result from GridSearchCV detailing iterations and all scores.

    Notes
    -----
    By default, `optimize` tunes using the root mean squared error (RMSE);
    there is currently no option to change this. By default, this model assumes
    you are working with a regression problem; classification compatibility
    will arrive in a later version.

    See Also
    --------
    grid : Performs exhaustive grid search analysis on the models selected.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified
        parameter values for an estimator.

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al.,
           JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, MetaPanda)
    instance_check(x, (str, list, tuple, pd.Index))
    instance_check(y, str)
    nonnegative((cv, verbose), int)
    instance_check(models, (tuple, list, dict))
    bounds_check(verbose, 0, 4)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _xnp, _y = preprocess_continuous_X_y(_df, _xcols, y)

    # define the parameter sets
    param_sets = make_optimize_grid(models)

    for m, params in zip(models, param_sets):
        model = find_sklearn_model(m)[0]
        inits, bounds = optimize_grid_for_model(params)
        # minimize for every i element
        mins = [
            so.minimize(
                _min_cross_val_scores,
                x0=i,
                args=(_xnp, _y, model, params, cv),
                bounds=bounds,
            )
            for i in inits
        ]
    pass
def overview_pca(
    model,
    distance_color: bool = True,
    labels: Optional[pd.Index] = None,
    cutoff_selection: float = 0.9,
    n_samples_annotate: int = 6,
    n_pcs: int = 5,
    ax_size: int = 4,
):
    """Provides an overview plot from a PCA result.

    Parameters
    ----------
    model : sklearn.decomposition.PCA
        A fitted PCA model.
    distance_color : bool, default=True
        If True, plots the magnitude of each PC as a color
    labels : np.ndarray (n,) of str / pd.Series / list / tuple, optional
        If not None, provides a label for every PC component (dimension),
        and annotates the most 'outlier'-like samples in plot 1
    cutoff_selection : float, default=0.9
        The cutoff for proportional variance to select for
    n_samples_annotate : int, default=6
        Defines the number of labels to show if `labels` is not None in plot 1
    n_pcs : int, default=5
        The number of principal components to consider in plot 3
    ax_size : int, default=4
        The default size for each axes.
    """
    instance_check(distance_color, bool)
    instance_check(labels, (type(None), np.ndarray, pd.Series, pd.Index, list, tuple))
    nonnegative((n_samples_annotate, n_pcs, ax_size), int)

    if labels is not None:
        fig, axes = gridplot(3, ax_size=ax_size)
    else:
        fig, axes = gridplot(2, ax_size=ax_size)

    if n_samples_annotate > model.n_components_:
        n_samples_annotate = model.n_components_ - 1
    if n_pcs > model.n_components_:
        n_pcs = model.n_components_ - 1

    # 1. plot the scatter of PCs
    _plot_pca_scatter(model, axes[0], distance_color)
    # 2. plot the line AUC for explained variance
    _explained_variance_plot(model, axes[1], cutoff=cutoff_selection)
    # if we have labels, annotate the scatter plot with samples.
    if labels is not None:
        # check to make sure labels is the same length as components
        _annotate_on_magnitude(model, labels, n_samples_annotate, axes[0])
        # 3. plot the top N components by the most important eigenvector values
        _x3, _y3, _sel_labels = _best_principle_eigenvectors(
            model, labels=labels, k=n_samples_annotate, p=n_pcs
        )
        _best_eigenvector_plot(
            _x3, _y3, _sel_labels, axes[-1], nk=(n_samples_annotate, n_pcs)
        )
        axes[-1].set_title("Top {} eigenvectors".format(n_samples_annotate))

    fig.tight_layout()
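A minimal usage sketch (the scikit-learn dataset and fit below are assumptions chosen for illustration):

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

data = load_iris()
pca = PCA(n_components=4).fit(data.data)
overview_pca(pca, distance_color=True, n_pcs=3)  # scatter + explained-variance panels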
def hist_grid(mdf: Union[DataFrame, "MetaPanda"],
              subset: SelectorType,
              arrange: str = "square",
              plot_size: int = 3,
              shared_dist: str = "auto",
              savepath: Optional[Union[str, bool]] = None,
              **hist_kws):
    """Plots a grid of histograms comparing the distributions in a MetaPanda selector.

    Parameters
    --------
    mdf : turb.MetaPanda
        The dataset
    subset : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str
        Choose from ['square', 'row', 'column']. Square arranges the plot as
        square-like as possible. Row prioritises plots row-like, and column-wise
        for column.
    plot_size : int, default=3
        The size of each axes
    shared_dist : str/tuple of str/dict, default="auto"
        Determines which KDE to fit to the data; set to None if you don't want one.
        If tuple/list: attempts to fit these specified distributions.
        If dict: maps column name (k) to distribution choice (v).
    savepath : None, bool, str
        Saves the figure to file. If bool, uses the name in mdf, else uses the
        given string. If None, no figure is saved.

    Other Parameters
    ----------------
    hist_kws : dict
        Keywords to pass to `turb.plot.histogram`

    Returns
    -------
    None
    """
    # checks
    instance_check(shared_dist, (type(None), str, list, tuple, dict))
    instance_check(savepath, (type(None), str, bool))
    nonnegative(plot_size, int)
    belongs(arrange, ["square", "row", "column"])

    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selector
    selection = _mdf.view(subset)

    # assuming we've selected something...
    if selection.size > 0:
        fig, axes = gridplot(len(selection), arrange, ax_size=plot_size)

        if not isinstance(shared_dist, dict):
            for i, x in enumerate(selection):
                _ = histogram(_mdf[x].dropna(), ax=axes[i], title=x,
                              kde=shared_dist, **hist_kws)
            fig.tight_layout()
        else:
            for i, (x, d) in enumerate(shared_dist.items()):
                _ = histogram(_mdf[x].dropna(), ax=axes[i], title=x, kde=d, **hist_kws)
            # iterate over any 'remaining' columns in selection and handle appropriately
            remaining = difference(selection, tuple(shared_dist.keys()))
            if remaining.shape[0] > 0:
                for i, x in enumerate(remaining):
                    _ = histogram(_mdf[x].dropna(), ax=axes[i + len(shared_dist)],
                                  title=x, kde="auto", **hist_kws)
            fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "hist", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "hist", _mdf.name_, fp=savepath)
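A sketch of typical usage (the column names are placeholders; `mdf` is assumed to be a MetaPanda or DataFrame with those columns):

hist_grid(mdf, ["age", "income"], arrange="row", shared_dist="auto", savepath=True)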
def scatter_grid(
    mdf: Union[DataFrame, "MetaPanda"],
    x: SelectorType,
    y: SelectorType,
    arrange: str = "square",
    plot_size: int = 3,
    best_fit: bool = True,
    best_fit_deg: int = 1,
    savepath: Optional[Union[bool, str]] = None,
):
    """Plots a grid of scatter plots comparing each column in the x selector against the y target value.

    Parameters
    --------
    mdf : turb.MetaPanda
        The dataset
    x : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    y : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str
        Choose from ['square', 'row', 'column']. Square arranges the plot as
        square-like as possible. Row prioritises plots row-like, and column-wise
        for column.
    plot_size : int
        The size of each axes
    best_fit : bool
        If True, draws a line of best fit
    best_fit_deg : int, default=1
        The degree of the line of best fit; can draw a polynomial
    savepath : None, bool, str
        Saves the figure to file. If bool, uses the name in mdf, else uses the
        given string.

    Returns
    -------
    None
    """
    from turbopanda.corr import bicorr

    # checks
    instance_check((plot_size, best_fit_deg), int)
    instance_check(savepath, (type(None), str, bool))
    instance_check(best_fit, bool)
    nonnegative((best_fit_deg, plot_size), int)
    belongs(arrange, ["square", "row", "column"])

    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selectors
    x_sel = _mdf.view(x)
    y_sel = _mdf.view(y)

    # create a product between x and y and plot
    prod = list(it.product(x_sel, y_sel))

    if len(prod) > 0:
        fig, axes = gridplot(len(prod), arrange, ax_size=plot_size)
        for i, (_x, _y) in enumerate(prod):
            # pair x, y
            __x, __y = remove_na(_mdf[_x].values, _mdf[_y].values, paired=True)
            axes[i].scatter(__x.flatten(), __y, alpha=0.5)
            # line of best fit
            if best_fit:
                xn = np.linspace(__x.min(), __x.max(), 100)
                z = np.polyfit(__x.flatten(), __y, deg=best_fit_deg)
                axes[i].plot(xn, np.polyval(z, xn), "k--")
            # spearman correlation
            pair_corr = bicorr(_mdf[_x], _mdf[_y]).loc["spearman", "r"]
            axes[i].set_title("r={:0.3f}".format(pair_corr))
            axes[i].set_xlabel(_x)
            axes[i].set_ylabel(_y)
        fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "scatter", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "scatter", _mdf.name_, fp=savepath)
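A sketch of a call (the selector strings are placeholders for whatever columns or types the dataset exposes):

scatter_grid(mdf, x="float", y="target", best_fit=True, best_fit_deg=2, savepath=True)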
def gridplot(
    n_plots: int,
    arrange: str = "square",
    ax_size: Union[int, Tuple[int, int]] = 2,
    annotate_labels: bool = False,
    annotate_offset: float = 0.01,
    **annotate_args
):
    """Determines the most optimal shape for a set of plots.

    Parameters
    ----------
    n_plots : int
        The total number of plots.
    arrange : str, default="square"
        Choose from {'square', 'row', 'column'}. Indicates preference for
        direction of plots.
    ax_size : int, default=2
        The square size of each plot.
    annotate_labels : bool, default=False
        If True, adds an A, B, ..., K label to the top-left corner of each axes.
    annotate_offset : float, default=0.01
        Determines the amount of offset for each label

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure
    axes : list of matplotlib.axes.Axes
        A list of axes to use.
    """
    instance_check(annotate_labels, bool)
    nonnegative((n_plots,), int)
    belongs(arrange, ["square", "row", "column"])

    annot_props = {
        "weight": "bold",
        "horizontalalignment": "left",
        "verticalalignment": "center",
    }
    # update with user-supplied keyword arguments
    annot_props.update(annotate_args)

    if isinstance(ax_size, int):
        fs = np.array([ax_size, ax_size])
    else:
        fs = np.array(ax_size)

    if n_plots == 1:
        fig, ax = plt.subplots(figsize=fs)
        # wrap ax as a list to iterate over.
        if annotate_labels:
            fig.text(0.01, 0.98, "A", **annot_props)
        return fig, [ax]
    else:
        fig, ax = (
            _generate_square_like_grid(n_plots, ax_size=fs)
            if arrange == "square"
            else _generate_diag_like_grid(n_plots, arrange, ax_size=fs)
        )
        # add annotation labels to each axes
        if annotate_labels:
            # we use tight_layout to make sure text isn't overlapping
            fig.tight_layout()
            for a, n in zip(ax, string.ascii_uppercase):
                pos_ = a.get_position().bounds
                # add label
                fig.text(
                    pos_[0] - annotate_offset,
                    pos_[1] + pos_[3] + annotate_offset,
                    n,
                    **annot_props
                )
        return fig, ax
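A short sketch of how the helper composes with matplotlib (the plotted data is made up for illustration):

import numpy as np

fig, axes = gridplot(5, arrange="row", ax_size=3)
for i, ax in enumerate(axes):
    ax.plot(np.arange(10), np.random.rand(10))
    ax.set_title("plot {}".format(i))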
def scatter_slim(X: _ArrayLike,
                 Y: _ArrayLike,
                 bins: Optional[int] = None,
                 threshold: Union[int, float] = 50,
                 **turbo_kws):
    """Generates a slimmed-down scatterplot.

    This is useful where there are thousands of overlapping points and, for
    visualization and storage size, you only want to plot so many points
    within a given bin area.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the x-axis. Flattens if np.ndarray
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the y-axis. Flattens if np.ndarray
    bins : int, optional
        The number of bins to split the X, Y domain into. If None, this is
        estimated from the data using the Freedman-Diaconis rule.
    threshold : int or float
        If int, the maximum number of samples kept in each bin; points above
        this are dropped. If float, the proportion of points [0..1] to keep
        in each bin.
    turbo_kws : dict
        Keyword arguments to pass to `turb.plot.scatter`.
        All other arguments go to `ax.scatter`.

    Returns
    -------
    ax : matplotlib.axes.Axes
        Allows further modifications to the axes post-scatter
    """
    # defines some turbo keywords; everything else is scatter_kws
    turbo_keys = {
        'c', 's', 'marker', 'dense', 'fit_line', 'ax', 'alpha', 'cmap',
        'legend', 'colorbar', 'with_jitter', 'x_label', 'y_label',
        'x_scale', 'y_scale', 'legend_outside', 'title', 'with_grid',
        'fit_line_degree'
    }
    our_keys = set(turbo_kws.keys())
    # intersection between the two.
    used_keys = turbo_keys & our_keys
    t_kws = {x: turbo_kws[x] for x in used_keys}
    mpl_kws = {x: turbo_kws[x] for x in our_keys - used_keys}

    # get subset where missing values from either are dropped
    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    # paired values
    _X, _Y = remove_na(_X, _Y, paired=True)

    # get the bins
    if bins is None:
        # we just use x and y here.
        bins_x = freedman_diaconis_bins(_X)
        bins_y = freedman_diaconis_bins(_Y)
        # take the average, using integer division
        bins = (bins_x + bins_y) // 2
    else:
        # ensure it is non-negative
        nonnegative(bins, int)

    # compute the binned density
    s, xs, ys = np.histogram2d(_X, _Y, bins=bins)
    xs_lw = xs[:-1]
    xs_up = xs[1:]
    ys_lw = ys[:-1]
    ys_up = ys[1:]

    indices = []
    # loop through all the bins and compute a valid sample subset
    for i in range(bins):
        for j in range(bins):
            x_b = np.logical_and(_X >= xs_lw[i], _X < xs_up[i])
            y_b = np.logical_and(_Y >= ys_lw[j], _Y < ys_up[j])
            # indices of points falling in this bin
            i_b = np.argwhere(np.logical_and(x_b, y_b)).flatten()
            i_bn = i_b.shape[0]
            # if this bin is non-empty, select a subset and keep it
            if i_bn > 0:
                samp_size = i_bn
                if isinstance(threshold, int):
                    samp_size = min(i_bn, threshold)
                elif isinstance(threshold, float):
                    samp_size = min(i_bn, int(i_bn * threshold))
                # sample
                samp = np.random.choice(i_b, samp_size, replace=False)
                indices.append(samp)

    ni = np.hstack(indices)
    # x and y are now selected using ni
    return scatter(_X[ni], _Y[ni], **t_kws, **mpl_kws)
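A sketch with synthetic overlapping points (the data is made up; the keyword names used are taken from the `turbo_keys` set above):

import numpy as np

x = np.random.normal(size=50_000)
y = 0.5 * x + np.random.normal(size=50_000)
ax = scatter_slim(x, y, threshold=0.1, x_label="x", y_label="y")  # keep ~10% per bin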
def test_nonnegative2(self, x):
    with pytest.raises(AttributeError):
        assert utils.nonnegative(x, int)
def test_nonnegative1(self, x):
    assert utils.nonnegative(x, int)