Example #1
def common_substrings(
        a: Union[str, List[str]],
        b: Optional[Union[str, List[str]]] = None,
        min_length: int = 2,
) -> Union[str, Series]:
    """Given at least one pair of strings, find all the best common substring matches.

    By default, if only `a` is passed, the pairwise combinations of all values in the list are used;
        otherwise, with both `a` and `b`, the Cartesian product of the two lists is used.

    Parameters
    ----------
    a : str/list of str
        A word or list of words to search for common substrings in
    b : str/list of str, optional
        A second word or list of words to compare against `a`.
        If None, pairwise combinations within `a` are used
    min_length: int, default=2
        The minimum accepted length of string for a given pair

    Returns
    -------
    z_up : str/Series
        str returned if (a, b) are strs, else Series of value counts
    """

    instance_check(a, (str, list, tuple, Index))
    instance_check(b, (type(None), str, list, tuple, Index))
    nonnegative(min_length, int)
    # prevent a case where a can be a str, b is None
    disallow_instance_pair(a, str, b, type(None))

    filters = ("", "_", "__", "-")
    if isinstance(a, str) and b is None:
        return a
    elif isinstance(a, str) and isinstance(b, str):
        return _single_common_substring_match(a, b)
    else:
        if isinstance(a, str):
            a = [a]
        elif isinstance(b, str):
            b = [b]
        # determine pair set.
        if b is None:
            # combination iterator
            pair_groups = it.combinations(a, 2)
        else:
            # cartesian product iterator
            pair_groups = it.product(a, b)
        # generate pairs
        z = [_single_common_substring_match(i, j) for i, j in pair_groups]

        def filter_func(x):
            """Returns True for elements to drop: filler tokens, strings shorter
            than min_length, or matches that occur only once."""
            return (x in filters) or (len(x) < min_length) or (z.count(x) <= 1)

        # filter out naff elements
        z_up = list(it.filterfalse(filter_func, z))
        # save as series valuecounts.
        return Series(z_up, dtype=object).value_counts()
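A minimal usage sketch (the words below are made up for illustration, and `common_substrings` is assumed to be importable from its host module):

words = ["hello_world", "hello_kitty", "world_peace"]
# pairwise combinations within `words`; returns a Series of value counts
matches = common_substrings(words, min_length=3)
# a direct (str, str) pair returns the single best common substring
best = common_substrings("hello_world", "hello_kitty")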
Example #2
def density(
    X: np.ndarray,
    Y: Optional[np.ndarray] = None,
    Z: Optional[np.ndarray] = None,
    r: Optional[int] = None,
) -> np.ndarray:
    """Estimates the density of X using binning, accepts np.ndarray.

    Parameters
    ----------
    X : np.ndarray (n,)
        The first dimension
    Y : np.ndarray (n,), optional
        The second dimension
    Z : np.ndarray (n,), optional
        The third dimension
    r : int, optional
        The number of bins for each dimension.
        If None, uses the Freedman-Diaconis rule

    Returns
    -------
    d : np.ndarray (r,...)
        The density in binned-dimensions
    """
    instance_check(X, np.ndarray)
    instance_check((Y, Z), (type(None), np.ndarray))
    instance_check(r, (type(None), int))

    if r is None:
        r = min(freedman_diaconis_bins(X), 50)
    else:
        nonnegative(r, int)

    if Y is None and Z is None:
        _X = remove_na(X)
        return np.histogram(_X, bins=r, density=True)[0]
    elif Z is None:
        _X, _Y = remove_na(X, Y, paired=True)
        return np.histogram2d(_X, _Y, bins=(r, r), density=True)[0]
    else:
        # note: unlike the 1-D and 2-D branches, missing values are not removed here
        return np.histogramdd(np.vstack((X, Y, Z)).T,
                              bins=(r, r, r),
                              density=True)[0]
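A brief usage sketch on synthetic data, assuming `density` is importable:

rng = np.random.default_rng(0)
x, y = rng.normal(size=1000), rng.normal(size=1000)
d1 = density(x)           # 1-D density; bin count via the Freedman-Diaconis rule
d2 = density(x, y, r=20)  # 2-D density on a 20x20 grid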
Example #3
def head(self, k: int = 5) -> pd.DataFrame:
    """Look at the top k rows of the dataset.

    See `pd.DataFrame.head` documentation for details.

    Parameters
    ----------
    k : int, default=5
        Must be 0 < k < n.

    Returns
    -------
    ndf : pandas.DataFrame
        First k rows of df_

    See Also
    --------
    pandas.DataFrame.head : Return the first n rows.
    """
    nonnegative(k, int)
    return self.df_.head(k)
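Usage mirrors pandas; assuming `mdf` is a hypothetical instance of the hosting class:

top = mdf.head(10)  # first 10 rows of the underlying df_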
Example #4
def covariance_matrix(p, corr_ratio=0.5, diag_var=1., random_direction=False):
    """Generates a randomly-generated 'correlated' covariance matrix.

    This is useful in situations where you want to create correlated synthetic
    data to test an algorithm.

    Bear in mind that `corr_ratio` follows the relationship rho ∈ [-1/(p-1), 1], so negative
    correlations will be clipped at higher dimensions to ensure positive semidefinite structures.

    Parameters
    ----------
    p : int
        The number of dimensions. p must be >= 2
    corr_ratio : float [-1..1]
        The proportion of 'correlation' within the matrix; 0 is no correlation,
            1 is full positive correlation and -1 is full negative correlation.
    diag_var : float
        The values on the diagonal, with small error (5e-03)
    random_direction : bool, default=False
        Correlation is randomly positive or negative if True, else uses the sign(corr_ratio).

    Returns
    -------
    cov : np.ndarray((p, p))
        Covariance matrix
    """
    nonnegative(p, int)
    # p must be greater than 1 to be multivariate gaussian.
    if p < 2:
        raise ValueError("'p' must be > 1")
    # clip the ratio into [-1/(p-1), 1) to keep the matrix positive semidefinite
    _corr_ratio = np.clip(corr_ratio, -1. / (p - 1), 0.999)

    if not np.isclose(corr_ratio, _corr_ratio):
        warnings.warn(
            "`corr_ratio` parameter is clipped from {:0.3f} to {:0.3f} to keep "
            "the matrix positive semidefinite".format(corr_ratio, _corr_ratio))

    return _create_cov_matrix(p, _corr_ratio, diag_var, random_direction)
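A short sketch of how the result might be used to draw correlated synthetic data (hypothetical usage):

cov = covariance_matrix(4, corr_ratio=0.6)
# sample 500 correlated observations from a zero-mean multivariate Gaussian
data = np.random.default_rng(0).multivariate_normal(np.zeros(4), cov, size=500)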
Example #5
def optimize(df: "MetaPanda",
             x: SelectorType,
             y: str,
             models,
             cv: int = 5,
             verbose: int = 0):
    """Performs optimization grid analysis on the models selected.

    This uses `scipy.optimize` to minimize continuous parameters, for example `alpha` in a Lasso model.

    .. note:: optimization only works on *continuous* parameters with each model.

    TODO: complete `.ml.fit.optimize` function

    Parameters
    ----------
    df : MetaPanda
        The main dataset.
    x : list/tuple of str
        A list of selected column names for x or MetaPanda `selector`.
    y : str
        A selected y column.
    models : tuple/dict
        tuple: list of model names, uses default parameters
        dict: key (model name), value tuple (parameter names) / dict: key (parameter name), value (list of values)
    cv : int, default=5
        The number of cross-validations to perform
    verbose : int, optional
        If > 0, prints out statements depending on level.

    Returns
    -------
    cv_results : MetaPanda
        A dataframe result from GridSearchCV detailing iterations and all scores.

    Notes
    -----
    By default, `optimize` tunes using the root mean squared error (RMSE);
        there is currently no option to change this. It also assumes a regression
        problem; classification compatibility will arrive in a later version.

    See Also
    --------
    grid : Performs exhaustive grid search analysis on the models selected.
    sklearn.model_selection.GridSearchCV : Exhaustive search over specified parameter values for an estimator

    References
    ----------
    .. [1] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
    """
    # checks
    instance_check(df, MetaPanda)
    instance_check(x, (str, list, tuple, pd.Index))
    instance_check(y, str)
    nonnegative((cv, verbose), int)
    instance_check(models, (tuple, list, dict))
    bounds_check(verbose, 0, 4)

    _df = df.df_ if not isinstance(df, pd.DataFrame) else df
    _xcols = select_xcols(_df, x, y)
    _xnp, _y = preprocess_continuous_X_y(_df, _xcols, y)

    # define the parameter sets
    param_sets = make_optimize_grid(models)

    for m, params in zip(models, param_sets):
        model = find_sklearn_model(m)[0]
        inits, bounds = optimize_grid_for_model(params)
        # minimize for every i element
        mins = [
            so.minimize(
                _min_cross_val_scores,
                x0=i,
                args=(_xnp, _y, model, params, cv),
                bounds=bounds,
            ) for i in inits
        ]

    # per the TODO above, the per-model minimization results (`mins`) are not
    # yet aggregated into `cv_results`; the function currently returns None.
    pass
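Since the function is an unfinished stub (see the TODO), the call below is a hypothetical sketch of the intended API rather than working usage; the column and model names are made up:

# cv_results = optimize(mdf, x=("age", "height"), y="weight",
#                       models=("Lasso", "Ridge"), cv=5, verbose=1)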
Example #6
def overview_pca(
    model,
    distance_color: bool = True,
    labels: Optional[pd.Index] = None,
    cutoff_selection: float = 0.9,
    n_samples_annotate: int = 6,
    n_pcs: int = 5,
    ax_size: int = 4,
):
    """Provides an overview plot from a PCA result.

    Parameters
    ----------
    model : sklearn.decomposition.PCA
        A fitted PCA model.
    distance_color : bool, default=True
        If True, plots the magnitude of each PC as a color
    labels : np.ndarray (n,) of str / pd.Series / list / tuple, optional
        If not None, provides a label for every PC component (dimension), and annotates
        the most 'outlier'-like samples in plot 1
    cutoff_selection : float, default=0.9
        The cutoff for proportional variance to select for
    n_samples_annotate : int, default=6
        Defines the number of labels to show if `labels` is not None in plot 1
    n_pcs : int, default=5
        The number of principal components to consider in plot 3
    ax_size : int, default=4
        The default size for each axes.

    """
    instance_check(distance_color, bool)
    instance_check(labels, (type(None), np.ndarray, pd.Series, pd.Index, list, tuple))
    nonnegative(
        (
            n_samples_annotate,
            n_pcs,
            ax_size,
        ),
        int,
    )

    if labels is not None:
        fig, axes = gridplot(3, ax_size=ax_size)
    else:
        fig, axes = gridplot(2, ax_size=ax_size)

    if n_samples_annotate > model.n_components_:
        n_samples_annotate = model.n_components_ - 1
    if n_pcs > model.n_components_:
        n_pcs = model.n_components_ - 1

    # 1 plot the scatter of PC
    _plot_pca_scatter(model, axes[0], distance_color)
    # 2 plot the line AUC for explained variance
    _explained_variance_plot(model, axes[1], cutoff=cutoff_selection)
    # if annotate, we annotate the scatter plot with samples.
    if labels is not None:
        # check to make sure labels is same length as components
        _annotate_on_magnitude(model, labels, n_samples_annotate, axes[0])
        # 3 plot the top N components by the `most important eigenvector values`
        _x3, _y3, _sel_labels = _best_principle_eigenvectors(
            model, labels=labels, k=n_samples_annotate, p=n_pcs
        )
        _best_eigenvector_plot(
            _x3, _y3, _sel_labels, axes[-1], nk=(n_samples_annotate, n_pcs)
        )
        axes[-1].set_title("Top {} eigenvectors".format(n_samples_annotate))

    fig.tight_layout()
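A minimal usage sketch with a fitted scikit-learn PCA model on synthetic data:

from sklearn.decomposition import PCA

X = np.random.default_rng(0).normal(size=(200, 10))
model = PCA(n_components=5).fit(X)
overview_pca(model, n_pcs=3, ax_size=3)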
Example #7
def hist_grid(mdf: Union[DataFrame, "MetaPanda"],
              subset: SelectorType,
              arrange: str = "square",
              plot_size: int = 3,
              shared_dist: str = "auto",
              savepath: Optional[Union[str, bool]] = None,
              **hist_kws):
    """
    Plots a grid of histograms comparing the distributions in a MetaPanda
    selector.

    Parameters
    ----------
    mdf : DataFrame or turb.MetaPanda
        The dataset
    subset : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str, default="square"
        Choose from ['square', 'row', 'column']. 'square' arranges the plots as
        square-like as possible, 'row' prioritises rows, and 'column' prioritises columns.
    plot_size : int, default=3
        The size of each axes
    shared_dist : str/tuple of str/dict, default="auto"
        Determines which KDE to fit to the data; set to None to disable.
        If tuple/list: attempts using these specified distributions
        If dict: maps column name (k) to distribution choice (v)
    savepath : None, bool or str, optional
        Saves the figure to file. If True, uses the name in `mdf`; if str, uses
        the given path; if None, the figure is not saved.

    Other Parameters
    ----------------
    hist_kws : dict
        Keywords to pass to `turb.plot.histogram`

    Returns
    -------
    None
    """
    # checks
    instance_check(shared_dist, (type(None), str, list, tuple, dict))
    instance_check(savepath, (type(None), str, bool))
    nonnegative(plot_size, int)
    belongs(arrange, ["square", "row", "column"])
    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selector
    selection = _mdf.view(subset)
    # assuming we've selected something...
    if selection.size > 0:
        fig, axes = gridplot(len(selection), arrange, ax_size=plot_size)

        if not isinstance(shared_dist, dict):
            for i, x in enumerate(selection):
                _ = histogram(_mdf[x].dropna(),
                              ax=axes[i],
                              title=x,
                              kde=shared_dist,
                              **hist_kws)
            fig.tight_layout()
        else:
            for i, (x, d) in enumerate(shared_dist.items()):
                _ = histogram(_mdf[x].dropna(),
                              ax=axes[i],
                              title=x,
                              kde=d,
                              **hist_kws)
            # iterate over any 'remaining' columns in selection and handle appropriately
            remaining = difference(selection, tuple(shared_dist.keys()))
            if remaining.shape[0] > 0:
                for i, x in enumerate(remaining):
                    _ = histogram(_mdf[x].dropna(),
                                  ax=axes[i + len(shared_dist)],
                                  title=x,
                                  kde="auto",
                                  **hist_kws)
            fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "hist", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "hist", _mdf.name_, fp=savepath)
Example #8
def scatter_grid(
    mdf: Union[DataFrame, "MetaPanda"],
    x: SelectorType,
    y: SelectorType,
    arrange: str = "square",
    plot_size: int = 3,
    best_fit: bool = True,
    best_fit_deg: int = 1,
    savepath: Optional[Union[bool, str]] = None,
):
    """
    Plots a grid of scatter plots comparing each column for MetaPanda
    in selector to y target value.

    Parameters
    ----------
    mdf : DataFrame or turb.MetaPanda
        The dataset
    x : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    y : str or list/tuple of str
        Contains either types, meta column names, column names or regex-compliant strings
    arrange : str, default="square"
        Choose from ['square', 'row', 'column']. 'square' arranges the plots as
        square-like as possible, 'row' prioritises rows, and 'column' prioritises columns.
    plot_size : int, default=3
        The size of each axes
    best_fit : bool, default=True
        If True, draws a line of best fit
    best_fit_deg : int, default=1
        The degree of the line of best fit; can draw a polynomial
    savepath : None, bool or str, optional
        Saves the figure to file. If True, uses the name in `mdf`; if str, uses
        the given path; if None, the figure is not saved.

    Returns
    -------
    None
    """
    from turbopanda.corr import bicorr

    # checks
    instance_check((plot_size, best_fit_deg), int)
    instance_check(savepath, (type(None), str, bool))
    instance_check(best_fit, bool)
    nonnegative((
        best_fit_deg,
        plot_size,
    ), int)
    belongs(arrange, ["square", "row", "column"])

    # make a metapanda if we have a dataframe.
    _mdf = MetaPanda(mdf) if isinstance(mdf, DataFrame) else mdf

    # get selector
    x_sel = _mdf.view(x)
    y_sel = _mdf.view(y)
    # create a product between x and y and plot
    prod = list(it.product(x_sel, y_sel))

    if len(prod) > 0:
        fig, axes = gridplot(len(prod), arrange, ax_size=plot_size)
        for i, (_x, _y) in enumerate(prod):
            # pair x, y
            __x, __y = remove_na(_mdf[_x].values, _mdf[_y].values, paired=True)
            axes[i].scatter(__x.flatten(), __y, alpha=0.5)
            # line of best fit
            if best_fit:
                xn = np.linspace(__x.min(), __x.max(), 100)
                z = np.polyfit(__x.flatten(), __y, deg=best_fit_deg)
                axes[i].plot(xn, np.polyval(z, xn), "k--")

            # spearman correlation
            pair_corr = bicorr(_mdf[_x], _mdf[_y]).loc["spearman", "r"]
            axes[i].set_title("r={:0.3f}".format(pair_corr))
            axes[i].set_xlabel(_x)
            axes[i].set_ylabel(_y)

        fig.tight_layout()

        if isinstance(savepath, bool):
            save(fig, "scatter", _mdf.name_)
        elif isinstance(savepath, str):
            save(fig, "scatter", _mdf.name_, fp=savepath)
Example #9
def gridplot(
    n_plots: int,
    arrange: str = "square",
    ax_size: Union[int, Tuple[int, int]] = 2,
    annotate_labels: bool = False,
    annotate_offset: float = 0.01,
    **annotate_args
):
    """Determines the most optimal shape for a set of plots.

    Parameters
    ----------
    n_plots : int
        The total number of plots.
    arrange : str, default="square"
        Choose from {'square', 'row' 'column'}. Indicates preference for direction of plots.
    ax_size : int, default=2
        The square size of each plot.
    annotate_labels : bool, default=False
        If True, adds A, B,.. K label to top-left corner of each axes.
    annotate_offset : float, default=0.01
        Determines the amount of offset for each label

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure
    axes : list of matplotlib.ax.Axes
        A list of axes to use.
    """
    instance_check(annotate_labels, bool)
    nonnegative((n_plots,), int)
    belongs(arrange, ["square", "row", "column"])

    annot_props = {
        "weight": "bold",
        "horizontalalignment": "left",
        "verticalalignment": "center",
    }
    # update with args
    annot_props.update(annotate_args)
    if isinstance(ax_size, int):
        fs = np.array([ax_size, ax_size])
    else:
        fs = np.array(ax_size)

    if n_plots == 1:
        fig, ax = plt.subplots(figsize=fs)
        if annotate_labels:
            fig.text(0.01, 0.98, "A", **annot_props)
        # wrap ax in a list so the return value is always iterable
        return fig, [ax]
    else:
        fig, ax = (
            _generate_square_like_grid(n_plots, ax_size=fs)
            if arrange == "square"
            else _generate_diag_like_grid(n_plots, arrange, ax_size=fs)
        )
        # add annotation labels
        if annotate_labels:
            # use tight_layout to make sure the text isn't overlapping
            fig.tight_layout()
            for a, n in zip(ax, string.ascii_uppercase):
                pos_ = a.get_position().bounds
                # add label
                fig.text(
                    pos_[0] - annotate_offset,
                    pos_[1] + pos_[3] + annotate_offset,
                    n,
                    **annot_props
                )
        return fig, ax
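A short usage sketch; `gridplot` returns the figure and a flat list of axes:

fig, axes = gridplot(5, arrange="row", ax_size=3, annotate_labels=True)
for ax in axes:
    ax.plot([0, 1], [0, 1])
fig.tight_layout()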
Example #10
def scatter_slim(X: _ArrayLike,
                 Y: _ArrayLike,
                 bins: Optional[int] = None,
                 threshold: Union[int, float] = 50,
                 **turbo_kws):
    """
    Generates a slimmed-down scatterplot.

    This is useful where thousands of points overlap; for visualization and storage
    size, only a limited number of points are plotted within a given bin area.

    Parameters
    ----------
    X : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the x-axis. Flattens if np.ndarray
    Y : list/tuple/np.ndarray/pd.Series (1d)
        The data column to draw on the y-axis. Flattens if np.ndarray
    bins : int, optional
        Specifies the number of bins used to split the X,Y domain; if None, this is
        optimized using the Freedman-Diaconis rule
    threshold : int or float
        Specifies the threshold above which nsamples are dropped in each bin.
        If float, specifies the proportion of points [0..1] to keep in each bin.
    turbo_kws : dict
        Keyword arguments to pass to `turb.plot.scatter`. All other arguments go to `ax.scatter`.

    Returns
    -------
    ax : matplotlib.ax object
        Allows further modifications to the axes post-scatter
    """

    # defines some turbo keywords, everything else is scatter_kws
    turbo_keys = {
        'c', 's', 'marker', 'dense', 'fit_line', 'ax', 'alpha', 'cmap',
        'legend', 'colorbar', 'with_jitter', 'x_label', 'y_label', 'x_scale',
        'y_scale', 'legend_outside', 'title', 'with_grid', 'fit_line_degree'
    }

    our_keys = set(turbo_kws.keys())
    # intersection between the two.
    used_keys = turbo_keys & our_keys
    t_kws = {x: turbo_kws[x] for x in used_keys}
    mpl_kws = {x: turbo_kws[x] for x in our_keys - used_keys}

    # get subset where missing values from either are dropped
    _X = as_flattened_numpy(X)
    _Y = as_flattened_numpy(Y)
    # paired values
    _X, _Y = remove_na(_X, _Y, paired=True)

    # get the bins
    if bins is None:
        # we just use x here.
        bins_x = freedman_diaconis_bins(_X)
        bins_y = freedman_diaconis_bins(_Y)
        # take the average, using integer division
        bins = (bins_x + bins_y) // 2
    else:
        # ensure its non-negative
        nonnegative(bins, int)

    # compute the binned density
    s, xs, ys = np.histogram2d(_X, _Y, bins=bins)
    xs_lw = xs[:-1]
    xs_up = xs[1:]
    ys_lw = ys[:-1]
    ys_up = ys[1:]

    indices = []
    # loop through all the bins and compute a valid sample subset
    for i in range(bins):
        for j in range(bins):
            x_b = np.logical_and(_X >= xs_lw[i], _X < xs_up[i])
            y_b = np.logical_and(_Y >= ys_lw[j], _Y < ys_up[j])
            # indices
            i_b = np.argwhere(np.logical_and(x_b, y_b)).flatten()
            i_bn = i_b.shape[0]
            # if this is empty, do nothing else, select subset and return
            if i_bn > 0:
                samp_size = i_bn
                if isinstance(threshold, int):
                    samp_size = min(i_bn, threshold)
                elif isinstance(threshold, float):
                    samp_size = min(i_bn, int(i_bn * threshold))
                # sample
                samp = np.random.choice(i_b, samp_size, replace=False)
                indices.append(samp)

    ni = np.hstack(indices)
    # x and y is now selected using ni
    return scatter(_X[ni], _Y[ni], **t_kws, **mpl_kws)
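A brief sketch on dense synthetic data, keeping at most 25 points per 2-D bin:

rng = np.random.default_rng(0)
x = rng.normal(size=50000)
y = 0.5 * x + rng.normal(size=50000)
ax = scatter_slim(x, y, threshold=25)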
Example #11
def test_nonnegative2(self, x):
    with pytest.raises(AttributeError):
        assert utils.nonnegative(x, int)
Example #12
def test_nonnegative1(self, x):
    assert utils.nonnegative(x, int)
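For context, a plausible sketch of what `utils.nonnegative` might look like, consistent with both tests above; the real implementation may differ, and the tuple handling and AttributeError are inferred from the usages in the earlier examples:

def nonnegative(x, instance=int):
    """Hypothetical checker: every value must be a non-negative `instance`."""
    values = x if isinstance(x, tuple) else (x,)
    for v in values:
        if not isinstance(v, instance) or v < 0:
            # the tests above expect an AttributeError on failure
            raise AttributeError(
                "'{}' must be a non-negative {}".format(v, instance.__name__))
    return True  # truthy, so `assert utils.nonnegative(x, int)` passes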