Beispiel #1
0
def select_xcols(df: pd.DataFrame, xs, y):
    """Resolve the feature (x) column selection of `df` for ML use.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset the columns are drawn from.
    xs : None, str or other
        If None, every column of `df` except `y` is selected.
        If str, it is handed to `pattern` together with `df.columns`.
        Any other value is returned untouched.
    y : column label
        The target column to exclude when `xs` is None.

    Returns
    -------
    The resolved column selection.
    """
    if isinstance(xs, str):
        return pattern(xs, df.columns)
    if xs is not None:
        # an explicit selection was supplied; hand it back as-is
        return xs
    target = pd.Index([y])
    return df.columns.difference(target)
Beispiel #2
0
def get_best_model(cv_results, minimize: bool = True):
    """Returns the best model (with correct params) given the cv_results from a `fit_grid` call.

    The idea behind this function is to fetch from the pool of models the best model
    which could be fed directly into `fit_basic` to get the detailed plots.

    Parameters
    ----------
    cv_results : MetaPanda
        A dataframe result from `.ml.fit.grid`. Its `df_` attribute must hold
        the columns "mean_test_score" and "model".
    minimize : bool
        Determines whether the scoring function is minimized or maximized

    Returns
    -------
    M : sklearn model
        A parameterized sklearn model (unfitted).

    Notes
    -----
    The returned model is not fitted, you will need to do this yourself.

    See Also
    --------
    fit_basic : Performs a rudimentary fit model with no parameter searching
    """
    scores = cv_results.df_["mean_test_score"]
    select = scores.idxmin() if minimize else scores.idxmax()

    M = cv_results.df_.loc[select, "model"]
    # instantiate a model from text M
    inst_M = find_sklearn_model(M)[0]
    # get the parameter columns that hold a value for the selected row
    param_columns = pattern("param_model__",
                            cv_results.df_.loc[select].dropna().index, False)
    # preprocess dict params to eliminate the header for sklearn models
    _old_params = cv_results.df_.loc[select, param_columns]
    # `n` must be passed by keyword: pandas >= 2.0 rejects it positionally
    _old_params.index = _old_params.index.str.rsplit("__", n=1).str[-1]
    # cast down whole-number floats to ints (e.g. 5.0 -> 5)
    params = {
        k: int(v) if isinstance(v, float) and v.is_integer() else v
        for k, v in _old_params.to_dict().items()
    }

    # set parameters in to the model.
    inst_M.set_params(**params)
    return inst_M
Beispiel #3
0
def absolute(df: pd.DataFrame, pat: str = None) -> pd.DataFrame:
    """Apply `np.abs` to a sub-selection of the columns of `df`.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset to operate on.
    pat : str, optional
        Pattern handed to `pattern` (with `extended_regex=False`) to choose
        the columns. If None, all columns of `df` are chosen.

    Returns
    -------
    The result of `_multi_assign` (annotated as a pd.DataFrame).
    """

    def _selector(x):
        # `x` is whatever `_multi_assign` passes to the selector callable
        if pat is None:
            return df.columns.tolist()
        return list(pattern(pat, x, extended_regex=False))

    return _multi_assign(df, np.abs, _selector)
Beispiel #4
0
def best_model(cv_results,
               y_var: str = "test",
               minimize: bool = True,
               score: str = "RMSE",
               **box_kws):
    """Determines the best model (min or max) and plots the boxplot of all resulting best models.

    Parameters
    ----------
    cv_results : MetaPanda or pd.DataFrame
        The results from a call to `fit_grid`.
    y_var : str
        Choose from {'test', 'train'}
        If 'test': draws the test score
        If 'train': draws the training score
    minimize : bool
        If True, selects best smallest score, else select best largest score
    score : str
        The name of the scoring function
    box_kws : dict, optional
        Keyword arguments to pass to `plt.boxplot`.

    Returns
    -------
    fig : matplotlib.figure
        The figure object
    """
    instance_check(minimize, bool)
    instance_check(score, str)
    belongs(y_var, ("train", "test"))

    sely = pattern("mean_%s_score" % y_var, cv_results.columns, False)
    # create figures
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111)
    # downcast to a plain DataFrame (this is not a copy)
    res = cv_results.df_ if not isinstance(cv_results,
                                           pd.DataFrame) else cv_results
    # negated scores are flipped to positive values before comparison
    if res[sely].squeeze().mean() < 0.0:
        res = res.pipe(absolute, "(?:split[0-9]+|mean)_(?:train|test)_score")
    # for each 'model', find the row holding its best score
    if minimize:
        indices = res.groupby("model")[sely].idxmin()
    else:
        indices = res.groupby("model")[sely].idxmax()
    # arrange data: `res` is a DataFrame here, so select the per-split score
    # columns by pattern rather than through MetaPanda-only `.df_`/`.view`.
    split_cols = pattern("split[0-9]+_%s_score" % y_var, res.columns, False)
    result_p = res.loc[indices, split_cols]
    # reorder based on the best score
    re_order = result_p.median(axis=1).sort_values()
    result_p = result_p.reindex(re_order.index)
    # get best score name
    indices = switcheroo(indices).reindex(re_order.index)
    # plot
    # NOTE(review): `result_p` holds one row per model; confirm that
    # `ax.boxplot` draws one box per model (and not per split column) here.
    bp = ax.boxplot(result_p, patch_artist=True, showfliers=False, **box_kws)
    # fetch package names and map them to colors - returned as pd.Series
    packages = find_model_family(indices.values)
    # map colors to each of the packages.
    unique_packages = set_like(packages)
    mapping = dictzip(unique_packages,
                      color_qualitative(len(unique_packages)))
    mapped_cols = packages.map(mapping)
    # iterate over boxes and colour
    for box, col in zip(bp["boxes"], mapped_cols):
        box.set(facecolor=col, linewidth=1.2)
    plt.setp(bp["medians"], linewidth=1.5)
    # additional box requirements
    ax.set_xlabel("Model")
    ax.set_ylabel("%s %s" % (y_var, score))
    ax.set_xticklabels(indices.values)
    ax.tick_params("x", rotation=45)
    ax.grid()
    for tick in ax.get_xmajorticklabels():
        tick.set_horizontalalignment("right")
    # generate legend
    ax.legend(legend_line(mapping),
              list(mapping.keys()),
              bbox_to_anchor=(1.03, 1.03))
    plt.show()

    return fig
Beispiel #5
0
def correlate(
        data: Union[pd.DataFrame, MetaPanda],
        x: Optional[SelectorType] = None,
        y: Optional[SelectorType] = None,
        covar: Optional[SelectorType] = None,
        cartesian_covar: bool = False,
        output: str = "full",
        method: str = "spearman",
        verbose: int = 0,
) -> pd.DataFrame:
    """Correlates X and Y together to generate a list of correlations.

    If X/Y are MetaPandas, returns a MetaPanda object, else returns pandas.DataFrame

    Parameters
    ----------
    data : pd.DataFrame / MetaPanda
        The full dataset.
    x : (str, list, tuple, pd.Index), optional
        Subset of input(s) for column names.
            if None, uses the full dataset. Y must be None in this case also.
    y : (str, list, tuple, pd.Index), optional
        Subset of output(s) for column names.
            if None, uses the full dataset (from optional `x` subset)
    covar : (str, list, tuple, pd.Index), optional
        set of covariate(s). Covariates are needed to compute partial correlations.
            If None, uses standard correlation.
    cartesian_covar : bool, default=False
        If True, and if covar is not None, separates every
            element in covar to individually control for
        using the cartesian product
    output : str, default="full"
        Choose from {'full', 'score'}. Score just returns `r` number.
    method : str, default="spearman"
        Method to correlate with. Choose from:
            'pearson' : Pearson product-moment correlation
            'spearman' : Spearman rank-order correlation
            'kendall' : Kendall’s tau (ordinal data)
            'biserial' : Biserial correlation (continuous and boolean data only)
            'percbend' : percentage bend correlation (robust)
            'shepherd' : Shepherd's pi correlation (robust Spearman)
            'skipped' : skipped correlation (robust Spearman, requires sklearn)
    verbose : int, default=0
        If > 0, prints out useful debugging messages

    Returns
    -------
    R : pd.DataFrame
        correlation rows (based on pingouin structure)

    Raises
    ------
    TypeError
        If any `covar` column is not of type float/continuous.
    ValueError
        If the combination of `x` and `y` is not supported
        (e.g. `x` is None while `y` is given).

    Examples
    --------
    >>> import turbopanda as turb
    >>> data = turb.read('example.json')
    >>> R = turb.correlate(data) # uses full dataset
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    >>> R = turb.correlate(data, x=('X', 'M', 'Y')) # uses subset of dataset
                 X         M         Y
    X     1.000000  0.392251  0.059771
    M     0.392251  1.000000  0.545618
    Y     0.059771  0.545618  1.000000

    # correlates X columns against Ybin
    >>> R = turb.correlate(data, x=('X', 'M', 'Y'), y='Ybin')
                    X         M         Y
    Ybin     1.000000  0.392251  0.059771

    # correlates X against Ybin controlling for Y
    >>> R = turb.correlate(data, x='X', y='Ybin', covar='Y')
                     X
    Ybin     -0.149210

    # using a different technique
    >>> R = turb.correlate(data, method="shepherd")
                 X         M         Y      Mbin      Ybin
    X     1.000000  0.392251  0.059771 -0.014405 -0.149210
    M     0.392251  1.000000  0.545618 -0.015622 -0.094309
    Y     0.059771  0.545618  1.000000 -0.007009  0.161334
    Mbin -0.014405 -0.015622 -0.007009  1.000000 -0.076614
    Ybin -0.149210 -0.094309  0.161334 -0.076614  1.000000
    """

    # data cannot be NONE
    instance_check(data, (pd.DataFrame, MetaPanda))
    instance_check((x, y, covar), (type(None), str, list, tuple, pd.Index))
    instance_check(cartesian_covar, bool)
    belongs(
        method,
        (
            "pearson",
            "spearman",
            "kendall",
            "biserial",
            "percbend",
            "shepherd",
            "skipped",
        ),
    )
    belongs(output, ("full", "score"))
    bounds_check(verbose, 0, 4)

    # downcast to dataframe option
    df = data.df_ if not isinstance(data, pd.DataFrame) else data
    # downcast if list/tuple/pd.index is of length 1
    x = x[0] if (isinstance(x, (tuple, list, pd.Index)) and len(x) == 1) else x
    y = y[0] if (isinstance(y, (tuple, list, pd.Index)) and len(y) == 1) else y

    # convert using `pattern` if we have string instances.
    if isinstance(x, str):
        x = pattern(x, df.columns)
    if isinstance(y, str):
        y = pattern(y, df.columns)
    if isinstance(covar, str):
        covar = pattern(covar, df.columns)

    # perform a check to make sure every column in `covar` is continuous.
    # Index the downcast frame with a list so that a tuple of names is read
    # as several columns rather than as a single (tuple) key.
    if covar is not None:
        if not is_dataframe_float(df[list(covar)]):
            raise TypeError(
                "`covar` variables in `correlate` all must be of type `float`/continuous."
            )

    # execute various use cases based on the presence of x, y, and covar, respectively.
    if x is None and y is None:
        # here just perform matrix-based correlation
        n_cols = df.columns.shape[0]
        comb = it.combinations_with_replacement(df.columns, 2)
        # number of unordered pairs with replacement: n(n+1)/2
        niter = n_cols * (n_cols + 1) // 2
    elif isinstance(x, (list, tuple, pd.Index)) and y is None:
        # use a subset of x, in union with covar
        comb = it.combinations_with_replacement(x, 2)
        niter = len(x) * (len(x) + 1) // 2
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(y, str):
        # list of x, y str -> matrix-vector cartesian product
        comb = it.product(x, [y])
        niter = len(x)
    elif isinstance(y, (list, tuple, pd.Index)) and isinstance(x, str):
        # list of y, x str -> matrix-vector cartesian product
        comb = it.product(y, [x])
        niter = len(y)
    elif isinstance(x, (list, tuple, pd.Index)) and isinstance(
            y, (list, tuple, pd.Index)
    ):
        # list of x, y -> cartesian product of x: y terms
        comb = it.product(x, y)
        niter = len(x) * len(y)
    else:
        raise ValueError("X: {}; Y: {}; Z: {} combination unknown.".format(x, y, covar))
    # return the combination of these effects.
    return _corr_combination(
        df, comb, niter, covar, cartesian_covar, method, output, verbose
    )
Beispiel #6
0
def melt(
    df,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name=None,
    index_name="index",
    include_index=True,
    include_regex=True,
    include_question_guess=True,
):
    """Unpivot a DataFrame from wide format to long format, optionally
    leaving identifier variables set.

    .. note:: Does not accept MultiIndex pandas.DataFrames.

    Parameters
    ----------
    df : DataFrame
    id_vars : str, tuple, list or ndarray, optional
        Column(s) to use as identifier variables.
        If None: the columns not listed in `value_vars` are used, or no
            identifier columns at all if `value_vars` is also None
        If str: uses a regex pattern if `include_regex` is True, otherwise
            it is taken as a single column name
    value_vars : str, tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
            are not set as `id_vars`
        If str: uses a regex pattern if `include_regex` is True, otherwise
            it is taken as a single column name
    var_name : str, optional
        Name to use for the `variable` column. If None, a name is guessed
            from the common substring(s) of the `value_vars` names, falling
            back to the name of `df.columns`, then to "variable"
    value_name : str, optional
        Name to use for the `value` column. If None, "value" is used
    index_name : str, default="index"
        A name to give to the index if it doesn't have a name value
    include_index : bool, default=True
        If True, it includes the current index column(s) into the `id_vars`
    include_regex : bool, default=True
        If True, uses regular expressions for `id_vars` and `value_vars`
            if they are `str`
    include_question_guess : bool, default=True
        If True, guessed `var_name` values have a question mark `?` after them

    Returns
    -------
    dfn : pd.DataFrame
        New melted DataFrame

    See Also
    --------
    pandas.DataFrame.melt
    pandas.DataFrame.pivot_table
    """
    # check inputs
    instance_check(df, pd.DataFrame)
    instance_check(
        (id_vars, value_vars),
        (type(None), str, list, tuple, np.ndarray, pd.Series, pd.Index),
    )
    instance_check((var_name, value_name, index_name), (type(None), str))
    instance_check((include_regex, include_question_guess, include_index),
                   bool)

    _columns = df.columns.tolist()
    _index = df.index

    def _remaining(exclude):
        # unique columns of `df` not in `exclude`, in their original order
        taken = set(exclude)
        return [c for c in dict.fromkeys(_columns) if c not in taken]

    # a string is either a regex pattern or, without regex support, exactly
    # one column name; `list("abc")` would split it into characters.
    if isinstance(id_vars, str):
        id_vars = pattern(id_vars, df) if include_regex else [id_vars]
    if isinstance(value_vars, str):
        value_vars = pattern(value_vars, df) if include_regex else [value_vars]

    if id_vars is None:
        id_vars = [] if value_vars is None else _remaining(value_vars)
    else:
        id_vars = list(id_vars)

    if value_vars is None:
        value_vars = _remaining(id_vars)
    else:
        value_vars = list(value_vars)

    # if we include the index, we need to reset it
    if include_index:
        # add in the index cols into the data
        df = df.reset_index().rename(columns={"index": index_name})
        # the index column keeps its own name when it has one
        if _index.name is not None:
            id_vars.append(_index.name)
        else:
            id_vars.append(index_name)

    # update var_name
    if var_name is None:
        # guess a name from the common substrings of the value columns
        valns = common_substrings(value_vars)
        if isinstance(valns, pd.Series) and valns.shape[0] > 0:
            _var_name = valns.idxmax()
        elif isinstance(valns, str):
            _var_name = valns
        elif df.columns.name:
            # `columns.name` defaults to None, so test truthiness, and
            # coerce to str so that the "?" suffix below can be appended
            _var_name = str(df.columns.name)
        else:
            _var_name = "variable"
        # if we have question guess, add it on
        if include_question_guess:
            _var_name += "?"
    else:
        _var_name = var_name

    _value_name = "value" if value_name is None else value_name

    return pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name=_var_name,
        value_name=_value_name,
    )