Example #1
    def test_union(self):
        x = ["fi", "fo", "fum"]
        y = ["fi", "yo", "sum"]
        z = ["fi", "fe", "sun"]

        assert np.all(
            utils.union(x, y) == pd.Index(["fi", "fo", "fum", "sum", "yo"]))
        assert np.all(
            utils.union(x, y, z) == pd.Index(
                ["fe", "fi", "fo", "fum", "sum", "sun", "yo"]))
Example #2
def _write_json(self, filename: str):
    # update meta information
    self.update_meta()
    # columns found by meta_map are dropped
    redundant_meta = union(list(default_columns().keys()), list(self.mapper_.keys()))
    reduced_meta = self.meta_.drop(redundant_meta, axis=1, errors="ignore")
    # encode data
    stringed_data = self.df_.to_json(double_precision=12)
    stringed_meta = (
        reduced_meta.to_json(double_precision=12) if reduced_meta.shape[1] > 0 else "{}"
    )
    # generate checksum - using just the column names.
    checksum = hashlib.sha256(
        json.dumps(self.df_.columns.tolist()).encode()
    ).hexdigest()
    # compile the final JSON string
    compile_string = (
        '{"data":%s,"meta":%s,"name":%s,"cache":%s,"mapper":%s,"checksum":%s}'
        % (
            stringed_data,
            stringed_meta,
            json.dumps(self.name_),
            json.dumps(self.selectors_),
            json.dumps(self.mapper_),
            json.dumps(checksum),
        )
    )
    # determine file name.
    fn = filename if filename is not None else self.name_ + ".json"
    with open(fn, "wb") as f:
        f.write(compile_string.encode())
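
Note that the checksum covers only the column names, so a file can be validated without re-hashing the data itself. A hypothetical reader-side check, assuming string column names and the default orient="columns" of DataFrame.to_json (this `_verify_checksum` helper is not part of the library):

import hashlib
import json

def _verify_checksum(path: str) -> bool:
    # mirror how _write_json builds its checksum from the column names
    with open(path, "rb") as f:
        blob = json.loads(f.read().decode())
    # with orient="columns", the "data" block is keyed by column name
    columns = list(blob["data"].keys())
    expected = hashlib.sha256(json.dumps(columns).encode()).hexdigest()
    return expected == blob["checksum"]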
Example #3
def _partial_bicorr_inner(data: pd.DataFrame,
                          x,
                          y,
                          covar,
                          tail: str = "two-sided",
                          method: str = "spearman",
                          output: str = "score",
                          verbose: int = 0):
    """Internal method for partial bivariate correlation."""
    # select all of the columns involved
    if verbose > 0:
        print("partial {}:{}\\{}".format(x, y, covar))
    col = union(x, y, covar)
    # drop rows with NaN across the selected columns
    _data = data[col].dropna()
    # fit linear models to obtain residuals for x and y, partialling out covar
    px, r_x = lm(_data[covar], _data[x])
    py, r_y = lm(_data[covar], _data[y])
    # wrap residuals as series
    # if one is a boolean operation, we must preserve structure
    res_x = pd.Series(r_x, name=x)
    res_y = pd.Series(r_y, name=y)
    """ Perform bivariate correlate as normal. """
    # calculate bicorrelation on residuals

    if output == "score":
        return _bicorr_inner_score(res_x, res_y, method)
    else:
        return _bicorr_inner_full(res_x, res_y, method=method, tail=tail)
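
The function relies on an `lm` helper returning (predictions, residuals); partialling out `covar` then amounts to correlating the two residual series. A rough stand-in under that assumption (the actual turbopanda `lm` may differ):

import numpy as np

def lm(X, y):
    # ordinary least squares of y on X with an intercept column
    X = np.column_stack([np.ones(len(X)), np.asarray(X, dtype=float)])
    y = np.asarray(y, dtype=float)
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)
    pred = X @ beta
    return pred, y - pred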
Example #4
def _create_new_metamap(df, meta, selectors, mapper, name, meta_set):
    # for each selector, get the group view.
    if isinstance(meta_set, (list, tuple)):
        cnames = [inspect(df, meta, selectors, sel, mode="view") for sel in meta_set]
    else:
        raise TypeError("'meta_set' must be of type {list, tuple}")

    # collect the union of all pairwise intersections between the cnames
    igrid = union(*pairwise(intersect, cnames))

    if len(igrid) == 0:
        new_grid = pd.concat(
            [pd.Series(n, index=val) for n, val in zip(meta_set, cnames)],
            sort=False,
            axis=0,
        )
        new_grid.name = name
    else:
        raise ValueError("shared terms: {} discovered for meta_map.".format(igrid))
    # merge into meta
    cat = object_to_categorical(new_grid, meta_set)
    cat.name = name
    # assign directly; pd.concat does not produce the expected result here
    meta[name] = cat
    # store meta_map for future reference.
    mapper[name] = meta_set
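
`pairwise(intersect, cnames)` presumably applies `intersect` to every unordered pair of column groups, so `igrid` collects any column shared between two selectors. A sketch of that assumed helper:

from itertools import combinations

def pairwise(f, values):
    # assumed behaviour: apply f to every unordered pair drawn from values
    return [f(a, b) for a, b in combinations(values, 2)]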
Example #5
def preprocess_continuous_X_y(df, xcols, ycols, for_sklearn=True):
    """Preprocess and split dataframe into X and y machine-learning ready datasets.

    Preprocesses the data specifically for the `fit` methods of sklearn estimators.

    Parameters
    ----------
    df : pd.DataFrame
        The full dataset
    xcols : list of str
        Subset of the columns to choose.
    ycols : str, list of str
        Subset of the columns for target.
    for_sklearn : bool, default=True
        Returns np.ndarray objects if True, else pd.Series/pd.DataFrame.

    Returns
    -------
    _x : np.ndarray/pd.DataFrame
        Design matrix. X is reshaped ready for scikit-learn
    _y : np.ndarray/pd.Series
        Target variable
    """
    __data = preprocess_continuous_X(df, union(xcols, ycols))
    if for_sklearn:
        # returns np.ndarray objects properly configured
        _x = np.asarray(__data[xcols])
        _y = np.asarray(__data[ycols])
        if isinstance(xcols, str) or (isinstance(xcols, (list, tuple))
                                      and len(xcols) == 1):
            _x = _x.reshape(-1, 1)
        return _x, _y
    else:
        return __data[xcols], __data[ycols]
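
For a single x column the design matrix comes back reshaped to (n, 1), which is the 2-D layout sklearn's fit expects. A hypothetical call, assuming preprocess_continuous_X leaves these toy columns untouched:

import pandas as pd

df = pd.DataFrame({'x1': [1., 2., 3.], 'y': [2., 4., 6.]})
X, y = preprocess_continuous_X_y(df, ['x1'], 'y')
# X.shape == (3, 1); y.shape == (3,)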
Example #6
def _integrate_terms(a, b):
    """where a, b are packaged (term, op)"""
    t1, op = a
    t2, op2 = b
    if op == '&':
        # return a 2-tuple
        return (intersect(t1, t2), op2)
    elif op == '|':
        # return a 2-tuple
        return (union(t1, t2), op2)
    else:
        return t1
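
`_integrate_terms` is shaped like a fold step: each call consumes the left pair's operator and passes the merged term on with the right pair's operator. Under that reading it can be chained with functools.reduce, assuming `intersect`, `union` and pandas are in scope and the final term carries a null operator:

from functools import reduce
import pandas as pd

packaged = [(pd.Index(['a', 'b']), '&'),
            (pd.Index(['b', 'c']), '|'),
            (pd.Index(['d']), None)]
final_term, _ = reduce(_integrate_terms, packaged)
# intersect(['a','b'], ['b','c']) -> ['b']; union(['b'], ['d']) -> ['b','d']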
Example #7
def _extract_coefficients_from_model(cv, x, pkg_name):
    """accepted packages: linear_model, tree, ensemble, svm."""
    if pkg_name == "sklearn.linear_model" or pkg_name == "sklearn.svm":
        cof = np.vstack([m.coef_ for m in cv["estimator"]])
        if cof.shape[-1] == 1:
            cof = cof.flatten()
        res = pd.DataFrame(cof, columns=listify(x))
        res["intercept"] = np.vstack([m.intercept_ for m in cv["estimator"]]).flatten()
        res.columns = union(listify(x), ["intercept"])
        return res
    elif pkg_name == "sklearn.tree" or pkg_name == "sklearn.ensemble":
        cof = np.vstack([m.feature_importances_ for m in cv["estimator"]])
        if cof.shape[-1] == 1:
            cof = cof.flatten()
        res = pd.DataFrame(cof, columns=listify(x))
        res.columns = pd.Index(listify(x))
        return res
    else:
        return []
Example #8
def select(self, sc: str) -> pd.Index:
    """View a subset of columns using a flexible `eval`-like string.

    Returns the columns of interest matched by the selector string.
    Columns can be selected by:
        type [object, int, float, numpy.dtype*, pandas.CategoricalDtype]
        callable (function) that returns [bool list] of length p
        pd.Index
        str [regex, df.column name, cached name,
            meta.column name (that references a boolean column)]
        list/tuple of the above

    .. note:: We do not currently incorporate the use of brackets.

    Parameters
    ----------
    sc : str
        The selection string to find an optimal subset of columns.

    Warnings
    --------
    UserWarning
        If the selection returned is empty.

    Returns
    -------
    sel : pd.Index
        The selected column names, or an empty Index.

    See Also
    --------
    view : View a selection of columns in `df_`.
    search : View the intersection of search terms, for columns in `df_`.

    Examples
    --------
    You can use string names of types to select columns of a certain type:
    >>> import turbopanda as turb
    >>> import pandas as pd
    >>> mdf = turb.MetaPanda(pd.DataFrame({'a': [1., 2.], 'b': [3, 4]}))
    >>> mdf.select("float")
    Index(['a'], dtype='object', name='colnames')

    Or inverses can also be selected using tilde `~`:
    >>> mdf.select("~float")
    Index(['b'], dtype='object', name='colnames')

    Multiple terms can be joined together, including
        regex expressions that do not themselves contain `&` or `|`.
        For instance, to select all float columns whose
        names match x1, x2 or x3:
    >>> mdf.select("float & x[1-3]")
    """
    instance_check(sc, str)

    terms = [c.strip() for c in re.split("[&|]", sc)]
    operator = re.findall("[&|]", sc)
    if len(terms) < 1:
        return pd.Index([])
    else:
        grp = [
            self.view_not(t[1:]) if t.startswith("~") else self.view(t)
            for t in terms
        ]
        full = grp[0]
        for mg, op in zip(grp[1:], operator):
            if op == "&":
                full = intersect(full, mg)
            elif op == "|":
                full = union(full, mg)
        return full
Example #9
def _get_notes_all():
    return union(_get_notes_flat(), _get_notes_sharp())