Example #1
def test_readwrite_h5ad(typ, dataset_kwargs, backing_h5ad):
    tmpdir = tempfile.TemporaryDirectory()
    tmpdirpth = Path(tmpdir.name)
    mid_pth = tmpdirpth / "mid.h5ad"

    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    assert not is_categorical(adata_src.obs["oanno1"])
    adata_src.raw = adata_src
    adata_src.write(backing_h5ad, **dataset_kwargs)

    adata_mid = ad.read(backing_h5ad)
    adata_mid.write(mid_pth, **dataset_kwargs)

    adata = ad.read_h5ad(mid_pth)
    assert is_categorical(adata.obs["oanno1"])
    assert not is_categorical(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert is_categorical(adata.raw.var["vanno2"])
    assert np.all(adata.obs == adata_src.obs)
    assert np.all(adata.var == adata_src.var)
    assert np.all(adata.var.index == adata_src.var.index)
    assert adata.var.index.dtype == adata_src.var.index.dtype
    assert type(adata.raw.X) is type(adata_src.raw.X)
    assert type(adata.raw.varm) is type(adata_src.raw.varm)
    assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
    assert np.all(adata.raw.var == adata_src.raw.var)
    assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
    assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"])
    assert_equal(adata, adata_src)
Example #2
def test_readwrite_dynamic(typ, backing_h5ad):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.filename = backing_h5ad  # change to backed mode
    adata_src.write()

    adata = ad.read(backing_h5ad)
    assert is_categorical(adata.obs['oanno1'])
    assert not is_categorical(adata.obs['oanno2'])
    assert adata.obs.index.tolist() == ['name1', 'name2', 'name3']
    assert adata.obs['oanno1'].cat.categories.tolist() == ['cat1', 'cat2']
Example #3
def test_readwrite_zarr(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    assert not is_categorical(adata_src.obs['oanno1'])
    adata_src.write_zarr(tmp_path / 'test_zarr_dir', chunks=True)

    adata = ad.read_zarr(tmp_path / 'test_zarr_dir')
    assert is_categorical(adata.obs['oanno1'])
    assert not is_categorical(adata.obs['oanno2'])
    assert adata.obs.index.tolist() == ['name1', 'name2', 'name3']
    assert adata.obs['oanno1'].cat.categories.tolist() == ['cat1', 'cat2']
Example #4
def test_readwrite_backed(typ, backing_h5ad):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.filename = backing_h5ad  # change to backed mode
    adata_src.write()

    adata = ad.read(backing_h5ad)
    assert is_categorical(adata.obs["oanno1"])
    assert not is_categorical(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert_equal(adata, adata_src)
Example #5
def test_readwrite_h5ad(typ, backing_h5ad):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    assert not is_categorical(adata_src.obs['oanno1'])
    adata_src.raw = adata_src
    adata_src.write(backing_h5ad)

    adata = ad.read(backing_h5ad)
    assert is_categorical(adata.obs['oanno1'])
    assert not is_categorical(adata.obs['oanno2'])
    assert adata.obs.index.tolist() == ['name1', 'name2', 'name3']
    assert adata.obs['oanno1'].cat.categories.tolist() == ['cat1', 'cat2']
    assert is_categorical(adata.raw.var['vanno2'])
Example #6
def _transform_pandas_df(data,
                         enable_categorical,
                         feature_names=None,
                         feature_types=None,
                         meta=None,
                         meta_type=None):
    from pandas import MultiIndex, Int64Index
    from pandas.api.types import is_sparse, is_categorical

    data_dtypes = data.dtypes
    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
               (is_categorical(dtype) and enable_categorical)
               for dtype in data_dtypes):
        bad_fields = [
            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
            if dtype.name not in _pandas_dtype_mapper
        ]

        msg = """DataFrame.dtypes for data must be int, float, bool or categorical.  When
                categorical type is supplied, DMatrix parameter
                `enable_categorical` must be set to `True`."""
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None and meta is None:
        if isinstance(data.columns, MultiIndex):
            feature_names = [
                ' '.join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, Int64Index):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data_dtypes:
            if is_sparse(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical(dtype) and enable_categorical:
                feature_types.append('categorical')
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    if meta and len(data.columns) > 1:
        raise ValueError(
            'DataFrame for {meta} cannot have multiple columns'.format(
                meta=meta))

    dtype = meta_type if meta_type else np.float32
    data = np.ascontiguousarray(data.values, dtype=dtype)
    return data, feature_names, feature_types
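The dtype gate above only admits numeric, boolean, sparse, or (when `enable_categorical=True`) categorical columns. Below is a minimal standalone sketch of the same check using the public pandas API (`is_categorical_dtype` is the public counterpart of the `is_categorical` helper used elsewhere on this page); `allowed` stands in for the keys of the module-private `_pandas_dtype_mapper`:

import pandas as pd
from pandas.api.types import is_categorical_dtype, is_sparse

allowed = {"int64", "float64", "bool"}  # assumption: a subset of the real mapper

df = pd.DataFrame({
    "a": [1, 2, 3],
    "b": pd.Categorical(["x", "y", "x"]),
    "c": ["u", "v", "w"],  # plain object column: should be rejected
})

enable_categorical = True
bad_fields = [
    str(col) for col, dtype in df.dtypes.items()
    if dtype.name not in allowed
    and not is_sparse(dtype)
    and not (is_categorical_dtype(dtype) and enable_categorical)
]
print(bad_fields)  # ['c']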
Example #7
 def create_array(s, t):
     mask = s.isnull()
     # Ensure timestamp series are in expected form for Spark internal representation
     if t is not None and pa.types.is_timestamp(t):
         s = _check_series_convert_timestamps_internal(
             s, self._timezone)
     elif is_categorical(s.dtype):
         # Note: This can be removed once minimum pyarrow version is >= 0.16.1
         s = s.astype(s.dtypes.categories.dtype)
     try:
         array = pa.Array.from_pandas(s,
                                      mask=mask,
                                      type=t,
                                      safe=self._safecheck)
     except ValueError as e:
         if self._safecheck:
             error_msg = "Exception thrown when converting pandas.Series (%s) to " + \
                         "Arrow Array (%s). It can be caused by overflows or other " + \
                         "unsafe conversions warned by Arrow. Arrow safe type check " + \
                         "can be disabled by using SQL config " + \
                         "`spark.sql.execution.pandas.convertToArrowArraySafely`."
             raise ValueError(error_msg % (s.dtype, t)) from e
         else:
             raise e
     return array
Example #8
 def create_density_plots(df, density, kdims, cmap):
     cm = {}
     if density == 'all':
         dfs = {_sentinel: df}
     elif density == 'group':
         if 'z' not in df.columns:
             warnings.warn(
                 "`density='group'` was specified, but no group found. Did you specify `color=...`?"
             )
             dfs = {_sentinel: df}
         elif not is_categorical(df['z']):
             warnings.warn(
                 "`density='group'` was specified, but column `z` is not categorical."
             )
             dfs = {_sentinel: df}
         else:
             dfs = {k: v for k, v in df.groupby('z')}
             cm = cmap
     else:
         raise ValueError(
             f"Invalid `density` type: {density!r}. Possible values are 'all', 'group'."
         )
     # assumes x, y order in kdims
     return [
         hv.Overlay([
             hv.Distribution(df, kdims=dim).opts(color=cm.get(k, 'black'),
                                                 framewise=True)
             for k, df in dfs.items()
         ]) for dim in kdims
     ]
Example #9
    def _check_data(self) -> None:
        cat, cont = self._cat, self._cont
        cat_nobs = getattr(cat, "shape", (0, ))[0]
        cont_nobs = getattr(cont, "shape", (0, ))[0]
        nobs = max(cat_nobs, cont_nobs)
        if cat is None and cont is None:
            if self._nobs is not None:
                self._cont_data = self._cat_data = IVData(None,
                                                          "none",
                                                          nobs=self._nobs)
            else:
                raise ValueError(
                    "nobs must be provided when cat and cont are None")
            return
        self._nobs = nobs

        self._cat_data = IVData(cat, "cat", nobs=nobs, convert_dummies=False)
        self._cont_data = IVData(cont,
                                 "cont",
                                 nobs=nobs,
                                 convert_dummies=False)
        if self._cat_data.shape[1] == self._cont_data.shape[1] == 0:
            raise ValueError("Both cat and cont are empty arrays")
        cat_data = self._cat_data.pandas
        convert = [
            col for col in cat_data if not (is_categorical(cat_data[col]))
        ]
        if convert:
            cat_data = DataFrame(
                {col: cat_data[col].astype("category")
                 for col in cat_data})
            self._cat_data = IVData(cat_data, "cat", convert_dummies=False)
Example #10
def _rename_chroms(grp, rename_dict, h5opts):
    chroms = get(grp["chroms"]).set_index("name")
    n_chroms = len(chroms)
    new_names = np.array(chroms.rename(rename_dict).index.values,
                         dtype=CHROM_DTYPE)  # auto-adjusts char length

    del grp["chroms/name"]
    grp["chroms"].create_dataset("name",
                                 shape=(n_chroms, ),
                                 dtype=new_names.dtype,
                                 data=new_names,
                                 **h5opts)

    bins = get(grp["bins"])
    n_bins = len(bins)
    idmap = dict(zip(new_names, range(n_chroms)))
    if is_categorical(bins["chrom"]) or is_integer(bins["chrom"]):
        # integer columns already hold chrom IDs; categoricals store them as codes
        chrom_ids = (bins["chrom"].cat.codes if is_categorical(bins["chrom"])
                     else bins["chrom"].values)
        chrom_dtype = h5py.special_dtype(enum=(CHROMID_DTYPE, idmap))
        del grp["bins/chrom"]
        try:
            grp["bins"].create_dataset("chrom",
                                       shape=(n_bins, ),
                                       dtype=chrom_dtype,
                                       data=chrom_ids,
                                       **h5opts)
        except ValueError:
            # If HDF5 enum header would be too large,
            # try storing chrom IDs as raw int instead
            chrom_dtype = CHROMID_DTYPE
            grp["bins"].create_dataset("chrom",
                                       shape=(n_bins, ),
                                       dtype=chrom_dtype,
                                       data=chrom_ids,
                                       **h5opts)
Example #11
    def compute_group(cls, data, scales, **params):
        labels = ['x', 'y']
        X = np.array(data[labels])
        res = boxplot_stats(X, whis=params['coef'], labels=labels)[1]
        try:
            n = data['weight'].sum()
        except KeyError:
            n = len(data['y'])

        if len(np.unique(data['x'])) > 1:
            width = np.ptp(data['x']) * 0.9
        else:
            width = params['width']

        if pdtypes.is_categorical(data['x']):
            x = data['x'].iloc[0]
        else:
            x = np.mean([data['x'].min(), data['x'].max()])

        d = {
            'ymin': res['whislo'],
            'lower': res['q1'],
            'middle': [res['med']],
            'upper': res['q3'],
            'ymax': res['whishi'],
            'outliers': [res['fliers']],
            'notchupper': res['med'] + 1.58 * res['iqr'] / np.sqrt(n),
            'notchlower': res['med'] - 1.58 * res['iqr'] / np.sqrt(n),
            'x': x,
            'width': width,
            'relvarwidth': np.sqrt(n)
        }
        return pd.DataFrame(d)
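For reference, a quick standalone look at the `matplotlib.cbook.boxplot_stats` output that `compute_group` consumes (the keys read above are `whislo`, `q1`, `med`, `q3`, `whishi`, `iqr` and `fliers`):

import numpy as np
from matplotlib.cbook import boxplot_stats

X = np.column_stack([np.ones(20), np.arange(20.0)])
res = boxplot_stats(X, whis=1.5, labels=['x', 'y'])[1]  # stats for the 'y' column
print(res['q1'], res['med'], res['q3'], res['whislo'], res['whishi'])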
Example #12
def cat_concat(*args):
    """
    Concatenate categoricals and combine the categories

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated

    Examples
    --------
    >>> c1 = pd.Categorical(['a', 'b'], categories=['b', 'a'])
    >>> c2 = pd.Categorical(['d', 'a', 'c'])
    >>> cat_concat(c1, c2)
    [a, b, d, a, c]
    Categories (4, object): [b, a, c, d]

    Notes
    -----
    The resulting category is not ordered.
    """
    categories = pd.unique(
        list(
            chain(*(c.categories if pdtypes.is_categorical(c) else c
                    for c in args))))
    cs = pd.Categorical(list(chain(*(c for c in args))), categories=categories)
    return cs
Example #13
def convert_columns(s: Series, drop_first: bool) -> AnyPandas:
    if is_string_dtype(s.dtype) and s.map(is_string_like).all():
        s = s.astype("category")

    if is_categorical(s):
        out = get_dummies(s, drop_first=drop_first)
        out.columns = [str(s.name) + "." + str(c) for c in out]
        return out
    return s
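A quick check of the dummy-expansion path, assuming the same pandas imports (`get_dummies`) as the function above:

import pandas as pd
from pandas import get_dummies

s = pd.Series(["a", "b", "a"], name="color").astype("category")
out = get_dummies(s, drop_first=True)
out.columns = [str(s.name) + "." + str(c) for c in out]
print(list(out.columns))  # ['color.b'] -- the first category 'a' is dropped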
Example #14
def _fill_na_by_unique_value(
        strain: pd.Series,
        stest: Optional[pd.Series]) -> Tuple[pd.Series, pd.Series]:
    if is_categorical(strain):
        return strain.cat.codes, stest.cat.codes
    elif is_integer_dtype(strain.dtype):
        fillval = min(strain.min(), stest.min()) - 1
        return strain.fillna(fillval), stest.fillna(fillval)
    else:
        return strain.astype(str), stest.astype(str)
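A hand-run of the integer branch, using pandas' nullable Int64 dtype so that `is_integer_dtype` still holds in the presence of missing values (an assumption about the caller's data):

import pandas as pd

strain = pd.Series([3, 5, pd.NA], dtype="Int64")
stest = pd.Series([4, pd.NA, 7], dtype="Int64")
fillval = min(strain.min(), stest.min()) - 1  # 2: one below the global minimum
print(strain.fillna(fillval).tolist())  # [3, 5, 2] -- missing gets its own value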
Example #15
def _find_or_check_categorical_variables(X: pd.DataFrame,
                                         variables: Variables = None
                                         ) -> List[Union[str, int]]:
    """
    Checks that variables provided by the user are of type object or categorical.
    If None, finds all the categorical and object type variables in the DataFrame.

    Parameters
    ----------
    X : pandas DataFrame.
    variables : variable or list of variables. Defaults to None.

    Raises
    ------
    ValueError
        If there are no categorical variables in df or df is empty.
    TypeError
        If any of the user provided variables are not categorical.

    Returns
    -------
    variables : List of categorical variables.
    """

    if variables is None:
        # find categorical variables in dataset
        variables = [
            column
            for column in X.select_dtypes(include=["O", "category"]).columns
            if _is_categorical_and_is_not_datetime(X[column])
        ]
        if len(variables) == 0:
            raise ValueError(
                "No categorical variables found in this dataframe. Please check "
                "variable format with pandas dtypes.")

    elif isinstance(variables, (str, int)):
        if is_categorical(X[variables]) or is_object(X[variables]):
            variables = [variables]
        else:
            raise TypeError("The variable entered is not categorical.")

    else:
        if len(variables) == 0:
            raise ValueError("The list of variables is empty.")

        # check that user entered variables are of type categorical
        if len(X[variables].select_dtypes(
                exclude=["O", "category"]).columns) > 0:
            raise TypeError(
                "Some of the variables are not categorical. Please cast them as "
                "categorical or object before using this transformer.")

    return variables
Example #16
def autoprep_gbdt(
    algorithm_type: str,
    X_train: pd.DataFrame,
    X_test: Optional[pd.DataFrame],
    categorical_feature_to_treat: Optional[List[str]] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    if categorical_feature_to_treat is None:
        categorical_feature_to_treat = [
            c for c in X_train.columns
            if X_train[c].dtype.name in ['object', 'category']
        ]

    # LightGBM:
    # Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
    # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
    #
    # CatBoost:
    # int, float, bool or str is acceptable for categorical columns. NaN should be filled.
    # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
    #
    # XGBoost:
    # All categorical column should be encoded beforehand.

    if algorithm_type == 'lgbm':
        # LightGBM can handle categorical dtype natively
        categorical_feature_to_treat = [
            c for c in categorical_feature_to_treat
            if not is_categorical(X_train[c])
        ]

    if algorithm_type == 'cat' and len(categorical_feature_to_treat) > 0:
        X_train = X_train.copy()
        X_test = X_test.copy(
        ) if X_test is not None else X_train.iloc[:1, :].copy()  # dummy
        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(
                X_train[c], X_test[c])

    if algorithm_type in ('lgbm',
                          'xgb') and len(categorical_feature_to_treat) > 0:
        assert X_test is not None, "X_test is required for LightGBM/XGBoost with categorical variables"
        X_train = X_train.copy()
        X_test = X_test.copy()

        for c in categorical_feature_to_treat:
            X_train[c], X_test[c] = _fill_na_by_unique_value(
                X_train[c], X_test[c])
            le = LabelEncoder()
            concat = np.concatenate([X_train[c].values, X_test[c].values])
            concat = le.fit_transform(concat)
            X_train[c] = concat[:len(X_train)]
            X_test[c] = concat[len(X_train):]

    return X_train, X_test
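A hypothetical end-to-end call, assuming `_fill_na_by_unique_value` from Example #14 and scikit-learn's `LabelEncoder` are in scope:

import pandas as pd

X_train = pd.DataFrame({"cat": ["a", "b", None], "num": [1.0, 2.0, 3.0]})
X_test = pd.DataFrame({"cat": ["b", None], "num": [4.0, 5.0]})
X_train, X_test = autoprep_gbdt("xgb", X_train, X_test)
print(X_train["cat"].tolist())  # [1, 2, 0] -- label-encoded; None got its own code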
Example #17
def df_to_h5(df,
             h5_anno,
             anno_dataset=None,
             anno_gp_name=None,
             anno_gp_dataset=None):
    #to array
    cate = {}
    names = ['index']
    if is_string_dtype(df.index):
        index = df.index.values.astype(h5py.special_dtype(vlen=str))
    else:
        index = df.index.values
    arrays = [index]
    for k in df.keys():
        names.append(k)
        if is_string_dtype(df[k]) and not is_categorical(df[k]):
            arrays.append(df[k].values.astype(h5py.special_dtype(vlen=str)))
        elif is_categorical(df[k]):
            arrays.append(df[k].cat.codes)
            cate[k] = df[k].cat.categories
        else:
            arrays.append(df[k].values)
    dt = [d.dtype for d in arrays]
    h5_df = np.rec.fromarrays(arrays, dtype={'names': names, 'formats': dt})
    # to h5
    if not anno_gp_name:
        h5_anno_ds = h5_anno.create_dataset(anno_dataset, data=h5_df)
        for o in cate:
            h5_anno_ds.attrs[o] = cate[o].values.astype(
                h5py.special_dtype(vlen=str))
    else:
        if anno_gp_name not in h5_anno.keys():
            h5_anno_gp = h5_anno.create_group(anno_gp_name)
        else:
            h5_anno_gp = h5_anno[anno_gp_name]
        h5_anno_gp_ds = h5_anno_gp.create_dataset(anno_gp_dataset, data=h5_df)
        for p in cate:
            h5_anno_gp_ds.attrs[p] = cate[p].values.astype(
                h5py.special_dtype(vlen=str))
    return
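A hypothetical write with the helper above, assuming `h5py` is installed; categorical codes land in the record array and the category labels ride along as a dataset attribute:

import h5py
import pandas as pd

df = pd.DataFrame({"chrom": pd.Categorical(["chr1", "chr2"])},
                  index=["bin0", "bin1"])
with h5py.File("anno.h5", "w") as f:
    df_to_h5(df, f, anno_dataset="bins")
    print(list(f.keys()))         # ['bins']
    print(list(f["bins"].attrs))  # ['chrom'] -> the category labels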
Example #18
 def test_get_with_library_large_number_of_values(self):
     test_obj = create_node_population(
         str(TEST_DATA_DIR / 'nodes_with_library_large.h5'), "default")
     assert test_obj.property_names == {
         "categorical", "string", "int", "float"
     }
     res = test_obj.get(
         properties=["categorical", "string", "int", "float"])
     assert not is_categorical(res["categorical"])
     assert res["categorical"].tolist() == ['A', 'A', 'B', 'A']
     assert res["string"].tolist() == ["AA", "BB", "CC", "DD"]
     assert res["int"].tolist() == [0, 0, 1, 0]
     npt.assert_allclose(res["float"].tolist(), [0., 0., 1.1, 0.])
Example #19
def cat_infreq(c, ordered=None):
    """
    Reorder categorical by frequency of the values

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    ordered : bool
        If ``True``, the categorical is ordered.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> x = ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    >>> cat_infreq(x)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c, b, d, a]
    >>> cat_infreq(x, ordered=True)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c < b < d < a]

    When two or more values occur the same number of times, if the
    categorical is ordered, the order is preserved. If it is not
    ordered, the order depends on that of the values. Above 'd'
    comes before 'a', and below 'a' comes before 'd'.

    >>> c = pd.Categorical(
    ...     x, categories=['a', 'c', 'b', 'd']
    ... )
    >>> cat_infreq(c)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c, b, a, d]
    >>> cat_infreq(c.set_ordered(True))
    [d, a, b, b, c, c, c]
    Categories (4, object): [c < b < a < d]
    """
    kwargs = {} if ordered is None else {'ordered': ordered}
    counts = value_counts(c)
    if pdtypes.is_categorical(c):
        original_cat_order = c.categories
    else:
        original_cat_order = pd.unique(c)
    counts = counts.reindex(index=original_cat_order)
    cats = (_stable_series_sort(counts, ascending=False).index.to_list())
    return pd.Categorical(c, categories=cats, **kwargs)
Example #20
def _is_categorical_and_is_datetime(column: pd.Series) -> bool:

    # check for datetime only if object cannot be cast as numeric because
    # if it could pd.to_datetime would convert it to datetime regardless
    if is_object(column):
        is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(
            column)

    # check for datetime only if the type of the categories is not numeric
    # because pd.to_datetime throws an error when it is an integer
    elif is_categorical(column):
        is_dt = not _is_categories_num(column) and _is_convertible_to_dt(
            column)

    else:
        # neither object nor categorical: is_dt would otherwise be unbound
        is_dt = False

    return is_dt
Example #21
def test_readwrite_zarr(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.raw = adata_src
    assert not is_categorical(adata_src.obs["oanno1"])
    adata_src.write_zarr(tmp_path / "test_zarr_dir", chunks=True)

    adata = ad.read_zarr(tmp_path / "test_zarr_dir")
    assert is_categorical(adata.obs["oanno1"])
    assert not is_categorical(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert is_categorical(adata.raw.var["vanno2"])
    assert np.all(adata.obs == adata_src.obs)
    assert np.all(adata.var == adata_src.var)
    assert np.all(adata.var.index == adata_src.var.index)
    assert adata.var.index.dtype == adata_src.var.index.dtype
    assert type(adata.raw.X) is type(adata_src.raw.X)
    assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
    assert np.all(adata.raw.var == adata_src.raw.var)
    assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
    assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"])
    assert_equal(adata, adata_src)
Example #22
 def plot_descriptive_graphs_for_column(self,
                                        dataframe,
                                        column,
                                        outliers_ind=None,
                                        show=False):
     """
     :param dataframe: Input dataframe
     :param column: Column to plot
     :param outliers_ind: Boolean array for indexing outliers
     :param show: Whether to show the graphs when running the code
     :return: Mapping of graph types to the axes they were plotted on
     """
     graph_type_to_graph = {}
     data_to_plot = dataframe[column]
     if is_numeric_dtype(dataframe[column]):
         cat = 'numerical'
         if isinstance(outliers_ind, np.ndarray):
             data_to_plot = dataframe.loc[~outliers_ind, column]
     elif is_categorical(dataframe[column]):
         cat = 'categorical'
         data_to_plot = dataframe[column].value_counts()
     elif is_datetime64_any_dtype(
             dataframe[column]) or is_datetime64tz_dtype(dataframe[column]):
         cat = 'datetime'
         data_to_plot = pd.Series(dates.date2num(data_to_plot))
     else:
         self.logger.warning(
             '''Column "{}" could not be plotted because of {} (generic) type.
 Please convert it to a categorical, date or numerical type. String columns cannot be plotted!'''
             .format(column, dataframe[column].dtype))
         return {}
     n_graphs = len(self.graph_per_category[cat])
     fig, axes = plt.subplots(int(np.ceil(n_graphs / 2)),
                              int(np.ceil(n_graphs / 2)),
                              squeeze=0)
     fig.suptitle(column, fontsize='large')
     fig.tight_layout()
     plt.subplots_adjust(top=0.82)
     axes = axes.flatten()
     for i, graph in enumerate(self.graph_per_category[cat]):
         plot = data_to_plot.plot(kind=graph, ax=axes[i])
         axes[i].set_title(graph)
         if cat == 'datetime':
             axes[i].xaxis.set_major_formatter(self._num_to_date)
         graph_type_to_graph[graph] = axes[i]
     if show:
         plt.show(block=False)
     return graph_type_to_graph
Example #23
    def label_prob(self, tup, label):
        '''
        Give the probability of the label given tuple `tup`.
        '''
        masked_idxs = {
            i
            for i, c in enumerate(self._df)
            if c == self._label_cls or not is_categorical(self._df[c])
        }

        prior_prob = self.label_df.value_counts()[label] / len(self._df)

        probs = (self.probability(ft, val, label)
                 for i, (ft, val) in enumerate(zip(self._df, tup))
                 if i not in masked_idxs)

        return prior_prob * reduce(mul, probs)
Example #24
    def _normalize_column(self,
                          data,
                          coerce_dtype=None,
                          store_categories=True):
        """
        Make column suitable for HDF5 storage.

        * numerical and boolean types map as they should
        * bytes columns map to type S arrays -- they won't roundtrip
        * str or object columns map to type S arrays
        * categorical columns:
            * make an ENUM type (may end up being too large for HDF5 to accept)

        """
        if coerce_dtype is not None:
            coerce_dtype = np.dtype(coerce_dtype)

        if np.isscalar(data):
            array = np.array([data], dtype=coerce_dtype)
            dtype = array.dtype  # a plain Python scalar has no .dtype attribute
            fillvalue = None

        elif is_categorical(data):
            if store_categories:
                cats = data.cat.categories
                enum_dict = dict(zip(cats, range(len(cats))))
                array = data.cat.codes
                dtype = h5py.special_dtype(enum=(array.dtype, enum_dict))
                fillvalue = -1
            else:
                array = data.cat.codes
                dtype = coerce_dtype or array.dtype
                fillvalue = -1

        elif data.dtype in (object, str, bytes):
            data = np.asarray(data)
            dtype = np.dtype("S")
            array = np.array(data, dtype=dtype)
            fillvalue = None

        else:
            array = np.asarray(data)
            dtype = data.dtype
            fillvalue = None

        return array, dtype, fillvalue
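A standalone sketch of the categorical branch above: pandas category labels become an HDF5 ENUM dtype and the stored values are the integer codes, with -1 reserved for missing entries:

import h5py
import pandas as pd

data = pd.Series(pd.Categorical(["chr1", "chr2", "chr1"]))
cats = data.cat.categories
enum_dict = dict(zip(cats, range(len(cats))))
array = data.cat.codes
dtype = h5py.special_dtype(enum=(array.dtype, enum_dict))
print(enum_dict)         # {'chr1': 0, 'chr2': 1}
print(array.to_numpy())  # [0 1 0]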
Example #25
    def ordinal(arr):
        """
        Return True if array is an ordered categorical

        Parameters
        ----------
        arr : numpy.array
            Must have a dtype

        Returns
        -------
        out : bool
            Whether array `arr` is an ordered categorical
        """
        if pdtypes.is_categorical(arr):
            return arr.cat.ordered
        return False
Example #26
def category_product(cats: AnyPandas) -> Series:
    """
    Construct a category from all combinations of input categories

    Parameters
    ----------
    cats : {Series, DataFrame}
        DataFrame containing categorical variables.  If cats is a Series, cats
        is returned unmodified.

    Returns
    -------
    Series
        Categorical series containing the cartesian product of the categories
        in cats
    """
    if isinstance(cats, Series):
        return cats

    sizes = []
    for c in cats:
        if not is_categorical(cats[c]):
            raise TypeError("cats must contain only categorical variables")
        col = cats[c]
        max_code = get_codes(col.cat).max()
        size = 1
        while max_code >= 2**size:
            size += 1
        sizes.append(size)
    nobs = cats.shape[0]
    total_size = sum(sizes)
    if total_size >= 63:
        raise ValueError(
            "There are too many cats with too many states to use this method.")
    dtype_size = min(filter(lambda v: total_size < (v - 1), (8, 16, 32, 64)))
    dtype_str = "int{0:d}".format(dtype_size)
    dtype_val = dtype(dtype_str)
    codes = zeros(nobs, dtype=dtype_val)
    cum_size = 0
    for i, col in enumerate(cats):
        codes += get_codes(cats[col].cat).astype(
            dtype_val) << SCALAR_DTYPES[dtype_str](cum_size)
        cum_size += sizes[i]
    return Series(Categorical(codes), index=cats.index)
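A worked example of the bit-packing scheme above: a column whose largest code is 2 needs 2 bits, a binary column needs 1 bit, so the second column's codes are shifted past the first's bits and every category pair maps to a unique integer:

import pandas as pd

df = pd.DataFrame({
    "c0": pd.Categorical(["a", "b", "c", "a"]),  # codes 0..2 -> 2 bits
    "c1": pd.Categorical(["x", "y", "x", "y"]),  # codes 0..1 -> 1 bit
})
codes = (df["c0"].cat.codes.astype("int64")
         + (df["c1"].cat.codes.astype("int64") << 2))
print(codes.tolist())  # [0, 5, 2, 4] -- all four observed pairs are distinct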
Example #27
    def column_is_categorical(self, col):
        '''
        Check if a column in self.data is categorical or not

        Parameters
        ----------
        col : str
            column to check

        Returns
        -------
        flag : bool

        '''
        if col not in self.data.columns:
            log.error('{} is not present in the data'.format(col))
            raise ValueError('{} is not present in the data'.format(col))
        else:
            return (self.data[col].dtypes == np.dtype('O')) \
                   or is_bool_dtype(self.data[col].dtypes)\
                   or is_categorical(self.data[col].dtypes)
Example #28
def cat_remove_unused(c, only=None):
    """
    Remove unused categories

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    only : list-like (optional)
        The categories to remove *if* they are empty. If not given,
        all unused categories are dropped.

    Examples
    --------
    >>> c = pd.Categorical(list('abcdd'), categories=list('bacdefg'))
    >>> c
    [a, b, c, d, d]
    Categories (7, object): [b, a, c, d, e, f, g]
    >>> cat_remove_unused(c)
    [a, b, c, d, d]
    Categories (4, object): [b, a, c, d]
    >>> cat_remove_unused(c, only=['a', 'e', 'g'])
    [a, b, c, d, d]
    Categories (5, object): [b, a, c, d, f]
    """
    if not pdtypes.is_categorical(c):
        # All categories are used
        c = pd.Categorical(c)
        return c
    else:
        c = c.copy()

    if only is None:
        only = c.categories

    used_idx = pd.unique(c.codes)
    used_idx = used_idx[used_idx != -1]  # -1 is the code pandas uses for NaN
    used_categories = c.categories[used_idx]
    c = c.remove_categories(
        c.categories.difference(used_categories).intersection(only))
    return c
Example #29
    def from_frame(frame: DataFrame) -> "Interaction":
        """
        Convenience function that simplifies using a DataFrame

        Parameters
        ----------
        frame : DataFrame
            Frame containing categorical and continuous variables. All
            categorical variables are passed to `cat` and all other
            variables are passed as `cont`.

        Returns
        -------
        Interaction
            Instance using the columns of frame

        Examples
        --------
        >>> import numpy as np
        >>> from linearmodels.iv.absorbing import Interaction
        >>> import pandas as pd
        >>> rs = np.random.RandomState(0)
        >>> n = 100000
        >>> cats = pd.concat([pd.Series(pd.Categorical(rs.randint(i+2,size=n)))
        ...                  for i in range(4)],1)
        >>> cats.columns = ['cat{0}'.format(i) for i in range(4)]
        >>> columns = ['cont{0}'.format(i) for i in range(6)]
        >>> cont = pd.DataFrame(rs.standard_normal((n, 6)), columns=columns)
        >>> frame = pd.concat([cats, cont], 1)
        >>> interact = Interaction.from_frame(frame)
        >>> interact.sparse.shape # Cart product of all cats, 5!, times ncont, 6
        (100000, 720)
        """
        cat_cols = [col for col in frame if is_categorical(frame[col])]
        cont_cols = [col for col in frame if col not in cat_cols]
        return Interaction(frame[cat_cols],
                           frame[cont_cols],
                           nobs=frame.shape[0])
Example #30
def cat_zip(*args, sep=':', keep_empty=False):
    """
    Create a new categorical (zip style) combined from two or more

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated.
    sep : str (default: ':')
        Separator for the combined categories.
    keep_empty : bool (default: False)
        If ``True``, include all combinations of categories
        even those without observations.

    Examples
    --------
    >>> c1 = pd.Categorical(list('aba'))
    >>> c2 = pd.Categorical(list('122'))
    >>> cat_zip(c1, c2)
    [a:1, b:2, a:2]
    Categories (3, object): [a:1, a:2, b:2]
    >>> cat_zip(c1, c2, keep_empty=True)
    [a:1, b:2, a:2]
    Categories (4, object): [a:1, a:2, b:1, b:2]
    """
    values = [sep.join(items) for items in zip(*args)]
    cs = [c if pdtypes.is_categorical(c) else pd.Categorical(c) for c in args]
    categories = [
        sep.join(items) for items in product(*(c.categories for c in cs))
    ]

    c = pd.Categorical(values, categories=categories)

    if not keep_empty:
        # Categorical.remove_unused_categories no longer supports inplace=True
        c = c.remove_unused_categories()

    return c
Example #31
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical(series):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=float)  # np.float was removed in NumPy 1.24
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)
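A hypothetical round trip, assuming Orange3 is installed and `pandas_to_table` is in scope:

import pandas as pd

df = pd.DataFrame({
    "group": pd.Categorical(["a", "b", "a"]),
    "value": [1.0, 2.5, 3.0],
})
table = pandas_to_table(df)
print(table.domain)   # discrete 'group' plus continuous 'value'
print(table.X.shape)  # (3, 2) -- both variables are primitive, so both go to X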