def test_readwrite_h5ad(typ, dataset_kwargs, backing_h5ad):
    """Write an AnnData object to h5ad twice and compare the result to the source.

    The source object is written to ``backing_h5ad``, read back, written again
    to a scratch file, and the object read from that second file is checked
    against the in-memory original.

    ``typ`` builds ``X`` from ``X_list``; ``dataset_kwargs`` is forwarded to
    both ``write`` calls.
    """
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    assert not is_categorical(adata_src.obs["oanno1"])
    adata_src.raw = adata_src
    adata_src.write(backing_h5ad, **dataset_kwargs)

    # The context manager removes the scratch directory deterministically,
    # including when one of the assertions below fails.
    with tempfile.TemporaryDirectory() as tmpdir:
        mid_pth = Path(tmpdir) / "mid.h5ad"

        adata_mid = ad.read(backing_h5ad)
        adata_mid.write(mid_pth, **dataset_kwargs)
        adata = ad.read_h5ad(mid_pth)

        # obs annotations after the double round trip
        assert is_categorical(adata.obs["oanno1"])
        assert not is_categorical(adata.obs["oanno2"])
        assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
        assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
        assert is_categorical(adata.raw.var["vanno2"])

        # obs / var frames match the source
        assert np.all(adata.obs == adata_src.obs)
        assert np.all(adata.var == adata_src.var)
        assert np.all(adata.var.index == adata_src.var.index)
        assert adata.var.index.dtype == adata_src.var.index.dtype

        # raw keeps container types and values
        assert type(adata.raw.X) is type(adata_src.raw.X)
        assert type(adata.raw.varm) is type(adata_src.raw.varm)
        assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
        assert np.all(adata.raw.var == adata_src.raw.var)

        # nested uns entries keep their types
        assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
        assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
        assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"])

        assert_equal(adata, adata_src)
def test_readwrite_dynamic(typ, backing_h5ad):
    """Write an AnnData object through its backing file and check the obs read back."""
    source = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict)
    # change to backed mode
    source.filename = backing_h5ad
    source.write()

    restored_obs = ad.read(backing_h5ad).obs
    first_annotation = restored_obs['oanno1']

    assert is_categorical(first_annotation)
    assert not is_categorical(restored_obs['oanno2'])
    assert restored_obs.index.tolist() == ['name1', 'name2', 'name3']
    assert first_annotation.cat.categories.tolist() == ['cat1', 'cat2']
def test_readwrite_zarr(typ, tmp_path):
    """Write an AnnData object to a zarr directory and check the obs read back."""
    zarr_dir = tmp_path / 'test_zarr_dir'
    source = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict)
    # 'oanno1' is not categorical before the object is written
    assert not is_categorical(source.obs['oanno1'])
    source.write_zarr(zarr_dir, chunks=True)

    restored_obs = ad.read_zarr(zarr_dir).obs
    first_annotation = restored_obs['oanno1']

    assert is_categorical(first_annotation)
    assert not is_categorical(restored_obs['oanno2'])
    assert restored_obs.index.tolist() == ['name1', 'name2', 'name3']
    assert first_annotation.cat.categories.tolist() == ['cat1', 'cat2']
def test_readwrite_backed(typ, backing_h5ad):
    """Write an AnnData object through its backing file and compare it to the source."""
    source = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict)
    # change to backed mode
    source.filename = backing_h5ad
    source.write()

    restored = ad.read(backing_h5ad)
    restored_obs = restored.obs
    first_annotation = restored_obs["oanno1"]

    assert is_categorical(first_annotation)
    assert not is_categorical(restored_obs["oanno2"])
    assert restored_obs.index.tolist() == ["name1", "name2", "name3"]
    assert first_annotation.cat.categories.tolist() == ["cat1", "cat2"]
    assert_equal(restored, source)
def test_readwrite_h5ad(typ, backing_h5ad):
    """Write an AnnData object (with ``raw`` set) to h5ad and check what is read back."""
    source = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict)
    # 'oanno1' is not categorical before the object is written
    assert not is_categorical(source.obs['oanno1'])
    source.raw = source
    source.write(backing_h5ad)

    restored = ad.read(backing_h5ad)
    restored_obs = restored.obs
    first_annotation = restored_obs['oanno1']

    assert is_categorical(first_annotation)
    assert not is_categorical(restored_obs['oanno2'])
    assert restored_obs.index.tolist() == ['name1', 'name2', 'name3']
    assert first_annotation.cat.categories.tolist() == ['cat1', 'cat2']
    assert is_categorical(restored.raw.var['vanno2'])
def _transform_pandas_df(data, enable_categorical, feature_names=None,
                         feature_types=None, meta=None, meta_type=None):
    """Validate a DataFrame's dtypes and convert it to a contiguous NumPy array.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert.
    enable_categorical : bool
        Whether categorical columns are accepted.
    feature_names : list, optional
        Returned unchanged when given. When ``None`` (and ``meta`` is ``None``)
        the names are derived from ``data.columns``.
    feature_types : list, optional
        Returned unchanged when given. When ``None`` (and ``meta`` is ``None``)
        the types are derived from the column dtypes.
    meta : optional
        When truthy, ``data`` may have at most one column; the value is only
        used in the error message. When not ``None``, no names/types are
        derived.
    meta_type : optional
        dtype of the returned array; ``np.float32`` is used when falsy.

    Returns
    -------
    tuple
        ``(array, feature_names, feature_types)``.

    Raises
    ------
    ValueError
        If a column has an unsupported dtype, or ``meta`` is set and the frame
        has more than one column.
    """
    from pandas import MultiIndex, Int64Index
    from pandas.api.types import is_sparse, is_categorical

    def _is_supported(dtype):
        # Single acceptance rule shared by the validation and the error
        # report, so accepted sparse/categorical columns are never listed
        # as invalid.
        return (dtype.name in _pandas_dtype_mapper
                or is_sparse(dtype)
                or (is_categorical(dtype) and enable_categorical))

    data_dtypes = data.dtypes
    bad_fields = [
        str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
        if not _is_supported(dtype)
    ]
    if bad_fields:
        msg = ("DataFrame.dtypes for data must be int, float, bool or "
               "categorical. When categorical type is supplied, DMatrix "
               "parameter `enable_categorical` must be set to `True`.")
        raise ValueError(msg + ', '.join(bad_fields))

    if feature_names is None and meta is None:
        if isinstance(data.columns, MultiIndex):
            # Flatten each column tuple into one space-separated name
            feature_names = [
                ' '.join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, Int64Index):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()

    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data_dtypes:
            if is_sparse(dtype):
                # Sparse columns are typed by their underlying subtype
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif is_categorical(dtype) and enable_categorical:
                feature_types.append('categorical')
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])

    if meta and len(data.columns) > 1:
        raise ValueError(
            'DataFrame for {meta} cannot have multiple columns'.format(
                meta=meta))

    dtype = meta_type if meta_type else np.float32
    data = np.ascontiguousarray(data.values, dtype=dtype)

    return data, feature_names, feature_types
def create_array(s, t):
    """Convert a pandas Series into an Arrow array of type ``t``.

    ``self`` is not a parameter; it is resolved from the enclosing scope and
    supplies ``_timezone`` and ``_safecheck``.

    Raises ``ValueError``: the one raised by the Arrow conversion, wrapped
    with a hint about the safe-check setting when ``self._safecheck`` is on.
    """
    null_mask = s.isnull()
    if t is not None and pa.types.is_timestamp(t):
        # Ensure timestamp series are in expected form for Spark internal representation
        s = _check_series_convert_timestamps_internal(s, self._timezone)
    elif is_categorical(s.dtype):
        # Note: This can be removed once minimum pyarrow version is >= 0.16.1
        s = s.astype(s.dtypes.categories.dtype)

    try:
        return pa.Array.from_pandas(s, mask=null_mask, type=t,
                                    safe=self._safecheck)
    except ValueError as e:
        if not self._safecheck:
            raise e
        error_msg = (
            "Exception thrown when converting pandas.Series (%s) to "
            "Arrow Array (%s). It can be caused by overflows or other "
            "unsafe conversions warned by Arrow. Arrow safe type check "
            "can be disabled by using SQL config "
            "`spark.sql.execution.pandas.convertToArrowArraySafely`."
        )
        raise ValueError(error_msg % (s.dtype, t)) from e
def create_density_plots(df, density, kdims, cmap):
    """Build one density overlay per key dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; the column ``'z'`` is used for grouping.
    density : str
        ``'all'`` for one distribution over the whole frame, ``'group'`` for
        one distribution per group of ``df['z']``.
    kdims : iterable
        Dimensions to plot; one overlay is returned per entry.
    cmap : mapping
        Group key to color; only used when grouping succeeds.

    Returns
    -------
    list
        One ``hv.Overlay`` per entry of ``kdims``.

    Raises
    ------
    ValueError
        If ``density`` is neither ``'all'`` nor ``'group'``.
    """
    cm = {}
    if density == 'all':
        dfs = {_sentinel: df}
    elif density == 'group':
        if 'z' not in df.columns:
            warnings.warn(
                "`density='groups' was specified, but no group found. Did you specify `color=...`?"
            )
            dfs = {_sentinel: df}
        elif not is_categorical(df['z']):
            # The message used to interpolate `condition`, a name that is not
            # bound in this function; the column being checked is 'z'.
            warnings.warn(
                "`density='groups' was specified, but column `z` is not categorical."
            )
            dfs = {_sentinel: df}
        else:
            dfs = {k: v for k, v in df.groupby('z')}
            cm = cmap
    else:
        raise ValueError(
            f"Invalid `density` type: '`{density}`'. Possible values are `'all'`, `'group'`."
        )

    # assumes x, y order in kdims
    return [
        hv.Overlay([
            hv.Distribution(group_df, kdims=dim).opts(
                color=cm.get(k, 'black'), framewise=True)
            for k, group_df in dfs.items()
        ])
        for dim in kdims
    ]
def _check_data(self) -> None:
    """Wrap ``self._cat`` / ``self._cont`` in ``IVData`` and validate them.

    Sets ``self._cat_data`` and ``self._cont_data`` (and ``self._nobs`` when
    data is present). Non-categorical columns of the categorical data are
    converted to the ``"category"`` dtype.

    Raises ``ValueError`` when both inputs are ``None`` and no ``nobs`` is
    stored, or when both wrapped inputs have zero columns.
    """
    categorical, continuous = self._cat, self._cont
    # Objects without a ``shape`` (e.g. None) count as zero observations
    nobs = max(
        getattr(categorical, "shape", (0, ))[0],
        getattr(continuous, "shape", (0, ))[0],
    )

    if categorical is None and continuous is None:
        if self._nobs is None:
            raise ValueError(
                "nobs must be provided when cat and cont are None")
        self._cont_data = self._cat_data = IVData(None, "none",
                                                  nobs=self._nobs)
        return

    self._nobs = nobs
    self._cat_data = IVData(categorical, "cat", nobs=nobs,
                            convert_dummies=False)
    self._cont_data = IVData(continuous, "cont", nobs=nobs,
                             convert_dummies=False)
    if self._cat_data.shape[1] == self._cont_data.shape[1] == 0:
        raise ValueError("Both cat and cont are empty arrays")

    frame = self._cat_data.pandas
    if any(not is_categorical(frame[name]) for name in frame):
        # At least one column is not categorical: cast every column
        frame = DataFrame(
            {name: frame[name].astype("category") for name in frame})
        self._cat_data = IVData(frame, "cat", convert_dummies=False)
def _rename_chroms(grp, rename_dict, h5opts):
    """Rename chromosomes in place inside an HDF5 group.

    Rewrites ``chroms/name`` with the renamed labels and, when the bin table's
    ``chrom`` column is categorical or integer, rewrites ``bins/chrom`` with
    an enum dtype built from the new names.

    Parameters
    ----------
    grp : h5py group-like
        Group holding the ``chroms`` and ``bins`` sub-groups.
    rename_dict : mapping
        Old name to new name, passed to ``DataFrame.rename``.
    h5opts : dict
        Extra keyword arguments forwarded to ``create_dataset``.
    """
    chroms = get(grp["chroms"]).set_index("name")
    n_chroms = len(chroms)
    new_names = np.array(chroms.rename(rename_dict).index.values,
                         dtype=CHROM_DTYPE)  # auto-adjusts char length

    del grp["chroms/name"]
    grp["chroms"].create_dataset("name",
                                 shape=(n_chroms, ),
                                 dtype=new_names.dtype,
                                 data=new_names,
                                 **h5opts)

    bins = get(grp["bins"])
    n_bins = len(bins)
    idmap = dict(zip(new_names, range(n_chroms)))
    chrom_col = bins["chrom"]
    chrom_is_categorical = is_categorical(chrom_col)
    if chrom_is_categorical or is_integer(chrom_col):
        # Only categorical columns have a `.cat` accessor; a column admitted
        # by the integer check is used as the chromosome IDs directly.
        chrom_ids = chrom_col.cat.codes if chrom_is_categorical else chrom_col
        chrom_dtype = h5py.special_dtype(enum=(CHROMID_DTYPE, idmap))
        del grp["bins/chrom"]
        try:
            grp["bins"].create_dataset("chrom",
                                       shape=(n_bins, ),
                                       dtype=chrom_dtype,
                                       data=chrom_ids,
                                       **h5opts)
        except ValueError:
            # If HDF5 enum header would be too large,
            # try storing chrom IDs as raw int instead
            chrom_dtype = CHROMID_DTYPE
            grp["bins"].create_dataset("chrom",
                                       shape=(n_bins, ),
                                       dtype=chrom_dtype,
                                       data=chrom_ids,
                                       **h5opts)
def compute_group(cls, data, scales, **params):
    """Compute the boxplot statistics for one group.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'x'`` and ``'y'``; ``'weight'`` is optional.
    scales :
        Not used by this computation.
    **params :
        ``coef`` is passed as ``whis`` to ``boxplot_stats``; ``width`` is
        used when ``x`` has a single unique value.

    Returns
    -------
    pandas.DataFrame
        Whisker ends, quartiles, median, outliers, notch limits, position,
        width and ``relvarwidth``.
    """
    labels = ['x', 'y']
    X = np.array(data[labels])
    # Index 1 picks the second entry of the result, matching the order of
    # `labels` (the 'y' column)
    res = boxplot_stats(X, whis=params['coef'], labels=labels)[1]

    try:
        n = data['weight'].sum()
    except KeyError:
        n = len(data['y'])

    if len(np.unique(data['x'])) > 1:
        width = np.ptp(data['x']) * 0.9
    else:
        width = params['width']

    # `pdtypes.is_categorical` was removed in pandas 2.0; test the dtype
    if isinstance(data['x'].dtype, pdtypes.CategoricalDtype):
        x = data['x'].iloc[0]
    else:
        x = np.mean([data['x'].min(), data['x'].max()])

    d = {
        'ymin': res['whislo'],
        'lower': res['q1'],
        'middle': [res['med']],
        'upper': res['q3'],
        'ymax': res['whishi'],
        'outliers': [res['fliers']],
        'notchupper': res['med'] + 1.58 * res['iqr'] / np.sqrt(n),
        'notchlower': res['med'] - 1.58 * res['iqr'] / np.sqrt(n),
        'x': x,
        'width': width,
        'relvarwidth': np.sqrt(n)
    }
    return pd.DataFrame(d)
def cat_concat(*args):
    """
    Concatenate categoricals and combine the categories

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated

    Examples
    --------
    >>> c1 = pd.Categorical(['a', 'b'], categories=['b', 'a'])
    >>> c2 = pd.Categorical(['d', 'a', 'c'])
    >>> cat_concat(c1, c2)
    [a, b, d, a, c]
    Categories (4, object): [b, a, c, d]

    Notes
    -----
    The resulting category is not ordered.
    """
    def _is_categorical(c):
        # `pdtypes.is_categorical` was removed in pandas 2.0; test the dtype
        return isinstance(getattr(c, 'dtype', None),
                          pdtypes.CategoricalDtype)

    # Categoricals contribute their categories (in category order); any other
    # argument contributes its values
    candidates = list(chain.from_iterable(
        c.categories if _is_categorical(c) else c for c in args))
    # Wrapped in a Series because `pd.unique` on a plain list is deprecated
    categories = pd.unique(pd.Series(candidates))
    cs = pd.Categorical(list(chain.from_iterable(args)),
                        categories=categories)
    return cs
def convert_columns(s: Series, drop_first: bool) -> AnyPandas:
    """Expand a categorical (or all-string) Series into dummy columns.

    A string-dtype Series whose values are all string-like is first cast to
    ``"category"``. Categorical input is returned as a dummy-variable frame
    with columns named ``"<series name>.<category>"``; anything else is
    returned unchanged.
    """
    all_strings = is_string_dtype(s.dtype) and s.map(is_string_like).all()
    if all_strings:
        s = s.astype("category")

    if not is_categorical(s):
        return s

    dummies = get_dummies(s, drop_first=drop_first)
    prefix = str(s.name)
    dummies.columns = [prefix + "." + str(level) for level in dummies]
    return dummies
def _fill_na_by_unique_value( strain: pd.Series, stest: Optional[pd.Series]) -> Tuple[pd.Series, pd.Series]: if is_categorical(strain): return strain.cat.codes, stest.cat.codes elif is_integer_dtype(strain.dtype): fillval = min(strain.min(), stest.min()) - 1 return strain.fillna(fillval), stest.fillna(fillval) else: return strain.astype(str), stest.astype(str)
def _find_or_check_categorical_variables(X: pd.DataFrame,
                                         variables: Variables = None
                                         ) -> List[Union[str, int]]:
    """
    Validate user-supplied categorical variables, or discover them.

    With ``variables=None`` every object/categorical column that passes
    ``_is_categorical_and_is_not_datetime`` is selected. Otherwise the given
    variable(s) must be of type object or categorical.

    Parameters
    ----------
    X : pandas DataFrame.
    variables : variable or list of variables. Defaults to None.

    Raises
    ------
    ValueError
        If no categorical variables are found, or the supplied list is empty.
    TypeError
        If any of the user provided variables are not categorical.

    Returns
    -------
    variables : List of categorical variables.
    """
    if variables is None:
        # find categorical variables in dataset
        candidates = X.select_dtypes(include=["O", "category"]).columns
        variables = [
            name for name in candidates
            if _is_categorical_and_is_not_datetime(X[name])
        ]
        if len(variables) == 0:
            raise ValueError(
                "No categorical variables found in this dataframe. Please check "
                "variable format with pandas dtypes.")

    elif isinstance(variables, (str, int)):
        if not (is_categorical(X[variables]) or is_object(X[variables])):
            raise TypeError("The variable entered is not categorical.")
        variables = [variables]

    elif len(variables) == 0:
        raise ValueError("The list of variables is empty.")

    # check that user entered variables are of type categorical
    elif len(X[variables].select_dtypes(
            exclude=["O", "category"]).columns) > 0:
        raise TypeError(
            "Some of the variables are not categorical. Please cast them as "
            "categorical or object before using this transformer.")

    return variables
def autoprep_gbdt(
    algorithm_type: str,
    X_train: pd.DataFrame,
    X_test: Optional[pd.DataFrame],
    categorical_feature_to_treat: Optional[List[str]] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Prepare categorical columns for a GBDT library.

    ``algorithm_type`` selects the treatment: ``'cat'`` only makes missing
    values explicit, ``'lgbm'`` and ``'xgb'`` additionally label-encode the
    columns over train and test together. When no column list is given, all
    ``object`` / ``category`` columns of ``X_train`` are used. The inputs are
    copied before being modified.

    Returns the (possibly copied and encoded) ``X_train`` and ``X_test``.
    """
    if categorical_feature_to_treat is None:
        categorical_feature_to_treat = [
            name for name in X_train.columns
            if X_train[name].dtype.name in ['object', 'category']
        ]

    # LightGBM:
    # Can handle categorical dtype. Otherwise, int, float or bool is acceptable for categorical columns.
    # https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support
    #
    # CatBoost:
    # int, float, bool or str is acceptable for categorical columns. NaN should be filled.
    # https://catboost.ai/docs/concepts/faq.html#why-float-and-nan-values-are-forbidden-for-cat-features
    #
    # XGBoost:
    # All categorical column should be encoded beforehand.

    if algorithm_type == 'lgbm':
        # LightGBM can handle categorical dtype natively
        categorical_feature_to_treat = [
            name for name in categorical_feature_to_treat
            if not is_categorical(X_train[name])
        ]

    has_columns = len(categorical_feature_to_treat) > 0

    if algorithm_type == 'cat' and has_columns:
        X_train = X_train.copy()
        if X_test is not None:
            X_test = X_test.copy()
        else:
            X_test = X_train.iloc[:1, :].copy()  # dummy

        for name in categorical_feature_to_treat:
            X_train[name], X_test[name] = _fill_na_by_unique_value(
                X_train[name], X_test[name])

    if algorithm_type in ('lgbm', 'xgb') and has_columns:
        assert X_test is not None, "X_test is required for XGBoost with categorical variables"
        X_train = X_train.copy()
        X_test = X_test.copy()
        n_train = len(X_train)

        for name in categorical_feature_to_treat:
            X_train[name], X_test[name] = _fill_na_by_unique_value(
                X_train[name], X_test[name])
            # Fit one encoder on train and test together so both share labels
            stacked = np.concatenate(
                [X_train[name].values, X_test[name].values])
            encoded = LabelEncoder().fit_transform(stacked)
            X_train[name] = encoded[:n_train]
            X_test[name] = encoded[n_train:]

    return X_train, X_test
def df_to_h5(df, h5_anno, anno_dataset=None, anno_gp_name=None,
             anno_gp_dataset=None):
    """Store a DataFrame as a record-array dataset in an HDF5 container.

    The index becomes the ``'index'`` field. String columns are stored as
    variable-length strings, categorical columns as their codes with the
    categories saved as a dataset attribute named after the column.

    The dataset is created as ``h5_anno[anno_dataset]`` when ``anno_gp_name``
    is falsy, otherwise as ``anno_gp_dataset`` inside the group
    ``anno_gp_name`` (created if missing).
    """
    vlen_str = h5py.special_dtype(vlen=str)

    # to array
    index_values = df.index.values
    if is_string_dtype(df.index):
        index_values = index_values.astype(vlen_str)

    names = ['index']
    arrays = [index_values]
    cate = {}
    for key in df.keys():
        column = df[key]
        names.append(key)
        if is_categorical(column):
            arrays.append(column.cat.codes)
            cate[key] = column.cat.categories
        elif is_string_dtype(column):
            arrays.append(column.values.astype(vlen_str))
        else:
            arrays.append(column.values)

    formats = [field.dtype for field in arrays]
    h5_df = np.rec.fromarrays(arrays,
                              dtype={'names': names, 'formats': formats})

    # to h5
    if not anno_gp_name:
        target = h5_anno.create_dataset(anno_dataset, data=h5_df)
    else:
        if anno_gp_name not in h5_anno.keys():
            group = h5_anno.create_group(anno_gp_name)
        else:
            group = h5_anno[anno_gp_name]
        target = group.create_dataset(anno_gp_dataset, data=h5_df)

    for key, categories in cate.items():
        target.attrs[key] = categories.values.astype(
            h5py.special_dtype(vlen=str))
    return
def test_get_with_library_large_number_of_values(self):
    """Check property names and values read from 'nodes_with_library_large.h5'."""
    h5_path = str(TEST_DATA_DIR / 'nodes_with_library_large.h5')
    test_obj = create_node_population(h5_path, "default")

    expected_names = {"categorical", "string", "int", "float"}
    assert test_obj.property_names == expected_names

    res = test_obj.get(
        properties=["categorical", "string", "int", "float"])

    categorical_column = res["categorical"]
    assert not is_categorical(categorical_column)
    assert categorical_column.tolist() == ['A', 'A', 'B', 'A']
    assert res["string"].tolist() == ["AA", "BB", "CC", "DD"]
    assert res["int"].tolist() == [0, 0, 1, 0]
    npt.assert_allclose(res["float"].tolist(), [0., 0., 1.1, 0.])
def cat_infreq(c, ordered=None):
    """
    Reorder categorical by frequency of the values

    Parameters
    ----------
    c : list-like
        Values that will make up the categorical.
    ordered : bool
        If ``True``, the categorical is ordered.

    Returns
    -------
    out : categorical
        Values

    Examples
    --------
    >>> x = ['d', 'a', 'b', 'b', 'c', 'c', 'c']
    >>> cat_infreq(x)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c, b, d, a]
    >>> cat_infreq(x, ordered=True)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c < b < d < a]

    When two or more values occur the same number of times, if the
    categorical is ordered, the order is preserved. If it is not
    ordered, the order depends on that of the values. Above 'd'
    comes before 'a', and below 'a' comes before 'd'.

    >>> c = pd.Categorical(
    ...     x, categories=['a', 'c', 'b', 'd']
    ... )
    >>> cat_infreq(c)
    [d, a, b, b, c, c, c]
    Categories (4, object): [c, b, a, d]
    >>> cat_infreq(c.set_ordered(True))
    [d, a, b, b, c, c, c]
    Categories (4, object): [c < b < a < d]
    """
    kwargs = {} if ordered is None else {'ordered': ordered}
    counts = value_counts(c)

    # `pdtypes.is_categorical` was removed in pandas 2.0; test the dtype
    if isinstance(getattr(c, 'dtype', None), pdtypes.CategoricalDtype):
        original_cat_order = c.categories
    else:
        # Wrapped in a Series because `pd.unique` on a plain list is
        # deprecated
        original_cat_order = pd.unique(pd.Series(c))

    # Put the counts in the original order before the stable sort so that
    # ties keep that order
    counts = counts.reindex(index=original_cat_order)
    cats = (_stable_series_sort(counts, ascending=False).index.to_list())
    return pd.Categorical(c, categories=cats, **kwargs)
def _is_categorical_and_is_datetime(column: pd.Series) -> bool:
    """Tell whether an object or categorical column holds datetime-like values.

    Returns ``False`` for columns that are neither object nor categorical
    (previously such a column raised ``UnboundLocalError``).
    """
    is_dt = False

    # check for datetime only if object cannot be cast as numeric because
    # if it could pd.to_datetime would convert it to datetime regardless
    if is_object(column):
        is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(
            column)

    # check for datetime only if the type of the categories is not numeric
    # because pd.to_datetime throws an error when it is an integer
    elif is_categorical(column):
        is_dt = not _is_categories_num(column) and _is_convertible_to_dt(
            column)

    return is_dt
def test_readwrite_zarr(typ, tmp_path):
    """Write an AnnData object (with ``raw`` set) to zarr and compare the result to the source."""
    zarr_dir = tmp_path / "test_zarr_dir"
    source = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict)
    source.raw = source
    # 'oanno1' is not categorical before the object is written
    assert not is_categorical(source.obs["oanno1"])
    source.write_zarr(zarr_dir, chunks=True)

    restored = ad.read_zarr(zarr_dir)

    # obs annotations after the round trip
    assert is_categorical(restored.obs["oanno1"])
    assert not is_categorical(restored.obs["oanno2"])
    assert restored.obs.index.tolist() == ["name1", "name2", "name3"]
    assert restored.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert is_categorical(restored.raw.var["vanno2"])

    # obs / var frames match the source
    assert np.all(restored.obs == source.obs)
    assert np.all(restored.var == source.var)
    assert np.all(restored.var.index == source.var.index)
    assert restored.var.index.dtype == source.var.index.dtype

    # raw keeps container type and values
    assert type(restored.raw.X) is type(source.raw.X)
    assert np.allclose(asarray(restored.raw.X), asarray(source.raw.X))
    assert np.all(restored.raw.var == source.raw.var)

    # nested uns entries keep their types
    assert isinstance(restored.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(source.uns["uns4"]["a"], (int, np.integer))
    assert type(restored.uns["uns4"]["c"]) is type(source.uns["uns4"]["c"])

    assert_equal(restored, source)
def plot_descriptive_graphs_for_column(self, dataframe, column,
                                       outliers_ind=None, show=False):
    """
    Draw every graph type configured for the column's data category.

    :param dataframe: Input dataframe
    :param column: Column to plot
    :param outliers_ind: Boolean array for indexing outliers
    :param show: If to show graphs when running the code
    :return: Mapping of graph type to the axes it was drawn on; empty when
        the column's dtype is not numerical, categorical or datetime
    """
    graph_type_to_graph = {}
    data_to_plot = dataframe[column]
    if is_numeric_dtype(dataframe[column]):
        cat = 'numerical'
        if isinstance(outliers_ind, np.ndarray):
            # Outliers are excluded from numerical plots
            data_to_plot = dataframe.loc[~outliers_ind, column]
    elif is_categorical(dataframe[column]):
        cat = 'categorical'
        data_to_plot = dataframe[column].value_counts()
    elif is_datetime64_any_dtype(
            dataframe[column]) or is_datetime64tz_dtype(dataframe[column]):
        cat = 'datetime'
        data_to_plot = pd.Series(dates.date2num(data_to_plot))
    else:
        self.logger.warning(
            '''Column "{}" could not be plotted because of {} (generic) type. Please convert it to a categorical, date or numerical type. String columns cannot be plotted!'''
            .format(column, dataframe[column].dtype))
        return {}

    n_graphs = len(self.graph_per_category[cat])
    n_rows = int(np.ceil(n_graphs / 2))
    # A ceil(n/2) x ceil(n/2) grid has room for every graph except when
    # n == 2, where it is 1x1; use a second column in that case.
    n_cols = 2 if n_graphs == 2 else n_rows
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=0)
    fig.suptitle(column, fontsize='large')
    fig.tight_layout()
    plt.subplots_adjust(top=0.82)
    axes = axes.flatten()

    for i, graph in enumerate(self.graph_per_category[cat]):
        data_to_plot.plot(kind=graph, ax=axes[i])
        axes[i].set_title(graph)
        if cat == 'datetime':
            # Datetimes were converted to numbers above; format ticks back
            axes[i].xaxis.set_major_formatter(self._num_to_date)
        graph_type_to_graph[graph] = axes[i]

    if show:
        plt.show(block=False)

    return graph_type_to_graph
def label_prob(self, tup, label):
    '''
    Give the probability of the label given tuple `tup'.

    The share of `label` in `self.label_df` is multiplied by
    `self.probability(feature, value, label)` for every categorical column
    of `self._df` other than the label column.
    '''
    prior_prob = self.label_df.value_counts()[label] / len(self._df)

    likelihoods = []
    for feature, value in zip(self._df, tup):
        # The label column and non-categorical columns do not contribute
        if feature == self._label_cls:
            continue
        if not is_categorical(self._df[feature]):
            continue
        likelihoods.append(self.probability(feature, value, label))

    return prior_prob * reduce(mul, likelihoods)
def _normalize_column(self, data, coerce_dtype=None, store_categories=True): """ Make column suitable for HDF5 storage. * numerical and boolean types map as they should * bytes colunms map to type S arrays -- they won't roundtrip * str or object columns map to type S arrays * categorical columns: * make an ENUM type (may end up being too large for HDF5 to accept) """ if coerce_dtype is not None: coerce_dtype = np.dtype(coerce_dtype) if np.isscalar(data): array = np.array([data], dtype=coerce_dtype) dtype = data.dtype fillvalue = None elif is_categorical(data): if store_categories: cats = data.cat.categories enum_dict = dict(zip(cats, range(len(cats)))) array = data.cat.codes dtype = h5py.special_dtype(enum=(array.dtype, enum_dict)) fillvalue = -1 else: array = data.cat.codes dtype = coerce_dtype or array.dtype fillvalue = -1 elif data.dtype in (object, str, bytes): data = np.asarray(data) dtype = np.dtype("S") array = np.array(data, dtype=dtype) fillvalue = None else: array = np.asarray(data) dtype = data.dtype fillvalue = None return array, dtype, fillvalue
def ordinal(arr):
    """
    Return True if array is an ordered categorical

    Parameters
    ----------
    arr : array-like
        Must have a dtype

    Returns
    -------
    out : bool
        Whether array `arr` is an ordered categorical
    """
    # `pdtypes.is_categorical` was removed in pandas 2.0. Reading the flag
    # from the dtype also works for objects without a `.cat` accessor.
    dtype = getattr(arr, 'dtype', None)
    if isinstance(dtype, pdtypes.CategoricalDtype):
        return bool(dtype.ordered)
    return False
def category_product(cats: AnyPandas) -> Series:
    """
    Combine several categorical columns into a single categorical.

    The codes of each column are packed into disjoint bit ranges of one
    integer, so every observed combination of categories gets its own value.

    Parameters
    ----------
    cats : {Series, DataFrame}
        DataFrame containing categorical variables.  If cats is a Series, cats
        is returned unmodified.

    Returns
    -------
    Series
        Categorical series built from the packed codes, indexed like cats.

    Raises
    ------
    TypeError
        If any column is not categorical.
    ValueError
        If the packed codes need 63 bits or more.
    """
    if isinstance(cats, Series):
        return cats

    # Number of bits reserved for each column's codes
    bit_widths = []
    for name in cats:
        if not is_categorical(cats[name]):
            raise TypeError("cats must contain only categorical variables")
        largest_code = get_codes(cats[name].cat).max()
        width = 1
        while largest_code >= 2**width:
            width += 1
        bit_widths.append(width)

    total_size = sum(bit_widths)
    if total_size >= 63:
        raise ValueError(
            "There are too many cats with too many states to use this method.")

    # Smallest integer width that can hold all the bits
    dtype_size = min(v for v in (8, 16, 32, 64) if total_size < (v - 1))
    dtype_str = "int%d" % dtype_size
    dtype_val = dtype(dtype_str)
    to_scalar = SCALAR_DTYPES[dtype_str]

    codes = zeros(cats.shape[0], dtype=dtype_val)
    offset = 0
    for name, width in zip(cats, bit_widths):
        # Shift this column's codes past the bits already used
        codes += get_codes(cats[name].cat).astype(dtype_val) << to_scalar(offset)
        offset += width

    return Series(Categorical(codes), index=cats.index)
def column_is_categorical(self, col):
    '''
    Tell whether a column of self.data has object, boolean or categorical dtype

    Parameters
    ----------
    col : str
        column to check

    Returns
    -------
    flag : bool

    Raises
    ------
    ValueError
        If the column is not present in self.data
    '''
    if col not in self.data.columns:
        message = '{} is not present in the data'.format(col)
        log.error(message)
        raise ValueError(message)

    col_dtype = self.data[col].dtypes
    return (col_dtype == np.dtype('O')) \
        or is_bool_dtype(col_dtype) \
        or is_categorical(col_dtype)
def cat_remove_unused(c, only=None): """ Remove unused categories Parameters ---------- c : list-like Values that will make up the categorical. only : list-like (optional) The categories to remove *if* they are empty. If not given, all unused categories are dropped. Examples -------- >>> c = pd.Categorical(list('abcdd'), categories=list('bacdefg')) >>> c [a, b, c, d, d] Categories (7, object): [b, a, c, d, e, f, g] >>> cat_remove_unused(c) [a, b, c, d, d] Categories (4, object): [b, a, c, d] >>> cat_remove_unused(c, only=['a', 'e', 'g']) [a, b, c, d, d] Categories (5, object): [b, a, c, d, f] """ if not pdtypes.is_categorical(c): # All categories are used c = pd.Categorical(c) return c else: c = c.copy() if only is None: only = c.categories used_idx = pd.unique(c.codes) used_categories = c.categories[used_idx] c = c.remove_categories( c.categories.difference(used_categories).intersection(only)) return c
def from_frame(frame: DataFrame) -> "Interaction":
    """
    Convenience function that simplifies using a DataFrame

    Parameters
    ----------
    frame : DataFrame
        Frame containing categorical and continuous variables. All
        categorical variables are passed to `cat` and all other
        variables are passed as `cont`.

    Returns
    -------
    Interaction
        Instance using the columns of frame

    Examples
    --------
    >>> import numpy as np
    >>> from linearmodels.iv.absorbing import Interaction
    >>> import pandas as pd
    >>> rs = np.random.RandomState(0)
    >>> n = 100000
    >>> cats = pd.concat([pd.Series(pd.Categorical(rs.randint(i+2,size=n)))
    ...                  for i in range(4)],1)
    >>> cats.columns = ['cat{0}'.format(i) for i in range(4)]
    >>> columns = ['cont{0}'.format(i) for i in range(6)]
    >>> cont = pd.DataFrame(rs.standard_normal((n, 6)), columns=columns)
    >>> frame = pd.concat([cats, cont], 1)
    >>> interact = Interaction.from_frame(frame)
    >>> interact.sparse.shape # Cart product of all cats, 5!, times ncont, 6
    (100000, 720)
    """
    categorical = []
    for name in frame:
        if is_categorical(frame[name]):
            categorical.append(name)
    # Everything that was not selected above is treated as continuous
    continuous = [name for name in frame if name not in categorical]

    return Interaction(frame[categorical], frame[continuous],
                       nobs=frame.shape[0])
def cat_zip(*args, sep=':', keep_empty=False):
    """
    Create a new categorical (zip style) combined from two or more

    Parameters
    ----------
    *args : tuple
        Categoricals to be concatenated.
    sep : str (default: ':')
        Separator for the combined categories.
    keep_empty : bool (default: False)
        If ``True``, include all combinations of categories
        even those without observations.

    Examples
    --------
    >>> c1 = pd.Categorical(list('aba'))
    >>> c2 = pd.Categorical(list('122'))
    >>> cat_zip(c1, c2)
    [a:1, b:2, a:2]
    Categories (3, object): [a:1, a:2, b:2]
    >>> cat_zip(c1, c2, keep_empty=True)
    [a:1, b:2, a:2]
    Categories (4, object): [a:1, a:2, b:1, b:2]
    """
    values = [sep.join(items) for items in zip(*args)]
    # `pdtypes.is_categorical` was removed in pandas 2.0; test the dtype
    cs = [
        c if isinstance(getattr(c, 'dtype', None), pdtypes.CategoricalDtype)
        else pd.Categorical(c)
        for c in args
    ]
    # Every combination of the inputs' categories, in product order
    categories = [
        sep.join(items) for items in product(*(c.categories for c in cs))
    ]

    c = pd.Categorical(values, categories=categories)

    if not keep_empty:
        # The `inplace` argument was removed in pandas 2.0; rebind instead
        c = c.remove_unused_categories()

    return c
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.

    A non-range index is turned into regular column(s) first. Columns map to
    variables by dtype: categorical -> DiscreteVariable, datetime ->
    TimeVariable, object -> StringVariable, integer / other numeric ->
    ContinuousVariable. Columns of any other dtype are skipped with a
    UserWarning. Primitive variables form ``X``; the rest become metas.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        # `pdtypes.is_categorical` was removed in pandas 2.0; test the dtype
        if isinstance(series.dtype, pdtypes.CategoricalDtype):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            # `np.float` was removed in NumPy 1.24; it was an alias of the
            # builtin float, i.e. float64
            orangecol = np.array(codes, dtype=np.float64)
            # Negative codes mark missing values
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            # Nanoseconds to seconds
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=np.float64)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)