def test_data_characters_types():
    from pandas.api.types import is_object_dtype
    from pandas.api.types import is_float_dtype

    las = lasio.read(egfn('data_characters.las'))
    assert is_object_dtype(las.df().index.dtype)
    assert is_object_dtype(las.df()['DATE'].dtype)
    assert is_float_dtype(las.df()['DEPT'].dtype)
    assert is_float_dtype(las.df()['ARC_GR_UNC_RT'].dtype)
def _write_header(data, fp, relation_name, index):
    """Write header containing attribute names and types"""
    fp.write("@relation {0}\n\n".format(relation_name))

    if index:
        data = data.reset_index()

    attribute_names = _sanitize_column_names(data)

    for column, series in data.iteritems():
        name = attribute_names[column]
        fp.write("@attribute {0}\t".format(name))

        if is_categorical_dtype(series) or is_object_dtype(series):
            _write_attribute_categorical(series, fp)
        elif numpy.issubdtype(series.dtype, numpy.floating):
            fp.write("real")
        elif numpy.issubdtype(series.dtype, numpy.integer):
            fp.write("integer")
        elif numpy.issubdtype(series.dtype, numpy.datetime64):
            fp.write("date 'yyyy-MM-dd HH:mm:ss'")
        else:
            raise TypeError('unsupported type %s' % series.dtype)
        fp.write("\n")

    return data
def _checkColumnTypes(df, cols, column_mapping):
    """
    Checks that each dataframe column listed in cols has Pandas dtype "Object".

    Parameters
    ----------
    df : `~pandas.DataFrame`
        Pandas dataframe
    cols : list
        Columns to check for appropriate data type.
    column_mapping : dict
        Column name mapping to internally used column names
        (truth, linkage_id, obs_id).

    Raises
    ------
    TypeError : If any column is not of type "Object" or String.

    Returns
    -------
    None
    """
    error_text = ""
    for col in cols:
        value = column_mapping[col]
        if not is_object_dtype(df[value].dtype):
            error = "\n{1} column ('{0}') should have type string. " \
                    "Please convert column using: \n" \
                    "dataframe['{0}'] = dataframe['{0}'].astype(str)\n"
            error = error.format(value, col)
            error_text += error

    if len(error_text) > 0:
        raise TypeError(error_text)

    return
def astype(self, dtype, copy=True):
    """Cast to a NumPy array with 'dtype'.

    Parameters
    ----------
    dtype : str or dtype
        Typecode or data-type to which the array is cast.
    copy : bool, default True
        Whether to copy the data, even if not necessary. If False,
        a copy is made only if the old dtype does not match the
        new dtype.

    Returns
    -------
    array : ndarray
        NumPy ndarray with 'dtype' for its dtype.
    """
    if isinstance(dtype, str) and (dtype.startswith("Pint[")
                                   or dtype.startswith("pint[")):
        dtype = PintType(dtype)
    if isinstance(dtype, PintType):
        if dtype == self._dtype and not copy:
            return self
        else:
            return PintArray(self.quantity.to(dtype.units).magnitude, dtype)

    # do *not* delegate to __array__ -> is required to return a numpy array,
    # but somebody may be requesting another pandas array
    # examples are e.g. PyArrow arrays as requested by "string[pyarrow]"
    if is_object_dtype(dtype):
        return self._to_array_of_quantity(copy=copy)
    if is_string_dtype(dtype):
        return pd.array([str(x) for x in self.quantity], dtype=dtype)
    return pd.array(self.quantity, dtype, copy)
def find_atoms(self, data: pd.DataFrame):
    """
    Find the numeric atoms and categorical levels to be modeled.
    """
    self._dtypes = data.dtypes
    atoms_dict = {}
    levels_dict = {}
    for i in range(data.shape[1]):
        vname = data.columns[i]
        dt = self._dtypes[i]
        if is_numeric_dtype(dt):
            variable = data.iloc[:, i]
            counts = variable.value_counts().sort_values(ascending=False)
            number_observed = counts.sum()
            atom_indicator = counts > 0.05 * number_observed
            atoms = counts[atom_indicator].index.tolist()
            if len(atoms) > 3:
                atoms = atoms[:3]
            atoms_dict[vname] = atoms
            self._numeric_colnames.append(data.columns[i])
        elif (is_categorical_dtype(dt) or is_object_dtype(dt)
              or is_bool_dtype(dt)):
            # TODO: put in some cardinality protections.
            levels = data.iloc[:, i].value_counts()
            levels_dict[vname] = levels
            self._categorical_colnames.append(data.columns[i])
        else:
            raise Exception(
                "Only categorical or numeric types are supported.")
    return atoms_dict, levels_dict
def to_data_table(data: pd.DataFrame):
    """
    Create a BOOM DataTable object from a pandas DataFrame.  The categories
    of any categorical variables will be handled as strings.
    """
    dtypes = data.dtypes
    ans = boom.DataTable()
    for i in range(data.shape[1]):
        dt = dtypes[i]
        vname = data.columns[i]
        if is_numeric_dtype(dt) or is_bool_dtype(dt):
            ans.add_numeric(
                boom.Vector(data.iloc[:, i].values.astype("float")), vname)
        elif is_categorical_dtype(dt):
            x = data.iloc[:, i]
            values = x.cat.codes
            codes = x.cat.categories
            ans.add_categorical(values, codes, vname)
        elif is_object_dtype(dt):
            labels = data.iloc[:, i].astype("str")
            ans.add_categorical_from_labels(labels.values, vname)
        else:
            raise Exception(
                f"Only numeric or categorical data are supported.  "
                f"Column {i} ({data.columns[i]}) has dtype {dt}.")
    return ans
def _sanitize_anndata(adata: AnnData) -> None:
    """Sanitization and sanity checks on IR-anndata object.

    Should be executed by every read_xxx function."""
    assert len(adata.X.shape) == 2, \
        "X needs to have dimensions, otherwise concat doesn't work. "

    # Pending updates to anndata to properly handle boolean columns.
    # For now, let's turn them into a categorical with "True/False"
    BOOLEAN_COLS = ("has_ir", "is_cell", "multi_chain", "high_confidence",
                    "productive")

    # explicitly convert those to categoricals. All IR_ columns that are
    # strings will be converted to categoricals, too
    CATEGORICAL_COLS = ("extra_chains",)

    # Sanitize has_ir column into categorical
    # This should always be a categorical with True / False
    for col in adata.obs.columns:
        if col.endswith(BOOLEAN_COLS):
            adata.obs[col] = pd.Categorical(
                [
                    "True" if _is_true2(x)
                    else "False" if _is_false2(x)
                    else "None"
                    for x in adata.obs[col]
                ],
                categories=["True", "False", "None"],
            )
        elif col.endswith(CATEGORICAL_COLS) or (
                col.startswith("IR_") and is_object_dtype(adata.obs[col])):
            # Turn all IR_VJ columns that are of type string or object to
            # categoricals, otherwise saving anndata doesn't work.
            adata.obs[col] = pd.Categorical(adata.obs[col])

    adata.strings_to_categoricals()
def _check_Xy(X: pd.DataFrame, y: pd.Series, *,
              norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)
    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)

    valid = ~X.isnull().any(1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]
    if is_object_dtype(y):
        y = pd.Categorical(y)
    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()
    return X, y
def get_object_dtypes(self, dtypes_validated: TYPE_DSTR) -> TYPE_DSTR:
    """Inspect all columns of dtype object and ensure no mixed dtypes are
    present. Raises a TypeError otherwise. Ignores columns for which dtypes
    are already explicitly set.

    Parameters
    ----------
    dtypes_validated: dict
        Represents already given column/dtype pairs. Keys refer to column
        names and values represent dtypes.

    Returns
    -------
    dtypes_object: dict
        Keys refer to column names and values represent dtypes.

    """
    dtypes_object = {}

    for column in self.df.columns:
        if column in dtypes_validated:
            continue

        if pd_types.is_object_dtype(self.df[column]):
            dtypes_object[column] = self.inspect_dtype_object(column)

    return dtypes_object
def series_is_boolean(col: Union[pd.Series, pd.Index]):
    """
    returns:
        None if column is all None;
        True if a pd.Series only contains True, False, and None;
        False otherwise

    caveat: does not interpret all-zero or all-one columns as boolean
    """
    if len(col.unique()) == 1 and col.unique()[0] is None:
        # return None for all-None columns
        return None
    elif col.isna().all():
        return None
    elif is_bool_dtype(col):
        return True
    elif is_object_dtype(col):
        for val in col.unique():
            if val not in [True, False, None]:
                return False
        if not (False in col.unique() and True in col.unique()):
            return False
        return True
    elif is_integer_dtype(col) or is_float_dtype(col):
        for val in col.unique():
            if pd.isna(val):
                continue
            if val not in [1, 0, None]:
                return False
        if not (0 in col.unique() and 1 in col.unique()):
            return False
        return True
    return False
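# Usage sketch for series_is_boolean above (hypothetical data, not from the
# original source). The function references the pandas.api.types predicates
# as free names, so a standalone run needs these imports in scope:
import pandas as pd
from pandas.api.types import (is_bool_dtype, is_object_dtype,
                              is_integer_dtype, is_float_dtype)

assert series_is_boolean(pd.Series([True, False, None])) is True  # object col of bools
assert series_is_boolean(pd.Series([1, 0, 1, 0])) is True         # 0/1 integer col
assert series_is_boolean(pd.Series([1, 2, 3])) is False           # plain integers
assert series_is_boolean(pd.Series([None, None])) is None         # all-None col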
def test_upload_pandas_categorical_ipc(self, con):
    con.execute("DROP TABLE IF EXISTS test_categorical;")
    df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
    df["B"] = df["A"].astype('category')

    # test that table created correctly when it doesn't exist on server
    con.load_table("test_categorical", df)
    ans = con.execute("select * from test_categorical").fetchall()
    assert ans == [('a', 'a'), ('b', 'b'), ('c', 'c'), ('a', 'a')]

    assert con.get_table_details("test_categorical") == [
        ColumnDetails(
            name='A',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
        ColumnDetails(
            name='B',
            type='STR',
            nullable=True,
            precision=0,
            scale=0,
            comp_param=32,
            encoding='DICT',
            is_array=False,
        ),
    ]

    # load row-wise
    con.load_table("test_categorical", df, method="rows")

    # load columnar
    con.load_table("test_categorical", df, method="columnar")

    # load arrow
    con.load_table("test_categorical", df, method="arrow")

    # test end result
    df_ipc = con.select_ipc("select * from test_categorical")
    assert df_ipc.shape == (16, 2)

    res = df.append([df, df, df]).reset_index(drop=True)
    res["A"] = res["A"].astype('category')
    res["B"] = res["B"].astype('category')
    assert pd.DataFrame.equals(df_ipc, res)

    # test that input df wasn't mutated
    # original input is object, categorical
    # to load via Arrow, converted internally to object, object
    assert is_object_dtype(df["A"])
    assert is_categorical_dtype(df["B"])

    con.execute("DROP TABLE IF EXISTS test_categorical;")
def df_string_to_cat(df: pd.DataFrame) -> dict:
    catencoders = {}
    for colname in df.columns:
        if is_string_dtype(df[colname]) or is_object_dtype(df[colname]):
            df[colname] = df[colname].astype('category').cat.as_ordered()
            catencoders[colname] = df[colname].cat.categories
    return catencoders
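# Usage sketch for df_string_to_cat (hypothetical frame, not from the
# original source): string columns are converted in place to ordered
# categoricals and the per-column category index is returned for later
# re-encoding of test data.
import pandas as pd
from pandas.api.types import is_string_dtype, is_object_dtype

df_demo = pd.DataFrame({"color": ["red", "blue", "red"], "n": [1, 2, 3]})
catencoders = df_string_to_cat(df_demo)
assert str(df_demo["color"].dtype) == "category"      # mutated in place
assert list(catencoders["color"]) == ["blue", "red"]  # sorted unique values
assert df_demo["n"].dtype.kind == "i"                 # numeric column untouched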
def string_contains(series: pd.Series, state: dict) -> bool:
    if pdt.is_categorical_dtype(series):
        return False
    elif not pdt.is_object_dtype(series):
        return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)
    return _is_string(series, state)
def explode(df):
    """
    Based on this answer:
    https://stackoverflow.com/questions/12680754/split-explode-pandas\
-dataframe-string-entry-to-separate-rows/40449726#40449726
    """
    if df is None or df.empty:
        return df

    # get the list columns
    lst_cols = [col for col, dtype in df.dtypes.items()
                if is_object_dtype(dtype)]
    # Be more specific about which objects are ok
    lst_cols = [col for col in lst_cols
                if isinstance(df[col].iloc[0], _explodable_types)]
    if not lst_cols:
        return df

    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # check all lists have same length
    lens = pd.DataFrame({col: df[col].str.len() for col in lst_cols})
    different_length = (lens.nunique(axis=1) > 1).any()
    if different_length:
        raise ValueError("Cannot bin multiple arrays with different jaggedness")
    lens = lens[lst_cols[0]]

    # create "exploded" DF
    flattened = {col: df.loc[lens > 0, col].values for col in lst_cols}
    flattened = {col: sum(map(list, vals), [])
                 for col, vals in flattened.items()}
    res = pd.DataFrame({col: np.repeat(df[col].values, lens)
                        for col in idx_cols})
    res = res.assign(**flattened)

    # Check that rows are fully "exploded"
    return explode(res)
def analyze_cat(df, save_pic=True, visual=True, path='') -> None:
    '''
    :Description: This function plots the normalized frequencies of the top
        25 values of each non-numeric variable in a bar chart.

    :param df: The data to be investigated.
    :type df: pandas data frame
    :param save_pic: if the user wants to save the results. default is True
        to save the results as a png file.
    :type save_pic: bool
    :param visual: whether to produce the plots at all.
    :type visual: bool
    :param path: the path to save the plot in.
    :type path: str

    :return: None, a plot.
    '''
    if visual:
        print("******************Plotting for non-numeric variables"
              "*********************")
        obj_cols = [
            cols for cols in df.columns
            if is_object_dtype(df[cols]) and len(df[cols].dropna()) > 0
        ]
        print('These are non numeric columns\n', obj_cols)
        # For each object column in the list
        for x, col_name in enumerate(obj_cols):
            # print(x + 1, " of ", iter_len, " completed ", col_name)
            values_freq_threshold = 25
            # Store the relative frequencies of the unique values
            # (normalize=True for counts as fractions)
            col_unique_vals = df[col_name].value_counts(normalize=True,
                                                        sort=True)
            # generate a data frame from the normalized count data
            f = pd.DataFrame(
                np.array(col_unique_vals.head(values_freq_threshold)
                         .reset_index()),
                columns=['Values', 'Count'])
            # Plot the graphs
            fig, ax = plt.subplots(figsize=(17, 9), constrained_layout=True)
            fig.suptitle("Profile of column " + str(col_name).strip(),
                         fontsize=25)
            ax.bar(f.Values, f.Count, color=perc_color[1])
            ax.set_title("Normalized bar chart for top 25 values")
            plt.xticks(rotation=90)
            ax.set_ylabel('Count')
            ax.set_xlabel('Values')
            for p in ax.patches:
                ax.annotate(str(round(p.get_height(), 2)),
                            (p.get_x(), p.get_height() * 1.01))
            fig_name = path + 'EDA_' + str(col_name).strip() + '.png'
            if save_pic:
                fig.savefig(fig_name, dpi=100)
            plt.show()
            plt.close(fig)
    else:
        print('****************Nothing will be plotted******************')
def fillna(data):
    # Drop the hepatitis B panel columns (surface antigen/antibody,
    # e antigen/antibody, core antibody).
    data = data.drop(columns=['乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原',
                              '乙肝e抗体', '乙肝核心抗体'])
    # '性别' = sex: map '男' (male) to 0 and '女' (female) to 1.
    if is_object_dtype(data['性别']):
        data['性别'] = data['性别'].map({'男': 0, '女': 1})
    # Features are everything except id, exam date ('体检日期') and
    # blood glucose ('血糖').
    feature_col = [column for column in data.columns
                   if column not in ['id', '体检日期', '血糖']]
    # feature_min = data[feature_col].min()
    # feature_max = data[feature_col].max()
    # scaled_feature = (data[feature_col] - feature_min) / (feature_max - feature_min)
    # data.loc[:, feature_col] = scaled_feature.values
    columns_na = data.columns[data.isna().sum() > 0]
    complete_sample = data.loc[data.isna().sum(axis=1) == 0, :]
    incomplete_sample = data.loc[data.isna().sum(axis=1) > 0, :]
    params = {
        'objective': 'regression',
        'boosting': 'rf',
        'learning_rate': 0.01,
        'num_leaves': 15,
        'num_threads': multiprocessing.cpu_count() // 2,
        'min_data_in_leaf': 50,
        'min_sum_hessian_in_leaf': 1e-2,
        'feature_fraction': 0.7,
        'feature_fraction_seed': 2018,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'bagging_seed': 2018,
        'tree_learner': 'feature',
        'verbose': -1,
        'metric': 'mse',
    }
    kf = KFold(n_splits=5, shuffle=True, random_state=2018)
    for target in columns_na:
        # Train on complete rows, predicting the target column from the
        # remaining features. Note: '!=' (value comparison) rather than
        # 'is not', and the same reduced feature list must be used for
        # prediction, otherwise the feature counts would not match.
        predictors = [column for column in feature_col if column != target]
        X = complete_sample.loc[:, predictors]
        y = complete_sample.loc[:, target]
        na_sample_idxer = incomplete_sample[target].isna()
        XTest = incomplete_sample.loc[na_sample_idxer, predictors].values
        result_to_fill = np.zeros((XTest.shape[0], 5))
        for cv_idx, (train_idx, valid_idx) in enumerate(kf.split(X)):
            train_set = lgb.Dataset(X.iloc[train_idx], label=y.iloc[train_idx])
            valid_set = lgb.Dataset(X.iloc[valid_idx], label=y.iloc[valid_idx])
            gbm = lgb.train(params, train_set, num_boost_round=3000,
                            categorical_feature=['性别'],
                            valid_sets=valid_set, valid_names='valid',
                            early_stopping_rounds=100, verbose_eval=False)
            result_to_fill[:, cv_idx] = gbm.predict(
                XTest, num_iteration=gbm.best_iteration)
        incomplete_sample.loc[na_sample_idxer, target] = result_to_fill.mean(axis=1)
    data = pd.concat([complete_sample, incomplete_sample])
    # inverse_values = data[feature_col]*(feature_max - feature_min) + feature_min
    # data.loc[:, feature_col] = inverse_values
    return data
def to_pandas_time_index(
    time: Union[
        pint.Quantity,
        np.ndarray,
        pd.TimedeltaIndex,
        pd.DatetimeIndex,
        xr.DataArray,
        "tf.LocalCoordinateSystem",
    ],
) -> Union[pd.TimedeltaIndex, pd.DatetimeIndex]:
    """Convert a time variable to the corresponding pandas time index type.

    Parameters
    ----------
    time :
        Variable that should be converted.

    Returns
    -------
    Union[pandas.TimedeltaIndex, pandas.DatetimeIndex] :
        Time union of all input objects

    """
    from weldx.transformations import LocalCoordinateSystem

    _input_type = type(time)

    if isinstance(time, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        return time

    if isinstance(time, LocalCoordinateSystem):
        return to_pandas_time_index(time.time)

    if isinstance(time, pint.Quantity):
        base = "s"  # using low base unit could cause rounding errors
        if not np.iterable(time):  # catch zero-dim arrays
            time = np.expand_dims(time, 0)
        return pd.TimedeltaIndex(data=time.to(base).magnitude, unit=base)

    if isinstance(time, (xr.DataArray, xr.Dataset)):
        if "time" in time.coords:
            time = time.time
        time_index = pd.Index(time.values)
        if is_timedelta64_dtype(time_index) and time.weldx.time_ref:
            time_index = time_index + time.weldx.time_ref
        return time_index

    if not np.iterable(time) or isinstance(time, str):
        time = [time]

    time = pd.Index(time)

    if isinstance(time, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        return time

    # try manual casting for object dtypes (i.e. strings), should avoid integers
    # warning: this allows something like ["1","2","3"] which will be ns !!
    if is_object_dtype(time):
        for func in (pd.DatetimeIndex, pd.TimedeltaIndex):
            try:
                return func(time)
            except (ValueError, TypeError):
                continue

    raise TypeError(f"Could not convert {_input_type} "
                    f"to pd.DatetimeIndex or pd.TimedeltaIndex")
def generate_missing_value_indicator(df: pd.DataFrame, columns: list,
                                     fill_value="NA"):
    """Fill any na values in columns with fill_value."""
    for column in columns:
        if not is_object_dtype(df[column]):
            print("skipping non-object column {}".format(column))
        else:
            df.loc[:, column] = df[column].fillna(fill_value)
    return df
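# Usage sketch for generate_missing_value_indicator (hypothetical data):
# only object columns are filled with the sentinel; non-object columns
# are reported and left as-is.
import pandas as pd
from pandas.api.types import is_object_dtype

frame = pd.DataFrame({"city": ["Oslo", None], "temp": [1.5, None]})
frame = generate_missing_value_indicator(frame, ["city", "temp"])
assert frame["city"].tolist() == ["Oslo", "NA"]
assert pd.isna(frame["temp"].iloc[1])  # numeric column skipped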
def contains_op(cls, series: pd.Series, state: dict) -> bool:
    if pdt.is_object_dtype(series):
        try:
            return series.isin({True, False}).all()
        except Exception:
            return False

    return pdt.is_bool_dtype(series)
def contains_op(cls, series: pd.Series, state: dict) -> bool:
    # TODO: without the object check this passes string categories...
    #  is there a better way?
    if pdt.is_categorical_dtype(series):
        return False
    elif not pdt.is_object_dtype(series):
        return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)
    return series_is_string(series)
def _dtype_represents_categories(series) -> bool:
    "Determines if the dtype of the series represents categorical values"
    return (
        is_bool_dtype(series)
        or is_object_dtype(series)
        or is_string_dtype(series)
        or is_categorical_dtype(series)
    )
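# Usage sketch for _dtype_represents_categories: bool, object, string and
# categorical series all count as "categorical-like"; plain numerics do not.
# The imports below are what the free names in the predicate presumably
# refer to.
import pandas as pd
from pandas.api.types import (is_bool_dtype, is_object_dtype,
                              is_string_dtype, is_categorical_dtype)

assert _dtype_represents_categories(pd.Series(["a", "b"]))
assert _dtype_represents_categories(pd.Series([True, False]))
assert _dtype_represents_categories(pd.Series(["a", "b"]).astype("category"))
assert not _dtype_represents_categories(pd.Series([1.0, 2.0]))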
def fit_transform(self, X, y=None, **fit_params):
    # preserve mlm_dtypes if it exists
    try:
        self.meta_mlm_dtypes = X.mlm_dtypes
        self.no_meta_mlm_dtypes = False
    except AttributeError:
        self.no_meta_mlm_dtypes = True

    self._validate_transformers()
    result = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_transform_one)(
            transformer=trans, X=X, y=y, weight=weight, **fit_params)
        for name, trans, weight in self._iter())

    if not result:
        # All transformers are None
        return np.zeros((X.shape[0], 0))

    Xs, transformers = zip(*result)
    self._update_transformer_list(transformers)

    if any(sparse.issparse(f) for f in Xs):
        Xs = sparse.hstack(Xs).tocsr()
    else:
        Xs = self.merge_dataframes_by_column(Xs)

    if not self.no_meta_mlm_dtypes:
        Xs = Xs.loc[:, ~Xs.columns.duplicated()]
        Xs = PreserveMetaData(Xs)
        Xs.mlm_dtypes = self.meta_mlm_dtypes

        # reset dtype for any columns that were turned into object columns
        for mlm_dtype in Xs.mlm_dtypes.keys():
            for column in Xs.mlm_dtypes[mlm_dtype]:
                try:
                    if is_object_dtype(Xs[column]):
                        if mlm_dtype == "boolean":
                            Xs[column] = Xs[column].astype("boolean")
                        elif mlm_dtype == "continuous":
                            Xs[column] = Xs[column].astype("float64")
                        elif mlm_dtype == "category":
                            Xs[column] = Xs[column].astype("category")
                        elif mlm_dtype == "count":
                            Xs[column] = Xs[column].astype("int64")
                        elif mlm_dtype == "date":
                            Xs[column] = Xs[column].astype("datetime64[ns]")
                        elif mlm_dtype == "nominal":
                            Xs[column] = Xs[column].astype("category")
                        elif mlm_dtype == "ordinal":
                            Xs[column] = Xs[column].astype("category")
                except KeyError:
                    continue

    return Xs
def contains_op(series: pd.Series, state: dict) -> bool:
    is_valid_dtype = (pdt.is_categorical_dtype(series)
                      and not pdt.is_bool_dtype(series))
    if is_valid_dtype:
        return True
    elif not pdt.is_object_dtype(series):
        return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)
    return series_is_string(series, state)
def getFeatureCategorical(data):
    import pandas.api.types as types
    import Tools
    feature_categorical = []
    for column in list(data.columns):
        if types.is_object_dtype(data[column]):
            feature_categorical.append(column)
    return feature_categorical
def contains_op(cls, series: pd.Series) -> bool:
    is_object = pdt.is_object_dtype(series)
    if is_object:
        ret = True
    elif pandas_has_string_dtype_flag:
        ret = pdt.is_string_dtype(series) and not pdt.is_categorical_dtype(series)
    else:
        ret = False
    return ret
def astype(self, dtype, copy=True):
    if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
        if copy:
            return self.copy()
        return self
    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
        # numpy has problems with astype(str) for nested elements
        return np.array([str(x) for x in self.data], dtype=dtype)
    return np.array(self.data, dtype=dtype, copy=copy)
def df_normalize_strings(df):
    for col in df.columns:
        if is_string_dtype(df[col]) or is_object_dtype(df[col]):
            df[col] = df[col].str.lower()
            df[col] = df[col].fillna(np.nan)  # make None -> np.nan
            df[col] = df[col].replace('none or unspecified', np.nan)
            df[col] = df[col].replace('none', np.nan)
            df[col] = df[col].replace('#name?', np.nan)
            df[col] = df[col].replace('', np.nan)
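# Usage sketch for df_normalize_strings (hypothetical frame): lower-cases
# every string/object column in place and maps the sentinel strings to NaN.
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_object_dtype

frame = pd.DataFrame({"answer": ["None", "YES", ""]})
df_normalize_strings(frame)
assert frame["answer"].iloc[1] == "yes"
assert pd.isna(frame["answer"].iloc[0]) and pd.isna(frame["answer"].iloc[2])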
def format_missings(df):
    for column in df.columns:
        if is_numeric_dtype(df[column]):
            fill_value = df[column].mean()
            df[column] = df[column].fillna(fill_value, downcast=False)
        elif is_object_dtype(df[column]) or is_string_dtype(df[column]):
            df[column] = df[column].fillna('MISSING', downcast=False)
    print("Shape after format_missing:", df.shape)
    return df
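# Usage sketch for format_missings: numeric NaNs become the column mean,
# string/object NaNs the literal 'MISSING'. downcast=False is passed
# through, so this assumes a pandas version that still accepts that keyword.
import pandas as pd
from pandas.api.types import (is_numeric_dtype, is_object_dtype,
                              is_string_dtype)

frame = pd.DataFrame({"x": [1.0, None, 3.0], "s": ["a", None, "c"]})
frame = format_missings(frame)
assert frame["x"].iloc[1] == 2.0
assert frame["s"].iloc[1] == "MISSING"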
def _is_datetime(s):
    if is_datetime64_any_dtype(s):
        return True
    try:
        if is_object_dtype(s):
            pd.to_datetime(s, infer_datetime_format=True)
            return True
    except Exception:
        pass
    return False
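# Usage sketch for _is_datetime: native datetime64 columns and parseable
# object columns both count as datetime, unparseable strings do not.
# Assumes a pandas version where to_datetime still accepts the
# infer_datetime_format keyword (it is deprecated in pandas 2.x).
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_object_dtype

assert _is_datetime(pd.Series(pd.to_datetime(["2021-01-01"])))
assert _is_datetime(pd.Series(["2021-01-01", "2021-02-01"]))
assert not _is_datetime(pd.Series(["apple", "pear"]))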
def object_is_bool(series: pd.Series, state) -> bool:
    if pdt.is_object_dtype(series):
        bool_set = {True, False}
        try:
            ret = all(item in bool_set for item in series)
        except Exception:
            ret = False
        return ret
    return False
def _is_datetime(s):
    if is_datetime64_any_dtype(s):
        return True
    try:
        if is_object_dtype(s):
            pd.to_datetime(s, infer_datetime_format=True)
            return True
    except Exception:  # pylint: disable=broad-except
        pass
    return False
def contains_op(cls, series: pd.Series) -> bool:
    # TODO: without the object check this passes string categories...
    #  is there a better way?
    if not pdt.is_object_dtype(series):
        return False
    elif series.hasnans:
        series = series.dropna()
        if series.empty:
            return False
    return all(isinstance(v, str) for v in series)
def is_object(self):
    """
    Return if the current index type is an object type.

    Examples
    --------
    >>> ks.DataFrame({'a': [1]}, index=["a"]).index.is_object()
    True
    """
    return is_object_dtype(self.dtype)
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first
      element, or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the
      rules above to either floats, shortest-possible ints, or a
      categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that
        this will save memory.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32).
        If False, the type of the first element is used instead (for all
        values in the column).

    Returns
    -------
    col : pandas.Series

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0:  # pylint: disable=len-as-condition
        return col
    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints
        # (but not vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8*int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us
        # pretty close, so that at least order-of-magnitude efficiencies
        # will be found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype
            else:
                new_dtype = int_dtype
    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))
    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            new_col.cat.reorder_categories(fsort(new_col.cat.categories),
                                           inplace=True)
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s";'
                ' keeping original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col
def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
    """
    bad_dtypes = []
    bad_dates = []
    errors = []
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            actual = df.dtypes[c]
            desired = dtypes[c]
            if is_float_dtype(actual) and is_integer_dtype(desired):
                bad_dtypes.append((c, actual, desired))
            elif is_object_dtype(actual) and is_datetime64_any_dtype(desired):
                # This can only occur when parse_dates is specified, but an
                # invalid date is encountered. Pandas then silently falls back
                # to object dtype. Since `object_array.astype(datetime)` will
                # silently overflow, error here and report.
                bad_dates.append(c)
            else:
                try:
                    df[c] = df[c].astype(dtypes[c])
                except Exception as e:
                    bad_dtypes.append((c, actual, desired))
                    errors.append((c, e))

    if bad_dtypes:
        if errors:
            ex = '\n'.join("- %s\n  %r" % (c, e) for c, e in
                           sorted(errors, key=lambda x: str(x[0])))
            exceptions = ("The following columns also raised exceptions on "
                          "conversion:\n\n%s\n\n") % ex
            extra = ""
        else:
            exceptions = ""
            # All mismatches are int->float, also suggest `assume_missing=True`
            extra = ("\n\nAlternatively, provide `assume_missing=True` "
                     "to interpret\n"
                     "all unspecified integer columns as floats.")

        bad_dtypes = sorted(bad_dtypes, key=lambda x: str(x[0]))
        table = asciitable(['Column', 'Found', 'Expected'], bad_dtypes)
        dtype_kw = ('dtype={%s}' % ',\n'
                    '       '.join("%r: '%s'" % (k, v)
                                   for (k, v, _) in bad_dtypes))

        dtype_msg = (
            "{table}\n\n"
            "{exceptions}"
            "Usually this is due to dask's dtype inference failing, and\n"
            "*may* be fixed by specifying dtypes manually by adding:\n\n"
            "{dtype_kw}\n\n"
            "to the call to `read_csv`/`read_table`."
            "{extra}").format(table=table, exceptions=exceptions,
                              dtype_kw=dtype_kw, extra=extra)
    else:
        dtype_msg = None

    if bad_dates:
        also = " also " if bad_dtypes else " "
        cols = '\n'.join("- %s" % c for c in bad_dates)
        date_msg = (
            "The following columns{also}failed to properly parse as dates:\n\n"
            "{cols}\n\n"
            "This is usually due to an invalid value in that column. To\n"
            "diagnose and fix it's recommended to drop these columns from the\n"
            "`parse_dates` keyword, and manually convert them to dates later\n"
            "using `dd.to_datetime`.").format(also=also, cols=cols)
    else:
        date_msg = None

    if bad_dtypes or bad_dates:
        rule = "\n\n%s\n\n" % ('-' * 61)
        msg = ("Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n\n"
               "%s" % (rule.join(filter(None, [dtype_msg, date_msg]))))
        raise ValueError(msg)
def _is_discrete(s):
    return (is_categorical_dtype(s)
            or (is_object_dtype(s)
                and (force_nominal or s.nunique() < s.size ** .666)))
def universal_dataset_check(self, dataset_name, object_headers=None,
                            numeric_headers=None, bool_headers=None,
                            test_func=None):
    # "Hard" integrity checks that take a long time.
    # These tests only run if the MATMINER_DATASET_FULL_TEST
    # environment variable is set to True
    if do_complete_test:
        # Get rid of dataset if it's on the disk already
        data_path = os.path.join(
            self.dataset_dir,
            dataset_name + "." + self.dataset_dict[dataset_name]['file_type']
        )
        if os.path.exists(data_path):
            os.remove(data_path)

        # Test that dataset can be downloaded
        load_dataset(dataset_name)
        self.assertTrue(os.path.exists(data_path))

        # Test that data is now available and has all its elements
        df = load_dataset(dataset_name, download_if_missing=False)
        self.assertEqual(
            len(df), self.dataset_dict[dataset_name]["num_entries"]
        )

        # Test all columns are there
        self.assertEqual(
            sorted(list(df)),
            sorted([header for header in
                    self.dataset_dict[dataset_name]['columns'].keys()])
        )

        # Test each column for appropriate type
        if object_headers is None:
            object_headers = []
        if numeric_headers is None:
            numeric_headers = []
        if bool_headers is None:
            bool_headers = []

        df = load_dataset(dataset_name, download_if_missing=False)
        if object_headers:
            self.assertTrue(is_object_dtype(df[object_headers].values))
        if numeric_headers:
            self.assertTrue(is_numeric_dtype(df[numeric_headers].values))
        if bool_headers:
            self.assertTrue(is_bool_dtype(df[bool_headers].values))

        # Make sure all columns are accounted for
        column_headers = object_headers + numeric_headers + bool_headers
        self.assertEqual(sorted(list(df)), sorted(column_headers))

        # Run tests unique to the dataset
        if test_func is not None:
            test_func(df)

    # "Soft" check that just makes sure the dataset download page is active
    # This runs when on a system with the CI environment var present
    # (e.g. when running a continuous integration VCS system)
    else:
        download_page = requests.head(
            self.dataset_dict[dataset_name]["url"]
        )
        self.assertTrue(download_page.ok)
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to a Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical(series):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=np.float)
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=np.float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)