def warn_dtype_mismatch(left, right, left_on, right_on):
    """Checks for merge column dtype mismatches and throws a warning (#4574)

    Compares each pair of merge-key columns from ``left`` and ``right`` and
    emits a ``UserWarning`` listing every pair whose dtypes differ.  Nothing
    is checked unless every key exists on both frames.  Always returns None.
    """
    # Normalize scalar keys to single-element lists.
    left_keys = left_on if isinstance(left_on, list) else [left_on]
    right_keys = right_on if isinstance(right_on, list) else [right_on]

    # Bail out unless every key column is actually present on both sides.
    if any(key not in left.columns for key in left_keys):
        return
    if any(key not in right.columns for key in right_keys):
        return

    mismatched = []
    for lkey, rkey in zip(left_keys, right_keys):
        ldt = left.dtypes[lkey]
        rdt = right.dtypes[rkey]
        if not is_dtype_equal(ldt, rdt):
            mismatched.append(((lkey, rkey), ldt, rdt))

    if not mismatched:
        return

    # Render the offending pairs as an ASCII table inside the warning text.
    table = asciitable(
        ("Merge columns", "left dtype", "right dtype"), mismatched
    )
    warnings.warn(
        (
            "Merging dataframes with merge column data "
            "type mismatches: \n{}\nCast dtypes explicitly to "
            "avoid unexpected results."
        ).format(table)
    )
def equal_dtypes(a, b):
    """Return True when dtypes ``a`` and ``b`` are considered equivalent.

    Rules, in order:
    - categorical vs. non-categorical never match;
    - the ``"-"`` placeholder string never matches anything;
    - two categoricals match if either has unknown categories, otherwise
      they must compare equal;
    - otherwise the dtypes match when both kinds are in ``eq_types`` or
      pandas considers them equal.
    """
    a_is_cat = is_categorical_dtype(a)
    b_is_cat = is_categorical_dtype(b)
    if a_is_cat != b_is_cat:
        return False
    # "-" is a placeholder dtype marker; it compares unequal to everything.
    if (isinstance(a, str) and a == "-") or (isinstance(b, str) and b == "-"):
        return False
    if a_is_cat:
        # Both are categorical at this point.
        has_unknown = (
            UNKNOWN_CATEGORIES in a.categories
            or UNKNOWN_CATEGORIES in b.categories
        )
        return True if has_unknown else a == b
    if a.kind in eq_types and b.kind in eq_types:
        return True
    return is_dtype_equal(a, b)
def add_relationship(self, relationship):
    """Add a new relationship between entities in the entityset

    Args:
        relationship (Relationship) : Instance of new relationship to be added.

    Returns:
        self

    Raises:
        ValueError: if the child variable is also its entity's index, or if
            the parent and child columns end up with differing pandas dtypes.
    """
    # Re-adding an existing relationship is a no-op: warn and return.
    if relationship in self.relationships:
        logger.warning(
            "Not adding duplicate relationship: %s", relationship)
        return self

    # this is a new pair of entities
    child_e = relationship.child_entity
    child_v = relationship.child_variable.id
    # A variable cannot be both its entity's index and the child side of a
    # relationship.
    if child_e.index == child_v:
        msg = "Unable to add relationship because child variable '{}' in '{}' is also its index"
        raise ValueError(msg.format(child_v, child_e.id))
    parent_e = relationship.parent_entity
    parent_v = relationship.parent_variable.id

    # Coerce the variable types: child side becomes an Id, parent side an
    # Index.  convert_data=False — presumably only type metadata changes
    # here, not the underlying data; TODO confirm in convert_variable_type.
    if not isinstance(child_e[child_v], vtypes.Id):
        child_e.convert_variable_type(variable_id=child_v,
                                      new_type=vtypes.Id,
                                      convert_data=False)

    if not isinstance(parent_e[parent_v], vtypes.Index):
        parent_e.convert_variable_type(variable_id=parent_v,
                                       new_type=vtypes.Index,
                                       convert_data=False)

    # Empty dataframes (as a result of accessing Entity.metadata)
    # default to object dtypes for discrete variables, but
    # indexes/ids default to ints. In this case, we convert
    # the empty column's type to int
    if isinstance(child_e.df, pd.DataFrame) and \
            (child_e.df.empty and child_e.df[child_v].dtype == object and
             is_numeric_dtype(parent_e.df[parent_v])):
        child_e.df[child_v] = pd.Series(name=child_v, dtype=np.int64)

    # The two key columns must share a pandas dtype, otherwise reject the
    # relationship outright.
    parent_dtype = parent_e.df[parent_v].dtype
    child_dtype = child_e.df[child_v].dtype
    msg = u"Unable to add relationship because {} in {} is Pandas dtype {}"\
        u" and {} in {} is Pandas dtype {}."
    if not is_dtype_equal(parent_dtype, child_dtype):
        raise ValueError(msg.format(parent_v, parent_e.id, parent_dtype,
                                    child_v, child_e.id, child_dtype))

    self.relationships.append(relationship)
    # NOTE(review): presumably invalidates any cached serialized description
    # of the entityset — confirm against reset_data_description.
    self.reset_data_description()
    return self
def add_relationship(self, relationship):
    """Add a new relationship between entities in the entityset

    Args:
        relationship (Relationship) : Instance of new relationship to be added.

    Returns:
        self

    Raises:
        ValueError: if the parent and child key columns have differing
            pandas dtypes.
    """
    # Re-adding an existing relationship is a no-op: warn and return.
    if relationship in self.relationships:
        logger.warning("Not adding duplicate relationship: %s", relationship)
        return self

    # _operations?

    # this is a new pair of entities
    child_entity = relationship.child_entity
    child_var = relationship.child_variable.id
    parent_entity = relationship.parent_entity
    parent_var = relationship.parent_variable.id

    # Coerce the variable types: child side becomes an Id, parent side an
    # Index, unless each is already Discrete (convert_data=False).
    if not isinstance(child_entity[child_var], vtypes.Discrete):
        child_entity.convert_variable_type(variable_id=child_var,
                                           new_type=vtypes.Id,
                                           convert_data=False)

    if not isinstance(parent_entity[parent_var], vtypes.Discrete):
        parent_entity.convert_variable_type(variable_id=parent_var,
                                            new_type=vtypes.Index,
                                            convert_data=False)

    # The two key columns must share a pandas dtype.
    dtype_parent = parent_entity.df[parent_var].dtype
    dtype_child = child_entity.df[child_var].dtype
    if not is_dtype_equal(dtype_parent, dtype_child):
        template = ("Unable to add relationship because {} in {} is Pandas dtype {}"
                    " and {} in {} is Pandas dtype {}.")
        raise ValueError(template.format(parent_var, parent_entity.name,
                                         dtype_parent, child_var,
                                         child_entity.name, dtype_child))

    self.relationships.append(relationship)
    self.index_data(relationship)
    return self
def has_dtypes(df, items):
    """
    Assert that a DataFrame has ``dtypes`` as described in ``items``.

    Parameters
    ==========
    df: DataFrame
    items: dict
        A mapping of column names to:

        - functions (but **not** other callables!) that take a
          pandas.Series.dtype instance as input, and return ``True`` if the
          ``dtype`` is of the correct dtype and ``False`` otherwise, and/or
        - strings, corresponding to the possible output values of
          ``pd.api.types.infer_dtype``, and/or
        - dtypes, or strings that can be converted to dtypes. For example,
          ``'int32'`` turns into ``np.dtype('int32')``.

        Instead of a mapping, items may also be a single value from above.
        For example, ``'int32'`` checks that all columns have dtype
        ``int32``.

    Returns
    =======
    df : DataFrame
        The input frame, unchanged, so the check can be used in a
        ``.pipe`` chain.

    Raises
    ======
    AssertionError
        If a column fails its check, or a check function returns a
        non-boolean value.

    Examples
    =========
    .. code:: python

       import numpy as np
       import pandas as pd
       import engarde.checks as ck

       df = pd.DataFrame({'A': np.random.randint(0, 10, 10),
                          'B': np.random.randn(10)})
       df = df.pipe(ck.has_dtypes, items={'A': np.int32,
                                          'B': pd.api.types.is_float_dtype})
    """
    import types
    from collections.abc import Mapping

    from pandas.api.types import is_dtype_equal, infer_dtype

    if not isinstance(items, Mapping):
        # A single spec applies to every column.
        items = {col_name: items for col_name in df.columns}

    # Possible return values of pd.api.types.infer_dtype; a string spec
    # matching one of these is checked via infer_dtype rather than via the
    # column's dtype.  BUG FIX: the original set contained 'complex,'
    # (trailing comma inside the string), so a 'complex' spec silently fell
    # through to the dtype-equality branch.
    infer_strings = {
        'string', 'unicode', 'bytes', 'floating', 'integer', 'mixed-integer',
        'mixed-integer-float', 'complex', 'categorical', 'boolean',
        'datetime64', 'datetime', 'date', 'timedelta64', 'timedelta', 'time',
        'period', 'mixed',
    }

    for k, v in items.items():
        dtype = df.dtypes[k]
        if isinstance(v, (types.FunctionType, types.BuiltinFunctionType)):
            # Spec is a predicate over the column dtype.
            result = v(dtype)
            if not isinstance(result, bool):
                msg = "The function for key {!r} must return a boolean, returned type {!r}"
                raise AssertionError(msg.format(k, type(result)))
            if not result:
                msg = "Column {!r} has the wrong dtype ({!r}) for function {!r}"
                raise AssertionError(msg.format(k, dtype, v.__name__))
        elif v in infer_strings:
            # Spec is an infer_dtype category name.
            inferred_dtype_str = infer_dtype(df[k])
            if not inferred_dtype_str == v:
                msg = "Column {!r} expected {!r} for infer_dtype, got {!r}"
                raise AssertionError(msg.format(k, v, inferred_dtype_str))
        elif not is_dtype_equal(dtype, v):
            # Spec is a dtype (or something convertible to one).
            msg = "Column {!r} is checked for dtype {!r}, had dtype {!r}"
            raise AssertionError(msg.format(k, v, dtype))
    return df
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that
        this will save memory.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values
        in the column).

    Returns
    -------
    col : pandas.Series

    """
    from pisa.utils.fileio import fsort

    # Generic (empty) categorical dtype used as a sentinel target below.
    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    # Nothing to do (and no first element to inspect) for an empty column.
    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    # The first element stands in for the whole column's sub-type when the
    # dtype is object — assumes the column is homogeneous; TODO confirm.
    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    # Categorical / datetime / timedelta columns are left as-is.
    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        # Object columns are classified by the Python type of the first item.
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            # Signed range of an N-byte int: [-2**(8N-1), 2**(8N-1) - 1].
            exponent = 8 * int_dtype.itemsize - 1
            min_representable = -2**exponent
            max_representable = (2**exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype
            else:
                new_dtype = int_dtype
        # NOTE(review): when int_to_category is False, the shrunken
        # `int_dtype` found above is never assigned to `new_dtype`, so the
        # column keeps its original int dtype — this seems to contradict the
        # docstring's "reduced to smallest integer type"; confirm intent.
    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))
    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        # Best-effort warning to stderr; column is returned unchanged below.
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        # Presumably only reachable with a string first_item when the column
        # was already categorical (so .cat is valid) — TODO confirm.
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            # NOTE(review): Categorical.reorder_categories(inplace=True) was
            # removed in pandas 2.0; this requires an older pandas.
            new_col.cat.reorder_categories(fsort(new_col.cat.categories),
                                           inplace=True)
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        # Conversion failed (e.g. values not representable); keep original.
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that
        this will save memory.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values
        in the column).

    Returns
    -------
    col : pandas.Series

    """
    from pisa.utils.fileio import fsort

    # Bare CategoricalDtype used as the "convert to categorical" target.
    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    # An empty column has no first element to inspect; return untouched.
    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    # Sub-type decisions for object columns are based on the first element
    # only — assumes homogeneous columns; TODO confirm upstream guarantee.
    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    # Leave categorical, datetime, and timedelta columns unchanged.
    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        # Classify object columns by the Python type of the first item.
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            # An N-byte signed int represents [-2**(8N-1), 2**(8N-1) - 1].
            exponent = 8*int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype
            else:
                new_dtype = int_dtype
        # NOTE(review): with int_to_category=False the selected `int_dtype`
        # is never written to `new_dtype`, so no int shrinking happens —
        # verify this matches the intended behavior in the docstring.
    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))
    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        # Warn (via project helper wstderr) and fall through; the original
        # column is returned unchanged by the dtype-equality branch below.
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        # The string case here appears to require an already-categorical
        # column (otherwise .cat would raise) — TODO confirm.
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            # NOTE(review): the inplace=True form of reorder_categories was
            # removed in pandas 2.0; this code assumes an older pandas.
            new_col.cat.reorder_categories(fsort(new_col.cat.categories),
                                           inplace=True)
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        # Cast failed; report via wstderr and keep the original column.
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col