Exemple #1
0
def warn_dtype_mismatch(left, right, left_on, right_on):
    """Warn when paired merge keys of two frames differ in dtype (#4574).

    ``left_on`` and ``right_on`` may each be a single column label or a list
    of labels; a non-list value is wrapped in a one-element list. Nothing is
    checked unless every label is found in the ``columns`` of its frame.
    The keys are then paired positionally, and every pair whose dtypes are
    not equal according to ``is_dtype_equal`` is rendered with ``asciitable``
    and reported through a single ``warnings.warn`` call.

    Returns ``None`` in all cases.
    """
    left_keys = left_on if isinstance(left_on, list) else [left_on]
    right_keys = right_on if isinstance(right_on, list) else [right_on]

    # Bail out unless every key names a real column on its side.
    if any(key not in left.columns for key in left_keys):
        return
    if any(key not in right.columns for key in right_keys):
        return

    mismatches = []
    for left_key, right_key in zip(left_keys, right_keys):
        left_dtype = left.dtypes[left_key]
        right_dtype = right.dtypes[right_key]
        if is_dtype_equal(left_dtype, right_dtype):
            continue
        mismatches.append(((left_key, right_key), left_dtype, right_dtype))

    if not mismatches:
        return

    header = ("Merge columns", "left dtype", "right dtype")
    table = asciitable(header, mismatches)
    message = (
        "Merging dataframes with merge column data "
        "type mismatches: \n{}\nCast dtypes explicitly to "
        "avoid unexpected results."
    ).format(table)
    warnings.warn(message)
Exemple #2
0
 def equal_dtypes(a, b):
     """Return whether dtypes ``a`` and ``b`` count as equal.

     * If exactly one of the two is categorical, they are not equal.
     * If either one is the string ``"-"``, they are not equal.
     * Two categoricals are equal when ``UNKNOWN_CATEGORIES`` appears in the
       categories of either one; otherwise they are compared with ``==``.
     * Anything else is equal when both ``kind`` codes are in ``eq_types``,
       or when ``is_dtype_equal`` says so.
     """
     a_is_categorical = is_categorical_dtype(a)
     if a_is_categorical != is_categorical_dtype(b):
         return False

     for candidate in (a, b):
         if isinstance(candidate, str) and candidate == "-":
             return False

     # The flags matched above, so this single check covers both arguments.
     if a_is_categorical:
         if any(UNKNOWN_CATEGORIES in dt.categories for dt in (a, b)):
             return True
         return a == b

     if a.kind in eq_types and b.kind in eq_types:
         return True
     return is_dtype_equal(a, b)
Exemple #3
0
    def add_relationship(self, relationship):
        """Register a relationship between two entities of this entityset.

        A relationship already present in ``self.relationships`` is logged
        and ignored. Otherwise the child variable is converted to
        ``vtypes.Id`` and the parent variable to ``vtypes.Index`` when they
        are not already instances of those types, the pandas dtypes of both
        columns are checked for equality, and the relationship is appended.

        Args:
            relationship (Relationship) : Instance of new
                relationship to be added.

        Returns:
            This entityset (``self``), in every non-raising path.

        Raises:
            ValueError: If the child variable is the child entity's index,
                or if the parent and child columns have different dtypes.
        """
        if relationship in self.relationships:
            logger.warning("Not adding duplicate relationship: %s",
                           relationship)
            return self

        child = relationship.child_entity
        child_col = relationship.child_variable.id
        if child.index == child_col:
            index_msg = "Unable to add relationship because child variable '{}' in '{}' is also its index"
            raise ValueError(index_msg.format(child_col, child.id))

        parent = relationship.parent_entity
        parent_col = relationship.parent_variable.id

        child_is_id = isinstance(child[child_col], vtypes.Id)
        if not child_is_id:
            child.convert_variable_type(variable_id=child_col,
                                        new_type=vtypes.Id,
                                        convert_data=False)

        parent_is_index = isinstance(parent[parent_col], vtypes.Index)
        if not parent_is_index:
            parent.convert_variable_type(variable_id=parent_col,
                                         new_type=vtypes.Index,
                                         convert_data=False)

        # An empty pandas child frame whose key column is still object-typed
        # is given an empty int64 column when the parent key is numeric, so
        # that the dtype comparison below can succeed.
        if isinstance(child.df, pd.DataFrame) and child.df.empty:
            if (child.df[child_col].dtype == object and
                    is_numeric_dtype(parent.df[parent_col])):
                child.df[child_col] = pd.Series(name=child_col,
                                                dtype=np.int64)

        parent_dtype = parent.df[parent_col].dtype
        child_dtype = child.df[child_col].dtype
        if not is_dtype_equal(parent_dtype, child_dtype):
            dtype_msg = (u"Unable to add relationship because {} in {} is Pandas dtype {}"
                         u" and {} in {} is Pandas dtype {}.")
            raise ValueError(
                dtype_msg.format(parent_col, parent.id, parent_dtype,
                                 child_col, child.id, child_dtype))

        self.relationships.append(relationship)
        self.reset_data_description()
        return self
    def add_relationship(self, relationship):
        """Register a relationship between two entities of this entityset.

        A relationship already present in ``self.relationships`` is logged
        and ignored. Otherwise, a child variable that is not a
        ``vtypes.Discrete`` is converted to ``vtypes.Id`` and a parent
        variable that is not a ``vtypes.Discrete`` is converted to
        ``vtypes.Index``; the pandas dtypes of both columns must then match
        before the relationship is appended and ``index_data`` is called.

        Args:
            relationship (Relationship) : Instance of new
                relationship to be added.

        Returns:
            This entityset (``self``), in every non-raising path.

        Raises:
            ValueError: If the parent and child columns have different
                pandas dtypes.
        """
        if relationship in self.relationships:
            logger.warning("Not adding duplicate relationship: %s",
                           relationship)
            return self

        child = relationship.child_entity
        child_col = relationship.child_variable.id
        parent = relationship.parent_entity
        parent_col = relationship.parent_variable.id

        child_is_discrete = isinstance(child[child_col], vtypes.Discrete)
        if not child_is_discrete:
            child.convert_variable_type(variable_id=child_col,
                                        new_type=vtypes.Id,
                                        convert_data=False)

        parent_is_discrete = isinstance(parent[parent_col], vtypes.Discrete)
        if not parent_is_discrete:
            parent.convert_variable_type(variable_id=parent_col,
                                         new_type=vtypes.Index,
                                         convert_data=False)

        parent_dtype = parent.df[parent_col].dtype
        child_dtype = child.df[child_col].dtype
        if not is_dtype_equal(parent_dtype, child_dtype):
            dtype_msg = ("Unable to add relationship because {} in {} is Pandas dtype {}"
                         " and {} in {} is Pandas dtype {}.")
            raise ValueError(
                dtype_msg.format(parent_col, parent.name, parent_dtype,
                                 child_col, child.name, child_dtype))

        self.relationships.append(relationship)
        self.index_data(relationship)
        return self
Exemple #5
0
def has_dtypes(df, items):
    """
    Assert that a DataFrame has ``dtypes`` as described in ``items``.

    Parameters
    ==========
    df: DataFrame
    items: dict
      A mapping of column names to:
      - functions (but **not** other callables!) that take a pandas.Series.dtype instance as input, and
        return ``True`` if the ``dtype`` is of the correct dtype and ``False`` otherwise, and/or
      - strings, corresponding to the possible output values of ``pd.api.types.infer_dtype``
        (for example ``'integer'``, ``'floating'``, ``'complex'``, ``'decimal'``), and/or
      - dtypes, or strings that can be converted to dtypes. For example, ``'int32'`` turns into
        ``np.dtype('int32')``.
      Instead of a mapping, items may also be a single value from above. For example, ``'int32'``
      checks that all columns have dtype ``int32``.

    Returns
    =======
    df : DataFrame
      The unmodified input, so the check can be used inside ``DataFrame.pipe``.

    Raises
    ======
    AssertionError
      If a check function returns a non-``bool`` or ``False``, if the inferred
      dtype string differs from the expected one, or if the column dtype is
      not equal to the expected dtype.

    Examples
    =========

    .. code:: python
      import numpy as np
      import pandas as pd
      import engarde.checks as ck
      df = pd.DataFrame({'A': np.random.randint(0, 10, 10),
                         'B': np.random.randn(10)})
      df = df.pipe(ck.has_dtypes, items={'A': np.int32,
                                         'B': pd.api.types.is_float_dtype})
    """
    import types
    from collections.abc import Mapping
    from pandas.api.types import is_dtype_equal, infer_dtype

    if not isinstance(items, Mapping):  # check all columns for items
        items = {col_name: items for col_name in df.columns}

    # Strings that are compared against ``infer_dtype(column)`` rather than
    # being interpreted as a dtype.
    infer_strings = {
        'string',
        'unicode',
        'bytes',
        'floating',
        'integer',
        'mixed-integer',
        'mixed-integer-float',
        'decimal',
        'complex',
        'categorical',
        'boolean',
        'datetime64',
        'datetime',
        'date',
        'timedelta64',
        'timedelta',
        'time',
        'period',
        'mixed',
    }
    for k, v in items.items():
        dtype = df.dtypes[k]
        if isinstance(v, (types.FunctionType, types.BuiltinFunctionType)):
            result = v(dtype)
            if not isinstance(result, bool):
                msg = "The function for key  {!r} must return a boolean, returned type {!r}"
                raise AssertionError(msg.format(k, type(result)))
            if not result:
                msg = "Column {!r} has the wrong dtype ({!r}) for function {!r}"
                raise AssertionError(msg.format(k, dtype, v.__name__))
        # Only strings can name an inferred type; the isinstance guard also
        # keeps unhashable expectations from failing the set lookup.
        elif isinstance(v, str) and v in infer_strings:
            inferred_dtype_str = infer_dtype(df[k])
            if inferred_dtype_str != v:
                msg = "Column {!r} expected {!r} for infer_dtype, got {!r}"
                raise AssertionError(msg.format(k, v, inferred_dtype_str))
        elif not is_dtype_equal(dtype, v):
            msg = "Column {!r} is checked for dtype {!r}, had dtype {!r}"
            raise AssertionError(msg.format(k, v, dtype))
    return df
Exemple #6
0
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Categorical, datetime64 and timedelta64 columns keep their dtype. The
    contents of an object column are judged from its first element only.

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory. If False, integer columns are still reduced to the
        smallest integer type that holds all values.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series
        The converted column, or the input column itself when it is empty,
        needs no conversion, or cannot be cast to the chosen dtype.

    Raises
    ------
    ValueError
        If the values of an integer column fit in none of the dtypes in
        ``INT_DTYPES``.

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0:  #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    # NOTE(review): `basestring` is a Python 2 builtin; under Python 3 it has
    # to be provided by the enclosing module -- confirm.
    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8 * int_dtype.itemsize - 1
            min_representable = -2**exponent
            max_representable = (2**exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds' %
                             (col_name, int_dtype))

        # The smallest sufficient int is the target regardless of
        # `int_to_category`; it is only replaced below if a categorical is
        # estimated to be smaller.
        new_dtype = int_dtype

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n' %
                (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            # `reorder_categories` returns a new Series; its `inplace`
            # keyword was removed in pandas 2.0, so rebind instead.
            new_col = new_col.cat.reorder_categories(
                fsort(new_col.cat.categories))
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n' %
                (col_name, new_dtype, original_dtype))
        return col
Exemple #7
0
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Categorical, datetime64 and timedelta64 columns keep their dtype. The
    contents of an object column are judged from its first element only.

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory. If False, integer columns are still reduced to the
        smallest integer type that holds all values.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series
        The converted column, or the input column itself when it is empty,
        needs no conversion, or cannot be cast to the chosen dtype.

    Raises
    ------
    ValueError
        If the values of an integer column fit in none of the dtypes in
        ``INT_DTYPES``.

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    # NOTE(review): `basestring` is a Python 2 builtin; under Python 3 it has
    # to be provided by the enclosing module -- confirm.
    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8*int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # The smallest sufficient int is the target regardless of
        # `int_to_category`; it is only replaced below if a categorical is
        # estimated to be smaller.
        new_dtype = int_dtype

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            # `reorder_categories` returns a new Series; its `inplace`
            # keyword was removed in pandas 2.0, so rebind instead.
            new_col = new_col.cat.reorder_categories(
                fsort(new_col.cat.categories))
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col