Example #1
import pandas as pd
import pandas.api.types as pdt
import visions

def time_delta_contains(series: pd.Series, state: dict) -> bool:
    """
    Example:
        >>> x = pd.Series([pd.Timedelta(days=i) for i in range(3)])
        >>> x in visions.Timedelta
        True
    """
    return pdt.is_timedelta64_dtype(series)
Example #2
    def time_ref_unset(self) -> xr.DataArray:
        """Convert Timedelta + reference Timestamp to DatetimeIndex."""
        da = self._obj.copy()
        time_ref = da.weldx.time_ref
        if time_ref and is_timedelta64_dtype(da.time):
            da["time"] = da.time.data + time_ref
            da.time.attrs = self._obj.time.attrs  # restore old attributes!
        return da
Example #3
def is_timedelta_dtype(df: pd.DataFrame) -> pd.Series:
    """
    Check if each series in DataFrame is of a timedelta dtype.

    Wrapper that allows the dtype check to be applied to the entire DataFrame
    rather than one series at a time. This is a workaround for dill, which
    fails to pickle local contexts in nested lambda statements.
    """
    return df.apply(
        lambda s: types.is_timedelta64_dtype(s), result_type="expand"
    )
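A minimal usage sketch for the wrapper above, on hypothetical data (only pandas assumed):

import pandas as pd
from pandas.api import types

df = pd.DataFrame({
    "elapsed": pd.to_timedelta(["1 days", "2 days"]),  # timedelta64[ns]
    "count": [1, 2],                                   # int64
})

# Apply the dtype check column-wise, mirroring is_timedelta_dtype() above.
flags = df.apply(lambda s: types.is_timedelta64_dtype(s), result_type="expand")
print(flags.to_dict())  # {'elapsed': True, 'count': False}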
Example #4
def pandas_col_to_ibis_type(col):
    import numpy as np
    dty = col.dtype

    # datetime types
    if pdcom.is_datetime64tz_dtype(dty):
        return dt.Timestamp(str(dty.tz))

    if pdcom.is_datetime64_dtype(dty):
        if pdcom.is_datetime64_ns_dtype(dty):
            return dt.timestamp
        else:
            raise com.IbisTypeError("Column {0} has dtype {1}, which is "
                                    "datetime64-like but does "
                                    "not use nanosecond units".format(
                                        col.name, dty))
    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return dt.int64

    if pdcom.is_categorical_dtype(dty):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(dty):
        return dt.boolean

    # simple numerical types
    if issubclass(dty.type, np.int8):
        return dt.int8
    if issubclass(dty.type, np.int16):
        return dt.int16
    if issubclass(dty.type, np.int32):
        return dt.int32
    if issubclass(dty.type, np.int64):
        return dt.int64
    if issubclass(dty.type, np.float32):
        return dt.float
    if issubclass(dty.type, np.float64):
        return dt.double
    if issubclass(dty.type, np.uint8):
        return dt.int16
    if issubclass(dty.type, np.uint16):
        return dt.int32
    if issubclass(dty.type, np.uint32):
        return dt.int64
    if issubclass(dty.type, np.uint64):
        raise com.IbisTypeError("Column {} is an unsigned int64".format(
            col.name))

    if pdcom.is_object_dtype(dty):
        return _infer_object_dtype(col)

    raise com.IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))
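The ibis helpers (dt, com, pdcom) are not shown here. As a rough, self-contained sketch of the same dtype-dispatch idea using only pandas (describe_dtype is a hypothetical stand-in, not the ibis API):

import pandas as pd
import pandas.api.types as pdcom

def describe_dtype(col: pd.Series) -> str:
    # Toy stand-in for pandas_col_to_ibis_type(): map a column dtype to a name.
    dty = col.dtype
    if pdcom.is_datetime64_any_dtype(dty):
        return "timestamp"
    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return "int64"  # the same lossy encoding the ibis code warns about
    if pdcom.is_bool_dtype(dty):
        return "boolean"
    if pdcom.is_integer_dtype(dty):
        return str(dty)
    raise TypeError("Column %s is dtype %s" % (col.name, dty))

print(describe_dtype(pd.to_timedelta([1, 2], unit="s").to_series()))  # int64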
Example #5
    def from_values(cls, initial_value, values=None, closed="left"):
        """
        Construct :class:`Stairs` from :class:`pandas.Series`.

        Parameters
        ----------
        initial_value : float, default 0
            The value of the step function at negative infinity.
        values : :class:`pandas.Series`
            The values of the step function when approaching the change points
            from the right.
        closed : {"left", "right"}
            Indicates whether the half-open intervals comprising the step function should be interpreted
            as left-closed or right-closed.

        Returns
        -------
        :class:`Stairs`
        """

        if not isinstance(values, pd.Series) or values.empty:
            raise ValueError("values must be a not empty Series")

        if not (is_numeric_dtype(values.index) or is_datetime64_dtype(
                values.index) or is_timedelta64_dtype(values.index)):
            warnings.warn("The index of data is not numeric, or time based")

        if np.isinf(values.index).any():
            raise ValueError("Invalid value for Series index")

        if not is_numeric_dtype(values) or not is_number(initial_value):
            raise ValueError("Invalid dtype for from_values()")

        if not values.index.is_monotonic_increasing:
            raise ValueError("Series index must be monotonic")

        series_values_inf_mask = np.isinf(values)
        if series_values_inf_mask.any():
            values = values.replace([np.inf], np.nan)
            warnings.warn(
                "Infinity values detected and have been converted to NaN")

        new_instance = cls(closed=closed)
        new_instance.initial_value = initial_value
        new_instance._data = values.to_frame("value")
        new_instance._valid_deltas = False
        new_instance._valid_values = True
        return new_instance
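The guard at the top of from_values() combines three index dtype checks. A standalone sketch of just that check, assuming only pandas (index_is_supported is a hypothetical name):

import pandas as pd
from pandas.api.types import (
    is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype,
)

def index_is_supported(idx: pd.Index) -> bool:
    # Mirrors the guard in from_values(): numeric, datetime, or timedelta index.
    return (
        is_numeric_dtype(idx)
        or is_datetime64_dtype(idx)
        or is_timedelta64_dtype(idx)
    )

print(index_is_supported(pd.RangeIndex(3)))                   # True
print(index_is_supported(pd.to_timedelta([1, 2], unit="s")))  # True (TimedeltaIndex)
print(index_is_supported(pd.Index(["a", "b"])))               # False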
Example #6
    def time_ref(self, value: pd.Timestamp):
        """Convert INPLACE to new reference time.

        If no reference time exists, the new value will be assigned
        TODO: should None be allowed and pass through, or raise TypeError?
        """
        if "time" in self._obj.coords:
            value = _as_valid_timestamp(value)
            if self._obj.weldx.time_ref and is_timedelta64_dtype(self._obj.time):
                if value == self._obj.weldx.time_ref:
                    return
                _attrs = self._obj.time.attrs
                time_delta = value - self._obj.weldx.time_ref
                self._obj["time"] = self._obj.time.data - time_delta
                self._obj.time.attrs = _attrs  # restore old attributes!
                self._obj.time.attrs["time_ref"] = value  # set new time_ref value
            else:
                self._obj.time.attrs["time_ref"] = value
Example #7
def _spacing(da, dims):
    """
    Verify correct spacing and return the spacing for each axis
    :param da:
    :return:
    """
    delta_x = []
    for d in dims:
        coord = da[d]
        diff = np.diff(coord)
        if is_timedelta64_dtype(diff):
            # convert to seconds so we get hertz
            diff = diff.astype('timedelta64[s]').astype('f8')
        delta = diff[0]
        if not np.allclose(diff, diff[0]):
            raise ValueError("Can't take Fourier transform because"
                             "coodinate %s is not evenly spaced" % d)
        delta_x.append(delta)

    return delta_x
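The non-obvious step in _spacing() is converting timedelta differences to float seconds. A self-contained sketch of that conversion with numpy and the pandas dtype check:

import numpy as np
from pandas.api.types import is_timedelta64_dtype

# Hourly coordinate, as np.diff would produce it for a datetime64 axis.
coord = np.array(["2021-01-01T00", "2021-01-01T01", "2021-01-01T02"],
                 dtype="datetime64[h]")
diff = np.diff(coord)  # timedelta64[h]
if is_timedelta64_dtype(diff):
    # Convert to seconds so the FFT frequencies come out in hertz.
    diff = diff.astype("timedelta64[s]").astype("f8")
print(diff)  # [3600. 3600.]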
Example #8
def pandas_iter(
        df: pd.DataFrame,
        columns: List[str],
        mask: Optional[np.ndarray] = None
) -> Generator[List[Any], None, None]:
    arrays = []

    for column in columns:
        srs = df.loc[:, column]

        if mask is not None:
            srs = srs[mask]

        if is_datetime64_any_dtype(srs) or is_datetime64_ns_dtype(srs):
            arrays.append(map(pd.Timestamp, srs.values))
        elif is_timedelta64_dtype(srs) or is_timedelta64_ns_dtype(srs):
            arrays.append(map(pd.Timedelta, srs.values))
        else:
            arrays.append(srs.values)
    yield from zip(*arrays)
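A hypothetical call to pandas_iter() above (assuming its typing and pandas.api.types imports are in scope):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "ts": pd.to_datetime(["2021-01-01", "2021-01-02"]),
    "delta": pd.to_timedelta(["1 days", "2 days"]),
})
mask = np.array([True, False])  # keep only the first row

for ts, delta in pandas_iter(df, ["ts", "delta"], mask):
    print(type(ts).__name__, type(delta).__name__)  # Timestamp Timedelta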
Example #9
    def time_ref(self, value: types_timestamp_like):
        """Convert INPLACE to new reference time.

        If no reference time exists, the new value will be assigned.
        """
        if value is None:
            raise TypeError("'None' is not allowed as value.")
        if "time" in self._obj.coords:
            value = Time(value).as_timestamp()
            if self._obj.weldx.time_ref and is_timedelta64_dtype(
                    self._obj.time):
                if value == self._obj.weldx.time_ref:
                    return
                _attrs = self._obj.time.attrs
                time_delta = value - self._obj.weldx.time_ref
                self._obj["time"] = self._obj.time.data - time_delta
                self._obj.time.attrs = _attrs  # restore old attributes!
                self._obj.time.attrs["time_ref"] = value  # set new time_ref value
            else:
                self._obj.time.attrs["time_ref"] = value
Example #10
def to_pandas_time_index(
    time: Union[
        pint.Quantity,
        np.ndarray,
        pd.TimedeltaIndex,
        pd.DatetimeIndex,
        xr.DataArray,
        "tf.LocalCoordinateSystem",
    ],
) -> Union[pd.TimedeltaIndex, pd.DatetimeIndex]:
    """Convert a time variable to the corresponding pandas time index type.

    Parameters
    ----------
    time :
        Variable that should be converted.

    Returns
    -------
    Union[pandas.TimedeltaIndex, pandas.DatetimeIndex] :
        Time union of all input objects

    """
    from weldx.transformations import LocalCoordinateSystem

    _input_type = type(time)

    if isinstance(time, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        return time

    if isinstance(time, LocalCoordinateSystem):
        return to_pandas_time_index(time.time)

    if isinstance(time, pint.Quantity):
        base = "s"  # using low base unit could cause rounding errors
        if not np.iterable(time):  # catch zero-dim arrays
            time = np.expand_dims(time, 0)
        return pd.TimedeltaIndex(data=time.to(base).magnitude, unit=base)

    if isinstance(time, (xr.DataArray, xr.Dataset)):
        if "time" in time.coords:
            time = time.time
        time_index = pd.Index(time.values)
        if is_timedelta64_dtype(time_index) and time.weldx.time_ref:
            time_index = time_index + time.weldx.time_ref
        return time_index

    if not np.iterable(time) or isinstance(time, str):
        time = [time]
    time = pd.Index(time)

    if isinstance(time, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        return time

    # try manual casting for object dtypes (i.e. strings), should avoid integers
    # warning: this allows something like ["1","2","3"], which will be interpreted as ns!
    if is_object_dtype(time):
        for func in (pd.DatetimeIndex, pd.TimedeltaIndex):
            try:
                return func(time)
            except (ValueError, TypeError):
                continue

    raise TypeError(
        f"Could not convert {_input_type} to pd.DatetimeIndex or pd.TimedeltaIndex"
    )
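The object-dtype fallback at the end tries each pandas index constructor in turn. A standalone sketch of that pattern (cast_time_index is a hypothetical reduction of to_pandas_time_index, without the weldx-specific branches):

import pandas as pd
from pandas.api.types import is_object_dtype

def cast_time_index(values):
    idx = pd.Index(values)
    if isinstance(idx, (pd.DatetimeIndex, pd.TimedeltaIndex)):
        return idx
    if is_object_dtype(idx):
        # Try each constructor; strings that fail datetime parsing may
        # still parse as timedeltas.
        for func in (pd.DatetimeIndex, pd.TimedeltaIndex):
            try:
                return func(idx)
            except (ValueError, TypeError):
                continue
    raise TypeError(f"Could not convert {type(values)}")

print(cast_time_index(["2020-01-01", "2020-01-02"]))  # DatetimeIndex
print(cast_time_index(["1 day", "2 days"]))           # TimedeltaIndex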
Example #11
from pandas.api import types as pdtypes

def timedelta_func(series):
    return pdtypes.is_timedelta64_dtype(series.dtype)
Example #12
def censor(x, range=(0, 1), only_finite=True):
    """
    Convert any values outside of range to a **NULL** type object.

    Parameters
    ----------
    x : array_like
        Values to manipulate
    range : tuple
        (min, max) giving desired output range
    only_finite : bool
        If True (the default), will only modify
        finite values.

    Returns
    -------
    x : array_like
        Censored array

    Examples
    --------
    >>> a = [1, 2, np.inf, 3, 4, -np.inf, 5]
    >>> censor(a, (0, 10))
    [1, 2, inf, 3, 4, -inf, 5]
    >>> censor(a, (0, 10), False)
    [1, 2, nan, 3, 4, nan, 5]
    >>> censor(a, (2, 4))
    [nan, 2, inf, 3, 4, -inf, nan]

    Notes
    -----
    All values in ``x`` should be of the same type. The ``only_finite``
    parameter is not considered for Datetime and Timedelta types.

    The **NULL** type object depends on the type of values in **x**.

    - :class:`float` - :py:`float('nan')`
    - :class:`int` - :py:`float('nan')`
    - :class:`datetime.datetime` - :py:`np.datetime64(NaT)`
    - :class:`datetime.timedelta` - :py:`np.timedelta64(NaT)`

    """
    if not len(x):
        return x

    py_time_types = (datetime.datetime, datetime.timedelta)
    np_pd_time_types = (pd.Timestamp, pd.Timedelta,
                        np.datetime64, np.timedelta64)
    x0 = first_element(x)

    # Yes, we want type not isinstance
    if type(x0) in py_time_types:
        return _censor_with(x, range, 'NaT')

    if not hasattr(x, 'dtype') and isinstance(x0, np_pd_time_types):
        return _censor_with(x, range, type(x0)('NaT'))

    x_array = np.asarray(x)
    if pdtypes.is_number(x0) and not isinstance(x0, np.timedelta64):
        null = float('nan')
    elif com.is_datetime_arraylike(x_array):
        null = pd.Timestamp('NaT')
    elif pdtypes.is_datetime64_dtype(x_array):
        null = np.datetime64('NaT')
    elif isinstance(x0, pd.Timedelta):
        null = pd.Timedelta('NaT')
    elif pdtypes.is_timedelta64_dtype(x_array):
        null = np.timedelta64('NaT')
    else:
        raise ValueError(
            "Do not know how to censor values of type "
            "{}".format(type(x0)))

    if only_finite:
        try:
            finite = np.isfinite(x)
        except TypeError:
            finite = np.repeat(True, len(x))
    else:
        finite = np.repeat(True, len(x))

    if hasattr(x, 'dtype'):
        outside = (x < range[0]) | (x > range[1])
        bool_idx = finite & outside
        x = x.copy()
        x[bool_idx] = null
    else:
        x = [null if not range[0] <= val <= range[1] and f else val
             for val, f in zip(x, finite)]

    return x
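The Notes above state that the NULL object depends on the value type; for a timedelta64 array it is np.timedelta64('NaT'). A small standalone illustration of that replacement, without calling censor() (which relies on helpers such as first_element and _censor_with that are not shown):

import numpy as np

x = np.array([1, 5, 20], dtype="timedelta64[D]")
lo, hi = np.timedelta64(0, "D"), np.timedelta64(10, "D")
outside = (x < lo) | (x > hi)
x[outside] = np.timedelta64("NaT")  # timedelta null, as censor() would use
print(x)  # [1 5 'NaT'] (in days)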
Example #13
    category_count = df[y].value_counts().count()
    if category_count == 1:
        # handling this case separately avoids unnecessary calculation
        return df, "target_is_constant"
    if _dtype_represents_categories(df[y]) and (category_count == len(df[y])):
        # handling this case separately likewise avoids unnecessary calculation
        return df, "target_is_id"

    if _dtype_represents_categories(df[y]):
        return df, "classification"
    if is_numeric_dtype(df[y]):
        # this check needs to be after is_bool_dtype (which is part of _dtype_represents_categories) because bool is considered numeric by pandas
        return df, "regression"

    if is_datetime64_any_dtype(df[y]) or is_timedelta64_dtype(df[y]):
        # IDEA: show warning
        # raise TypeError(
        #     f"The target column {y} has the dtype {df[y].dtype} which is not supported. A possible solution might be to convert {y} to a string column"
        # )
        return df, "target_is_datetime"

    # IDEA: show warning
    # raise Exception(
    #     f"Could not infer a valid task based on the target {y}. The dtype {df[y].dtype} is not yet supported"
    # )  # pragma: no cover
    return df, "target_data_type_not_supported"


def _feature_is_id(df, x):
    "Returns Boolean if the feature column x is an ID"
Example #14
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0:  #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        found_int_dtype = False
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8 * int_dtype.itemsize - 1
            min_representable = -2**exponent
            max_representable = (2**exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                found_int_dtype = True
                break
        if not found_int_dtype:
            raise ValueError('Value(s) in column "%s" exceed %s bounds' %
                             (col_name, int_dtype))

        # Check if a categorical is probably smaller than the int dtype; note
        # that the below is not perfect (i.e. is not based on the exact
        # internal representation of categoricals in pandas) but should get us
        # pretty close, so that at least order-of-magnitude efficiencies will
        # be found
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype
            else:
                new_dtype = int_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n' %
                (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            new_col.cat.reorder_categories(fsort(new_col.cat.categories),
                                           inplace=True)
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n' %
                (col_name, new_dtype, original_dtype))
        return col
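The core trick in convert_col_dtype() is scanning integer dtypes from narrowest to widest. A self-contained sketch of that search, with the module's INT_DTYPES constant (not shown) replaced by an explicit list:

import numpy as np
import pandas as pd

INT_DTYPES = [np.dtype("int8"), np.dtype("int16"),
              np.dtype("int32"), np.dtype("int64")]

col = pd.Series([0, 300, -7])
col_min, col_max = col.min(), col.max()

for int_dtype in INT_DTYPES:
    # A signed N-byte int spans [-2**(8N-1), 2**(8N-1) - 1].
    exponent = 8 * int_dtype.itemsize - 1
    if -2**exponent <= col_min and col_max <= 2**exponent - 1:
        break  # int16 is the narrowest dtype that fits 300

print(col.astype(int_dtype).dtype)  # int16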
Example #15
import pandas as pd
from datetime import timedelta
from pandas.api.types import is_timedelta64_dtype

def is_timedelta(value):
    if isinstance(value, (list, tuple)):
        value = pd.Series(value)
    return is_timedelta64_dtype(value) or isinstance(value, timedelta)
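Quick checks for is_timedelta(), assuming the imports added above:

print(is_timedelta(pd.to_timedelta([1, 2], unit="s")))  # True  (timedelta64 dtype)
print(is_timedelta(timedelta(days=1)))                  # True  (stdlib scalar)
print(is_timedelta([1, 2, 3]))                          # False (numeric series)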
Example #16
def duration(s1: pd.Series,
             s2: pd.Series = None,
             unit: Union[str, None] = None,
             round: Union[bool, int] = 2,
             freq: str = 'd') -> pd.Series:
    '''Calculate the duration between two datetime columns (series).

    Parameters
    ----------
    s1
        'from' datetime series
    s2
        'to' datetime series.
        Default None. If None, defaults to today.
    unit
        default None - returns timedelta in days
                'd' - days as an integer,
                'years' (based on 365.25 days per year),
                'months' (based on 30 day month)

        Other possible options are:
            - ‘W’, ‘D’, ‘T’, ‘S’, ‘L’, ‘U’, or ‘N’
            - ‘days’ or ‘day’
            - ‘hours’, ‘hour’, ‘hr’, or ‘h’
            - ‘minutes’, ‘minute’, ‘min’, or ‘m’
            - ‘seconds’, ‘second’, or ‘sec’
            - ‘milliseconds’, ‘millisecond’, ‘millis’, or ‘milli’
            - ‘microseconds’, ‘microsecond’, ‘micros’, or ‘micro’
            - ‘nanoseconds’, ‘nanosecond’, ‘nanos’, ‘nano’, or ‘ns’.

        check out pandas
        `timedelta object <https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.html>`_
        for details.
    round
        Default 2. If the duration result is numeric and this parameter
        contains a positive integer, the result is rounded to this
        decimal precision.
    freq
        Default is 'd'(days). If the duration result is a pd.Timedelta dtype,
        the value can be 'rounded' using this frequency parameter.

        Must be a fixed frequency like 'S' (second) not 'ME' (month end).
        For a list of valid values, check out
        `pandas offset aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_


    Returns
    -------
    series
        if unit is None - series is of data type timedelta64[ns],
        otherwise a numeric series.

    Examples
    --------

    .. code-block::

        %%piper
        sample_data()
        >> select(['-countries', '-regions', '-ids', '-values_1', '-values_2'])
        >> assign(new_date_col=pd.to_datetime('2018-01-01'))
        >> assign(duration = lambda x: duration(x.new_date_col, x.order_dates, unit='months'))
        >> assign(duration_dates_age = lambda x: duration(x['dates']))
        >> head(tablefmt='plain')

            dates      order_dates new_date_col duration duration_dates_age
         0  2020-01-01  2020-01-07   2018-01-01       25           452 days
         1  2020-01-02  2020-01-08   2018-01-01       25           451 days
         2  2020-01-03  2020-01-09   2018-01-01       25           450 days
         3  2020-01-04  2020-01-10   2018-01-01       25           449 days

    '''
    if s2 is None:
        s2 = datetime.today()

    if unit is None:
        result = s2 - s1
    elif unit == 'years':
        result = ((s2 - s1) / pd.Timedelta(365.25, 'd'))
    elif unit == 'months':
        result = ((s2 - s1) / pd.Timedelta(30, 'd'))
    else:
        result = (s2 - s1) / pd.Timedelta(1, unit)

    if is_numeric_dtype(result):
        result = result.round(round)
    elif is_timedelta64_dtype(result):
        result = result.dt.round(freq=freq)

    return result
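The unit conversion in duration() is plain timedelta division. A standalone sketch of the 'months' branch (30-day months, per the docstring):

import pandas as pd

s1 = pd.to_datetime(pd.Series(["2018-01-01", "2018-01-01"]))
s2 = pd.to_datetime(pd.Series(["2020-01-07", "2020-01-08"]))

# Dividing a timedelta series by a pd.Timedelta yields a float series.
months = ((s2 - s1) / pd.Timedelta(30, "d")).round(2)
print(months.tolist())  # [24.53, 24.57]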
Example #17
    def contains_op(cls, series: pd.Series) -> bool:
        return pdt.is_timedelta64_dtype(series)