def coerce_dtypes(df, dtypes):
    """ Coerce dataframe to dtypes safely

    Operates in place

    Parameters
    ----------
    df: Pandas DataFrame
    dtypes: dict like {'x': float}
        Columns of ``df`` that are absent from ``dtypes`` are left untouched.

    Raises
    ------
    ValueError
        If a column currently holds floats while ``dtypes`` asks for an
        integer dtype. The message lists every such column.
    """
    for c in df.columns:
        if c in dtypes and df.dtypes[c] != dtypes[c]:
            if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c]):
                # There is a mismatch between floating and integer columns.
                # Determine all mismatched and error. Only columns that were
                # actually requested can be mismatched; looking up the others
                # in ``dtypes`` would raise KeyError and hide this report.
                mismatched = sorted(col for col in df.columns
                                    if col in dtypes and
                                    is_float_dtype(df.dtypes[col]) and
                                    is_integer_dtype(dtypes[col]))

                msg = ("Mismatched dtypes found.\n"
                       "Expected integers, but found floats for columns:\n"
                       "%s\n\n"
                       "To fix, specify dtypes manually by adding:\n\n"
                       "%s\n\n"
                       "to the call to `read_csv`/`read_table`.\n\n"
                       "Alternatively, provide `assume_missing=True` to "
                       "interpret all unspecified integer columns as floats.")

                missing_list = '\n'.join('- %r' % col for col in mismatched)
                dtype_list = ('%r: float' % col for col in mismatched)
                missing_dict = 'dtype={%s}' % ',\n '.join(dtype_list)
                raise ValueError(msg % (missing_list, missing_dict))
            df[c] = df[c].astype(dtypes[c])
def nankurt(values, axis=None, skipna=True):
    """ Compute the sample excess kurtosis.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
        Axis to reduce over; ``None`` reduces over all elements.
    skipna : bool, default True
        When True, missing entries are zeroed out before the moments are
        accumulated.

    Returns
    -------
    scalar or ndarray
        NaN where fewer than four observations are counted, 0 where the
        (error-corrected) denominator is zero.
    """
    mask = isnull(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
        count = _get_counts(mask, axis)
    else:
        count = _get_counts(mask, axis, dtype=values.dtype)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted4 = adjusted2 ** 2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    # ``(count - 2) * (count - 3)`` is zero for tiny samples; those slots are
    # overwritten with NaN below, so the intermediate warnings are noise.
    with np.errstate(invalid='ignore', divide='ignore'):
        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
        numer = count * (count + 1) * (count - 1) * m4
        denom = (count - 2) * (count - 3) * m2 ** 2

    # floating point error
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    # the only division that matters: it uses the error-corrected operands
    with np.errstate(invalid='ignore', divide='ignore'):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result
def nanskew(values, axis=None, skipna=True):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Returns NaN where fewer than three observations are counted and 0 where
    the second central moment is zero.
    """
    null_mask = isnull(values)
    if is_float_dtype(values.dtype):
        count = _get_counts(null_mask, axis, dtype=values.dtype)
    else:
        # non-float input is promoted so the moments are computed in float64
        values = values.astype('f8')
        count = _get_counts(null_mask, axis)

    if skipna:
        # zero the missing slots on a copy so they drop out of the sums
        values = values.copy()
        np.putmask(values, null_mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    centered = values - mean
    if skipna:
        np.putmask(centered, null_mask, 0)
    squared = centered ** 2
    cubed = squared * centered

    # floating point error
    m2 = _zero_out_fperr(squared.sum(axis, dtype=np.float64))
    m3 = _zero_out_fperr(cubed.sum(axis, dtype=np.float64))

    with np.errstate(invalid='ignore', divide='ignore'):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    out_dtype = values.dtype
    if is_float_dtype(out_dtype):
        result = result.astype(out_dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result

    # scalar reduction
    if count < 3:
        return np.nan
    return 0 if m2 == 0 else result
def nanskew(values, axis=None, skipna=True):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Results with fewer than three counted observations are NaN; results whose
    second central moment is zero are 0.
    """
    missing = isnull(values)
    if is_float_dtype(values.dtype):
        n_obs = _get_counts(missing, axis, dtype=values.dtype)
    else:
        values = values.astype('f8')
        n_obs = _get_counts(missing, axis)

    if skipna:
        values = values.copy()
        np.putmask(values, missing, 0)

    center = values.sum(axis, dtype=np.float64) / n_obs
    if axis is not None:
        # restore the reduced axis so the subtraction broadcasts
        center = np.expand_dims(center, axis)

    dev = values - center
    if skipna:
        np.putmask(dev, missing, 0)
    dev2 = dev ** 2
    dev3 = dev2 * dev
    m2 = dev2.sum(axis, dtype=np.float64)
    m3 = dev3.sum(axis, dtype=np.float64)

    # floating point error
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    with np.errstate(invalid='ignore', divide='ignore'):
        result = (n_obs * (n_obs - 1) ** 0.5 / (n_obs - 2)) * (m3 / m2 ** 1.5)

    if is_float_dtype(values.dtype):
        result = result.astype(values.dtype)

    if not isinstance(result, np.ndarray):
        if m2 == 0:
            result = 0
        return np.nan if n_obs < 3 else result

    result = np.where(m2 == 0, 0, result)
    result[n_obs < 3] = np.nan
    return result
def backfill_2d(values, limit=None, mask=None, dtype=None):
    """
    Backward-fill ``values`` using the dtype-specific 2-D routine.

    Parameters
    ----------
    values : ndarray
    limit : int, optional
        Passed through to the fill routine.
    mask : ndarray, optional
        Missing-value mask; computed with ``isnull`` when omitted.
    dtype : dtype, optional
        Defaults to ``values.dtype``.

    Returns
    -------
    values : ndarray
        Integer input comes back as the float64 conversion.

    Raises
    ------
    ValueError
        If no fill routine is available for the dtype.
    """
    if dtype is None:
        dtype = values.dtype

    if is_float_dtype(values):
        filler = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None)
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        filler = _backfill_2d_datetime
    elif is_integer_dtype(values):
        values = _ensure_float64(values)
        filler = algos.backfill_2d_inplace_float64
    elif values.dtype == np.object_:
        filler = algos.backfill_2d_inplace_object
    else:
        filler = None

    if filler is None:
        raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name)

    if mask is None:
        mask = isnull(values)
    mask = mask.view(np.uint8)

    # a zero-length dimension leaves nothing to fill, so the call is skipped
    if np.all(values.shape):
        filler(values, mask, limit=limit)
    return values
def _isfinite(values):
    """
    Return a boolean mask that is True where ``values`` is NOT finite.

    Datetime/timedelta input is masked with ``isnull``; complex, float,
    integer and bool input goes straight to ``np.isfinite``; anything else is
    cast to float64 first.
    """
    if is_datetime_or_timedelta_dtype(values):
        return isnull(values)

    natively_numeric = (is_complex_dtype(values) or is_float_dtype(values) or
                        is_integer_dtype(values) or is_bool_dtype(values))
    candidate = values if natively_numeric else values.astype('float64')
    return ~np.isfinite(candidate)
def _get_data_algo(values, func_map):
    """
    Pick the entry of ``func_map`` matching ``values`` and coerce ``values``
    to the representation that entry expects.

    Returns
    -------
    (f, values) : tuple
        ``f`` falls back to ``func_map['object']`` whenever the selected
        entry is None or no ``'string'`` entry exists.
    """
    chosen = None

    if is_float_dtype(values):
        chosen, values = func_map['float64'], _ensure_float64(values)
    elif needs_i8_conversion(values):
        chosen, values = func_map['int64'], values.view('i8')
    elif is_signed_integer_dtype(values):
        chosen, values = func_map['int64'], _ensure_int64(values)
    elif is_unsigned_integer_dtype(values):
        chosen, values = func_map['uint64'], _ensure_uint64(values)
    else:
        values = _ensure_object(values)

        # its cheaper to use a String Hash Table than Object
        if lib.infer_dtype(values) in ['string']:
            try:
                chosen = func_map['string']
            except KeyError:
                # no string-specialised entry; the object fallback applies
                pass

    if chosen is None:
        chosen = func_map['object']

    return chosen, values
def _hashtable_algo(f, values, return_dtype=None):
    """
    f(HashTable, type_caster) -> result

    Dispatch ``f`` on ``values.dtype``. Datetime and timedelta results are
    viewed back as ``return_dtype`` (default ``'M8[ns]'`` / ``'m8[ns]'``).
    """
    dtype = values.dtype

    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    if is_signed_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    if is_unsigned_integer_dtype(dtype):
        return f(htable.UInt64HashTable, _ensure_uint64)

    if is_datetime64_dtype(dtype):
        view_as = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_as)
    if is_timedelta64_dtype(dtype):
        view_as = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_as)

    # its cheaper to use a String Hash Table than Object
    if lib.infer_dtype(values) in ['string']:
        return f(htable.StringHashTable, _ensure_object)

    # use Object
    return f(htable.PyObjectHashTable, _ensure_object)
def test_setitem_dtype_upcast(self):
    """Setting a value of a different kind via ``.loc`` changes only the
    dtype of the column that was written to."""
    # GH3216
    df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
    df['c'] = np.nan
    self.assertEqual(df['c'].dtype, np.float64)

    df.loc[0, 'c'] = 'foo'
    first_row = {"a": 1, "c": 'foo'}
    second_row = {"a": 3, "b": 2, "c": np.nan}
    expected = DataFrame([first_row, second_row])
    tm.assert_frame_equal(df, expected)

    # GH10280
    df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
                   index=list('ab'),
                   columns=['foo', 'bar', 'baz'])

    for val in [3.14, 'wxyz']:
        left = df.copy()
        left.loc['a', 'bar'] = val
        right = DataFrame([[0, val, 2], [3, 4, 5]],
                          index=list('ab'),
                          columns=['foo', 'bar', 'baz'])

        tm.assert_frame_equal(left, right)
        # the columns that were not assigned to keep their integer dtype
        for untouched in ('foo', 'baz'):
            self.assertTrue(is_integer_dtype(left[untouched]))

    left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0,
                     index=list('ab'),
                     columns=['foo', 'bar', 'baz'])
    left.loc['a', 'bar'] = 'wxyz'

    right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]],
                      index=list('ab'),
                      columns=['foo', 'bar', 'baz'])

    tm.assert_frame_equal(left, right)
    for untouched in ('foo', 'baz'):
        self.assertTrue(is_float_dtype(left[untouched]))
def _value_counts_arraylike(values, dropna=True):
    """
    Count occurrences of each distinct value.

    Parameters
    ----------
    values : array-like
        Wrapped in a ``Series`` to obtain the underlying ndarray.
    dropna : bool, default True
        Passed to the hashtable counters; for datetime-like input it also
        removes the ``iNaT`` key.

    Returns
    -------
    (keys, counts) : tuple
    """
    is_datetimetz_type = is_datetimetz(values)
    is_period_type = (is_period_dtype(values) or
                      is_period_arraylike(values))

    # kept so the timezone can be recovered after the conversion below
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or is_period_type:

        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period_type:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
        if is_period_type:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        keys, counts = htable.value_count_object(values, dropna)

        mask = isnull(values)
        if not dropna and mask.any():
            # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias is
            # gone from NumPy 2.0
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
def nansem(values, axis=None, skipna=True, ddof=1):
    """
    Standard error of the mean: ``sqrt(variance) / sqrt(count)``.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Forwarded to ``nanvar`` and ``_get_counts_nanvar``.
    """
    # Runs on the input exactly as given, before any cast. The value is
    # superseded below; the call is kept because it may raise for input that
    # the float cast would otherwise accept.
    nanvar(values, axis, skipna, ddof=ddof)

    null_mask = isnull(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
    count, _unused = _get_counts_nanvar(null_mask, axis, ddof, values.dtype)
    variance = nanvar(values, axis, skipna, ddof=ddof)

    return np.sqrt(variance) / np.sqrt(count)
def nansum(values, axis=None, skipna=True):
    """
    Sum ``values`` along ``axis`` with missing entries filled by 0.

    Float input accumulates in its own dtype, timedelta input in float64 and
    everything else in the ``dtype_max`` reported by ``_get_values``.
    """
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)

    if is_float_dtype(dtype):
        acc_dtype = dtype
    elif is_timedelta64_dtype(dtype):
        acc_dtype = np.float64
    else:
        acc_dtype = dtype_max

    total = values.sum(axis, dtype=acc_dtype)
    total = _maybe_null_out(total, axis, mask)
    return _wrap_results(total, dtype)
def _value_counts_arraylike(values, dropna=True):
    """
    Count occurrences of each distinct value.

    Parameters
    ----------
    values : array-like
        Wrapped in a ``Series`` to obtain the underlying ndarray.
    dropna : bool, default True
        For datetime-like input the ``iNaT`` key is removed when True; for
        object input a NaN key is prepended when False and nulls exist.

    Returns
    -------
    (keys, counts) : tuple
    """
    is_datetimetz_type = is_datetimetz(values)
    is_period = (isinstance(values, ABCPeriodIndex) or
                 is_period_arraylike(values))

    # kept so the timezone can be recovered after the conversion below
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            if isinstance(orig, ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    else:
        values = _ensure_object(values)
        mask = isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias is
            # gone from NumPy 2.0
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
def _simple_new(cls, values, name=None, freq=None, **kwargs): """ Values can be any type that can be coerced to Periods. Ordinals in an ndarray are fastpath-ed to `_from_ordinals` """ if not is_integer_dtype(values): values = np.array(values, copy=False) if len(values) > 0 and is_float_dtype(values): raise TypeError("PeriodIndex can't take floats") return cls(values, name=name, freq=freq, **kwargs) return cls._from_ordinals(values, name, freq, **kwargs)
def fill_zeros(result, x, y, name, fill):
    """
    if this is a reversed op, then flip x,y

    if we have an integer value (or array in y)
    and we have 0's, fill them with the fill,
    return the result

    mask the nan's from x
    """
    if fill is None or is_float_dtype(result):
        return result

    reversed_op = name.startswith(('r', '__r'))
    if reversed_op:
        x, y = y, x

    divisor_is_typed = hasattr(y, 'dtype') or hasattr(y, 'type')
    divisor_is_scalar = is_scalar(y)
    if not (divisor_is_typed or divisor_is_scalar):
        return result

    if divisor_is_scalar:
        y = np.array(y)

    if not is_integer_dtype(y):
        return result

    zero_divisor = (y == 0)
    if not zero_divisor.any():
        return result

    # GH 7325, mask and nans must be broadcastable (also: PR 9308)
    # Raveling and then reshaping makes np.putmask faster
    mask = (zero_divisor & ~np.isnan(result)).ravel()

    shape = result.shape
    result = result.astype('float64', copy=False).ravel()

    np.putmask(result, mask, fill)

    # if we have a fill of inf, then sign it correctly
    # (GH 6178 and PR 9308)
    if np.isinf(fill):
        signs = np.sign(y if reversed_op else x)
        negative_inf_mask = (signs.ravel() < 0) & mask
        np.putmask(result, negative_inf_mask, -fill)

    if "floordiv" in name:  # (PR 9308)
        nan_mask = (zero_divisor & (x == 0)).ravel()
        np.putmask(result, nan_mask, np.nan)

    return result.reshape(shape)
def _get_prev_label(label):
    """
    Return the closest representable label below ``label``.

    Datetime/timedelta labels step back one nanosecond, integer labels step
    back by 1 and float labels move to the previous representable float.

    Raises
    ------
    TypeError
        If ``label`` is none of the kinds above.
    """
    dtype = getattr(label, 'dtype', type(label))
    if isinstance(label, (Timestamp, Timedelta)):
        dtype = 'datetime64'

    if is_datetime_or_timedelta_dtype(dtype):
        return label - np.timedelta64(1, 'ns')
    elif is_integer_dtype(dtype):
        return label - 1
    elif is_float_dtype(dtype):
        # ``np.inf`` rather than the ``np.infty`` alias, which NumPy 2.0 removed
        return np.nextafter(label, -np.inf)
    else:
        raise TypeError('cannot determine previous label for type %r'
                        % type(label))
def test_setitem_dtype_upcast(self):
    """Assigning a value of another kind through ``.loc`` must only change
    the dtype of the column that received it."""
    # GH3216
    frame = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
    frame['c'] = np.nan
    self.assertEqual(frame['c'].dtype, np.float64)

    frame.loc[0, 'c'] = 'foo'
    expected = DataFrame([
        {"a": 1, "c": 'foo'},
        {"a": 3, "b": 2, "c": np.nan},
    ])
    tm.assert_frame_equal(frame, expected)

    # GH10280
    base = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
                     index=list('ab'),
                     columns=['foo', 'bar', 'baz'])

    for val in [3.14, 'wxyz']:
        left = base.copy()
        left.loc['a', 'bar'] = val

        rows = [[0, val, 2], [3, 4, 5]]
        right = DataFrame(rows, index=list('ab'),
                          columns=['foo', 'bar', 'baz'])

        tm.assert_frame_equal(left, right)
        # neighbours of the written column stay integer
        self.assertTrue(is_integer_dtype(left['foo']))
        self.assertTrue(is_integer_dtype(left['baz']))

    # same check starting from an all-float frame
    left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0,
                     index=list('ab'),
                     columns=['foo', 'bar', 'baz'])
    left.loc['a', 'bar'] = 'wxyz'

    rows = [[0, 'wxyz', .2], [.3, .4, .5]]
    right = DataFrame(rows, index=list('ab'),
                      columns=['foo', 'bar', 'baz'])

    tm.assert_frame_equal(left, right)
    self.assertTrue(is_float_dtype(left['foo']))
    self.assertTrue(is_float_dtype(left['baz']))
def test_set_value(self):
    # Exercises ``Panel4D.set_value`` for existing cells and for labels that
    # force a resize. The whole body runs under a context that expects a
    # FutureWarning (stack level not checked).
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # write 1. into every cell on every other major-axis label and read
        # it back through chained indexing
        for label in self.panel4d.labels:
            for item in self.panel4d.items:
                for mjr in self.panel4d.major_axis[::2]:
                    for mnr in self.panel4d.minor_axis:
                        self.panel4d.set_value(label, item, mjr, mnr, 1.)
                        assert_almost_equal(
                            self.panel4d[label][item][mnr][mjr], 1.)

        # setting an integer at labels 'foobar'/'baz' yields float values
        res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5)
        self.assertTrue(is_float_dtype(res3['l4'].values))

        # resize
        # the result must be a new Panel4D, distinct from the fixture
        res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5)
        tm.assertIsInstance(res, Panel4D)
        self.assertIsNot(res, self.panel4d)
        self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5)

        # same dtype check as above, repeated after the resize
        res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5)
        self.assertTrue(is_float_dtype(res3['l4'].values))
def _simple_new(cls, left, right, closed=None, name=None, copy=False,
                verify_integrity=True):
    """
    Build an instance of ``cls`` from left and right bounds.

    Parameters
    ----------
    left, right : index-like
        Passed through ``_ensure_index``; an integer side is cast to the
        other side's dtype when that one is float.
    closed : str, optional
        Defaults to ``'right'``.
    name : object, optional
    copy : bool, default False
    verify_integrity : bool, default True
        When True, ``_validate`` is called on the new object.

    Raises
    ------
    ValueError
        If the two sides end up as different types, or are period indexes.
    """
    result = IntervalMixin.__new__(cls)

    closed = 'right' if closed is None else closed
    left = _ensure_index(left, copy=copy)
    right = _ensure_index(right, copy=copy)

    # coerce dtypes to match if needed
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    if is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    left_type, right_type = type(left), type(right)
    if left_type != right_type:
        raise ValueError("must not have differing left [{}] "
                         "and right [{}] types".format(
                             left_type, right_type))

    if isinstance(left, ABCPeriodIndex):
        raise ValueError("Period dtypes are not supported, "
                         "use a PeriodIndex instead")

    result._left, result._right = left, right
    result._closed = closed
    result.name = name
    if verify_integrity:
        result._validate()
    result._reset_identity()
    return result
def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """
    dtype = values.dtype

    # no need to revert to original type
    if needs_i8_conversion(dtype):
        values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    # the hashtable is chosen from the dtype captured before the unwrapping
    if is_signed_integer_dtype(dtype):
        flags = htable.duplicated_int64(_ensure_int64(values), keep=keep)
    elif is_unsigned_integer_dtype(dtype):
        flags = htable.duplicated_uint64(_ensure_uint64(values), keep=keep)
    elif is_float_dtype(dtype):
        flags = htable.duplicated_float64(_ensure_float64(values), keep=keep)
    else:
        flags = htable.duplicated_object(_ensure_object(values), keep=keep)

    return flags
def astype(self, dtype, copy=True):
    """
    Cast the index values to ``dtype`` and wrap them in a new ``Index``.

    Parameters
    ----------
    dtype : dtype-like
        Must resolve to a float, integer or object dtype.
    copy : bool, default True
        Forwarded to the underlying ``astype``.

    Raises
    ------
    ValueError
        If an integer dtype is requested while the index holds NaN.
    TypeError
        If ``dtype`` is not float, integer or object.
    """
    dtype = pandas_dtype(dtype)

    if is_float_dtype(dtype):
        target = dtype
    elif is_integer_dtype(dtype):
        if self.hasnans:
            raise ValueError('cannot convert float NaN to integer')
        target = dtype
    elif is_object_dtype(dtype):
        target = 'object'
    else:
        raise TypeError('Setting %s dtype to anything other than '
                        'float64 or object is not supported' %
                        self.__class__)

    values = self._values.astype(target, copy=copy)
    return Index(values, name=self.name, dtype=dtype)
def nanvar(values, axis=None, skipna=True, ddof=1):
    """
    Variance along ``axis`` with ``ddof`` delta degrees of freedom.

    Float input gets a result in its own precision; other input is returned
    as accumulated in float64.
    """
    orig_dtype = values.dtype
    null_mask = isnull(values)

    if is_any_int_dtype(values):
        values = values.astype('f8')
        values[null_mask] = np.nan

    # the counting helper only receives a dtype for float data
    dtype_arg = (values.dtype,) if is_float_dtype(values) else ()
    count, divisor = _get_counts_nanvar(null_mask, axis, ddof, *dtype_arg)

    if skipna:
        values = values.copy()
        np.putmask(values, null_mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    total = _ensure_numeric(values.sum(axis=axis, dtype=np.float64))
    avg = total / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)

    sq_dev = _ensure_numeric((avg - values) ** 2)
    np.putmask(sq_dev, null_mask, 0)
    result = sq_dev.sum(axis=axis, dtype=np.float64) / divisor

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(orig_dtype):
        result = result.astype(orig_dtype)
    return _wrap_results(result, values.dtype)
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True):
    """ utility to get the values view, mask, dtype
    if necessary copy and mask using the specified fill_value
    copy = True will force the copy

    Returns
    -------
    (values, mask, dtype, dtype_max) : tuple
        ``dtype_max`` is int64 for integer/bool input, float64 for float
        input and ``dtype`` itself otherwise.
    """
    values = _values_from_object(values)
    mask = _isfinite(values) if isfinite else isnull(values)

    dtype = values.dtype
    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    # the copy is taken first whether or not filling happens
    if copy:
        values = values.copy()

    if skipna:
        if dtype_ok:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _changed = _maybe_upcast_putmask(values, mask, fill_value)

    values = _view_if_needed(values)

    # return a platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max
def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
    """Round-trip a 2x2 identity matrix through ``SparseDataFrame`` and back
    to a scipy sparse matrix for the given spmatrix type, labels, fill value
    and dtype."""
    # GH 4343
    tm.skip_if_no_package('scipy')

    # Make one ndarray and from it one sparse matrix, both to be used for
    # constructing frames and comparing results
    dense = np.eye(2, dtype=dtype)
    try:
        spm = spmatrix(dense)
        assert spm.dtype == dense.dtype
    except (TypeError, AssertionError):
        # If conversion to sparse fails for this spmatrix type and arr.dtype,
        # then the combination is not currently supported in NumPy, so we
        # can just skip testing it thoroughly
        return

    sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
                             default_fill_value=fill_value)

    # Expected result construction is kind of tricky for all
    # dtype-fill_value combinations; easiest to cast to something generic
    # and except later on
    as_object = dense.astype(object)
    as_object[dense == 0] = np.nan
    fill_with = np.nan if fill_value is None else fill_value
    expected = pd.SparseDataFrame(as_object, index=index,
                                  columns=columns).fillna(fill_with)

    # Assert frame is as expected
    sdf_obj = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Assert spmatrices equal
    tm.assert_equal(dict(sdf.to_coo().todok()), dict(spm.todok()))

    # Ensure dtype is preserved if possible
    fill_is_floatlike = fill_value is None or is_float(fill_value)
    was_upcast = fill_is_floatlike and not (is_object_dtype(dtype) or
                                            is_float_dtype(dtype))
    if is_bool_dtype(dtype):
        res_dtype = bool
    elif was_upcast:
        res_dtype = float
    else:
        res_dtype = dtype
    tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)})
    tm.assert_equal(sdf.to_coo().dtype, res_dtype)

    # However, adding a str column results in an upcast to object
    sdf['strings'] = np.arange(len(sdf)).astype(str)
    tm.assert_equal(sdf.to_coo().dtype, np.object_)
def _hashtable_algo(f, dtype, return_dtype=None):
    """
    f(HashTable, type_caster) -> result

    Dispatch ``f`` on ``dtype``. Datetime and timedelta results are viewed
    back as ``return_dtype`` (default ``'M8[ns]'`` / ``'m8[ns]'``); any dtype
    not matched uses the object hash table.
    """
    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    if is_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)

    if is_datetime64_dtype(dtype):
        view_as = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_as)
    if is_timedelta64_dtype(dtype):
        view_as = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(view_as)

    return f(htable.PyObjectHashTable, _ensure_object)
def _get_data_algo(values, func_map):
    """
    Pick the entry of ``func_map`` matching ``values`` and coerce ``values``
    to the representation that entry expects.

    Returns
    -------
    (f, values) : tuple
        Keys used are ``'float64'``, ``'int64'`` and ``'generic'``.
    """
    if is_float_dtype(values):
        return func_map['float64'], _ensure_float64(values)
    if needs_i8_conversion(values):
        # datetime-like data is reinterpreted as its int64 representation
        return func_map['int64'], values.view('i8')
    if is_integer_dtype(values):
        return func_map['int64'], _ensure_int64(values)
    return func_map['generic'], _ensure_object(values)
def test_set_value(self):
    """``Panel4D.set_value`` updates existing cells and returns a new,
    float-valued object when the labels force a resize."""
    # write 1.0 into every cell on every other major-axis label and read it
    # back through chained indexing
    for lbl in self.panel4d.labels:
        for itm in self.panel4d.items:
            for major in self.panel4d.major_axis[::2]:
                for minor in self.panel4d.minor_axis:
                    self.panel4d.set_value(lbl, itm, major, minor, 1.0)
                    stored = self.panel4d[lbl][itm][minor][major]
                    assert_almost_equal(stored, 1.0)

    # resize
    resized = self.panel4d.set_value("l4", "ItemE", "foo", "bar", 1.5)
    tm.assertIsInstance(resized, Panel4D)
    self.assertIsNot(resized, self.panel4d)
    self.assertEqual(resized.get_value("l4", "ItemE", "foo", "bar"), 1.5)

    # an integer written at new labels still yields float values
    upcast = self.panel4d.set_value("l4", "ItemE", "foobar", "baz", 5)
    self.assertTrue(is_float_dtype(upcast["l4"].values))
def _simple_new(cls, values, name=None, freq=None, **kwargs): if not is_integer_dtype(values): values = np.array(values, copy=False) if (len(values) > 0 and is_float_dtype(values)): raise TypeError("PeriodIndex can't take floats") else: return PeriodIndex(values, name=name, freq=freq, **kwargs) values = np.array(values, dtype='int64', copy=False) result = object.__new__(cls) result._data = values result.name = name if freq is None: raise ValueError('freq is not specified') result.freq = Period._maybe_convert_freq(freq) result._reset_identity() return result
def _maybe_cast_indexed(self, key): """ we need to cast the key, which could be a scalar or an array-like to the type of our subtype """ if isinstance(key, IntervalIndex): return key subtype = self.dtype.subtype if is_float_dtype(subtype): if is_integer(key): key = float(key) elif isinstance(key, (np.ndarray, Index)): key = key.astype('float64') elif is_integer_dtype(subtype): if is_integer(key): key = int(key) return key
def pad_1d(values, limit=None, mask=None, dtype=None):
    """
    Forward-fill ``values`` using the dtype-specific 1-D routine.

    Parameters
    ----------
    values : ndarray
    limit : int, optional
        Passed through to the fill routine.
    mask : ndarray, optional
        Missing-value mask; computed with ``isnull`` when omitted.
    dtype : dtype, optional
        Defaults to ``values.dtype``.

    Returns
    -------
    values : ndarray
        Integer input comes back as the float64 conversion.

    Raises
    ------
    ValueError
        If no fill routine is available for the dtype.
    """
    if dtype is None:
        dtype = values.dtype

    if is_float_dtype(values):
        filler = getattr(algos, 'pad_inplace_%s' % dtype.name, None)
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        filler = _pad_1d_datetime
    elif is_integer_dtype(values):
        values = _ensure_float64(values)
        filler = algos.pad_inplace_float64
    elif values.dtype == np.object_:
        filler = algos.pad_inplace_object
    else:
        filler = None

    if filler is None:
        raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name)

    if mask is None:
        mask = isnull(values)
    mask = mask.view(np.uint8)

    filler(values, mask, limit=limit)
    return values
def nanmean(values, axis=None, skipna=True):
    """
    Mean along ``axis`` with missing entries filled by 0 before summing.

    Positions with a zero count yield NaN.
    """
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)

    # choose the accumulator and counter precisions together
    if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
        sum_dtype, count_dtype = np.float64, np.float64
    elif is_float_dtype(dtype):
        sum_dtype, count_dtype = dtype, dtype
    else:
        sum_dtype, count_dtype = dtype_max, np.float64

    count = _get_counts(mask, axis, dtype=count_dtype)
    total = _ensure_numeric(values.sum(axis, dtype=sum_dtype))

    if axis is not None and getattr(total, 'ndim', False):
        mean = total / count
        empty = count == 0
        if empty.any():
            mean[empty] = np.nan
    elif count > 0:
        mean = total / count
    else:
        mean = np.nan

    return _wrap_results(mean, dtype)
def nanmedian(values, axis=None, skipna=True):
    """
    Median along ``axis``.

    With ``skipna=False`` any slice containing a missing value yields NaN.
    Empty input yields NaN (scalar, or an array shaped like the input minus
    the reduced axis).
    """
    values, mask, dtype, dtype_max = _get_values(values, skipna)

    def get_median(x):
        valid = notnull(x)
        if not (skipna or valid.all()):
            return np.nan
        return algos.median(_values_from_object(x[valid]))

    if not is_float_dtype(values):
        values = values.astype('f8')
        values[mask] = np.nan

    if axis is None:
        values = values.ravel()

    notempty = values.size

    # one-dimensional data reduces to a scalar
    if values.ndim <= 1:
        return _wrap_results(get_median(values) if notempty else np.nan,
                             dtype)

    # an array from a frame
    if notempty:
        # there's a non-empty array to apply over otherwise numpy raises
        medians = np.apply_along_axis(get_median, axis, values)
        return _wrap_results(medians, dtype)

    # must return the correct shape, but median is not defined for the
    # empty set so return nans of shape "everything but the passed axis"
    # since "axis" is where the reduction would occur if we had a nonempty
    # array
    shp = np.array(values.shape)
    dims = np.arange(values.ndim)
    ret = np.empty(shp[dims != axis])
    ret.fill(np.nan)
    return _wrap_results(ret, dtype)
def nanmedian(values, axis=None, skipna=True):
    """
    Median along ``axis``.

    Non-float input is cast to float64 with masked positions set to NaN.
    With ``skipna=False`` a slice holding any missing value yields NaN.
    """
    values, mask, dtype, dtype_max = _get_values(values, skipna)

    def get_median(chunk):
        present = notnull(chunk)
        if not skipna and not present.all():
            return np.nan
        kept = _values_from_object(chunk[present])
        return algos.median(kept)

    if not is_float_dtype(values):
        values = values.astype('f8')
        values[mask] = np.nan

    if axis is None:
        values = values.ravel()

    notempty = values.size

    # an array from a frame
    if values.ndim > 1:
        if not notempty:
            # must return the correct shape, but median is not defined for
            # the empty set so return nans of shape "everything but the
            # passed axis" since "axis" is where the reduction would occur
            # if we had a nonempty array
            shp = np.array(values.shape)
            dims = np.arange(values.ndim)
            ret = np.empty(shp[dims != axis])
            ret.fill(np.nan)
            return _wrap_results(ret, dtype)

        # there's a non-empty array to apply over otherwise numpy raises
        return _wrap_results(
            np.apply_along_axis(get_median, axis, values), dtype)

    # otherwise return a scalar value
    scalar = get_median(values) if notempty else np.nan
    return _wrap_results(scalar, dtype)
def convert(values, unit, axis):
    """
    Convert date-like ``values`` to float ordinals.

    Integers, floats and numeric arrays are returned unchanged; a string
    that cannot be parsed is returned as given. ``unit`` and ``axis`` are
    not used by the body.
    """
    if isinstance(values, (datetime, pydt.date)):
        return _dt_to_float_ordinal(values)
    if isinstance(values, np.datetime64):
        return _dt_to_float_ordinal(lib.Timestamp(values))
    if isinstance(values, pydt.time):
        return dates.date2num(values)
    if is_integer(values) or is_float(values):
        return values

    if isinstance(values, compat.string_types):
        # best effort: hand the string back untouched when parsing fails
        try:
            return _dt_to_float_ordinal(tools.to_datetime(values))
        except Exception:
            return values

    if not isinstance(values, (list, tuple, np.ndarray, Index)):
        return values

    if isinstance(values, Index):
        values = values.values
    if not isinstance(values, np.ndarray):
        values = com._asarray_tuplesafe(values)

    if is_integer_dtype(values) or is_float_dtype(values):
        return values

    try:
        values = tools.to_datetime(values)
        if isinstance(values, Index):
            values = _dt_to_float_ordinal(values)
        else:
            values = [_dt_to_float_ordinal(x) for x in values]
    except Exception:
        # fall back to converting whatever ``values`` holds at this point
        values = _dt_to_float_ordinal(values)

    return values
def convert(values, unit, axis):
    """
    Convert date-like ``values`` to float ordinals.

    Integers, floats and numeric arrays pass through unchanged; an
    unparseable string is returned as given. ``unit`` and ``axis`` are not
    used by the body.
    """
    def try_parse(text):
        # best effort: keep the original string when parsing fails
        try:
            return _dt_to_float_ordinal(tools.to_datetime(text))
        except Exception:
            return text

    # scalar inputs
    if isinstance(values, (datetime, pydt.date)):
        return _dt_to_float_ordinal(values)
    if isinstance(values, np.datetime64):
        return _dt_to_float_ordinal(lib.Timestamp(values))
    if isinstance(values, pydt.time):
        return dates.date2num(values)
    if is_integer(values) or is_float(values):
        return values
    if isinstance(values, compat.string_types):
        return try_parse(values)

    # sequence inputs
    if isinstance(values, (list, tuple, np.ndarray, Index)):
        if isinstance(values, Index):
            values = values.values
        if not isinstance(values, np.ndarray):
            values = com._asarray_tuplesafe(values)

        if is_integer_dtype(values) or is_float_dtype(values):
            return values

        try:
            values = tools.to_datetime(values)
            if isinstance(values, Index):
                values = values.map(_dt_to_float_ordinal)
            else:
                values = [_dt_to_float_ordinal(item) for item in values]
        except Exception:
            # fall back to converting whatever ``values`` holds at this point
            values = _dt_to_float_ordinal(values)

    return values