def test_float_subtype(self, start, end, freq): # Has float subtype if any of start/end/freq are float, even if all # resulting endpoints can safely be upcast to integers # defined from start/end/freq index = interval_range(start=start, end=end, freq=freq) result = index.dtype.subtype expected = 'int64' if is_integer(start + end + freq) else 'float64' assert result == expected # defined from start/periods/freq index = interval_range(start=start, periods=5, freq=freq) result = index.dtype.subtype expected = 'int64' if is_integer(start + freq) else 'float64' assert result == expected # defined from end/periods/freq index = interval_range(end=end, periods=5, freq=freq) result = index.dtype.subtype expected = 'int64' if is_integer(end + freq) else 'float64' assert result == expected # GH 20976: linspace behavior defined from start/end/periods index = interval_range(start=start, end=end, periods=5) result = index.dtype.subtype expected = 'int64' if is_integer(start + end) else 'float64' assert result == expected
def _evaluate_numeric_binop(self, other):
    """
    Apply ``op`` (taken from the enclosing scope) to this index and ``other``.

    The result is expressed as a ``RangeIndex`` built from the transformed
    start/stop/step when possible; otherwise the operation is deferred to
    the ``Int64Index`` implementation.
    """

    def _defer_to_int64index():
        # operand order follows the ``reversed`` flag of the enclosing scope
        if reversed:
            return op(other, self._int64index)
        return op(self._int64index, other)

    if isinstance(other, (ABCSeries, ABCTimedeltaIndex)):
        # Series and TimedeltaIndex provide their own implementations
        return NotImplemented
    if isinstance(other, (timedelta, np.timedelta64)):
        # GH#19333 is_integer evaluated True on timedelta64,
        # so we need to catch these explicitly
        return _defer_to_int64index()

    other = self._validate_for_numeric_binop(other, op, opstr)
    attrs = self._maybe_update_attributes(self._get_attributes_dict())

    left, right = (other, self) if reversed else (self, other)

    try:
        if step:
            # apply the step override
            with np.errstate(all='ignore'):
                rstep = step(left._step, right)

            # we don't have a representable op
            # so return a base index
            if not (is_integer(rstep) and rstep):
                raise ValueError
        else:
            rstep = left._step

        with np.errstate(all='ignore'):
            rstart = op(left._start, right)
            rstop = op(left._stop, right)

        result = RangeIndex(rstart, rstop, rstep, **attrs)

        # for compat with numpy / Int64Index
        # even if we can represent as a RangeIndex, return
        # as a Float64Index if we have float-like descriptors
        if not all(is_integer(x) for x in [rstart, rstop, rstep]):
            result = result.astype('float64')

        return result

    except (ValueError, TypeError, AttributeError, ZeroDivisionError):
        # Defer to Int64Index implementation
        return _defer_to_int64index()
def _evaluate_numeric_binop(self, other):
    """
    Apply ``op`` (taken from the enclosing scope) to this index and ``other``.

    A ``RangeIndex`` is returned when the transformed start/stop/step can be
    represented; otherwise the operands are converted to their values and
    the result is wrapped in a plain ``Index``.
    """
    if isinstance(other, ABCSeries):
        return NotImplemented

    other = self._validate_for_numeric_binop(other, op, opstr)
    attrs = self._maybe_update_attributes(self._get_attributes_dict())

    # operand order follows the ``reversed`` flag of the enclosing scope
    left, right = (other, self) if reversed else (self, other)

    try:
        if step:
            # apply the step override
            with np.errstate(all='ignore'):
                rstep = step(left._step, right)

            # we don't have a representable op
            # so return a base index
            if not (is_integer(rstep) and rstep):
                raise ValueError
        else:
            rstep = left._step

        with np.errstate(all='ignore'):
            rstart = op(left._start, right)
            rstop = op(left._stop, right)

        result = RangeIndex(rstart, rstop, rstep, **attrs)

        # for compat with numpy / Int64Index
        # even if we can represent as a RangeIndex, return
        # as a Float64Index if we have float-like descriptors
        if not all(is_integer(x) for x in [rstart, rstop, rstep]):
            result = result.astype('float64')

        return result

    except (ValueError, TypeError, AttributeError):
        pass

    # convert to Int64Index ops
    if isinstance(left, RangeIndex):
        left = left.values
    if isinstance(right, RangeIndex):
        right = right.values

    with np.errstate(all='ignore'):
        results = op(left, right)
    return Index(results, **attrs)
def test_quantile_interpolation_dtype(self): # GH #10174 # interpolation = linear (default case) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower') assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher') assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q)
def get_freq_code(freqstr):
    """
    Return freq str or tuple to freq code and stride (mult)

    Parameters
    ----------
    freqstr : str, int, tuple or DateOffset
        A DateOffset is first reduced to ``(rule_code, n)``. A tuple may be
        ``(code, stride)`` with two integers, or a frequency string paired
        with an integer stride in either order.

    Returns
    -------
    tuple
        Base frequency code and stride (mult).

    Examples
    --------
    >>> get_freq_code('3D')
    (6000, 3)

    >>> get_freq_code('D')
    (6000, 1)

    >>> get_freq_code(('D', 3))
    (6000, 3)
    """
    if isinstance(freqstr, DateOffset):
        freqstr = (freqstr.rule_code, freqstr.n)

    if isinstance(freqstr, tuple):
        if (is_integer(freqstr[0]) and
                is_integer(freqstr[1])):
            # e.g., freqstr = (2000, 1)
            return freqstr
        else:
            # e.g., freqstr = ('T', 5)
            try:
                code = _period_str_to_code(freqstr[0])
                stride = freqstr[1]
            except Exception:
                # Only ordinary errors trigger the swapped-order retry;
                # KeyboardInterrupt / SystemExit must propagate untouched.
                if is_integer(freqstr[1]):
                    # second element really is the stride, so the first
                    # element was simply an invalid frequency string
                    raise
                code = _period_str_to_code(freqstr[1])
                stride = freqstr[0]
            return code, stride

    if is_integer(freqstr):
        return (freqstr, 1)

    base, stride = _base_and_stride(freqstr)
    code = _period_str_to_code(base)

    return code, stride
def _maybe_cast_slice_bound(self, label, side, kind):
    """
    Cast a string slice bound to a Timedelta according to its resolution.

    Parameters
    ----------
    label : object
    side : {'left', 'right'}
    kind : {'ix', 'loc', 'getitem'}

    Returns
    -------
    label : object
        The lower bound for ``side == 'left'``, otherwise the last
        nanosecond covered by the parsed resolution. Non-string labels are
        returned unchanged; integer/float labels that are not timedelta64
        are reported through ``self._invalid_indexer``.
    """
    assert kind in ['ix', 'loc', 'getitem', None]

    if isinstance(label, compat.string_types):
        parsed = Timedelta(label)
        lbound = parsed.round(parsed.resolution)
        if side != 'left':
            return (lbound + to_offset(parsed.resolution) -
                    Timedelta(1, 'ns'))
        return lbound

    is_numeric = is_integer(label) or is_float(label)
    if is_numeric and not is_timedelta64_dtype(label):
        self._invalid_indexer('slice', label)

    return label
def _maybe_cast_slice_bound(self, label, side, kind): """ If label is a string or a datetime, cast it to Period.ordinal according to resolution. Parameters ---------- label : object side : {'left', 'right'} kind : {'ix', 'loc', 'getitem'} Returns ------- bound : Period or object Notes ----- Value of `side` parameter should be validated in caller. """ assert kind in ['ix', 'loc', 'getitem'] if isinstance(label, datetime): return Period(label, freq=self.freq) elif isinstance(label, compat.string_types): try: _, parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) return bounds[0 if side == 'left' else 1] except Exception: raise KeyError(label) elif is_integer(label) or is_float(label): self._invalid_indexer('slice', label) return label
def _value_with_fmt(self, val): """Convert numpy types to Python types for the Excel writers. Parameters ---------- val : object Value to be written into cells Returns ------- Tuple with the first element being the converted value and the second being an optional format """ fmt = None if is_integer(val): val = int(val) elif is_float(val): val = float(val) elif is_bool(val): val = bool(val) elif isinstance(val, datetime): fmt = self.datetime_format elif isinstance(val, date): fmt = self.date_format elif isinstance(val, timedelta): val = val.total_seconds() / float(86400) fmt = '0' else: val = compat.to_str(val) return val, fmt
def _maybe_convert_usecols(usecols): """ Convert `usecols` into a compatible format for parsing in `parsers.py`. Parameters ---------- usecols : object The use-columns object to potentially convert. Returns ------- converted : object The compatible format of `usecols`. """ if usecols is None: return usecols if is_integer(usecols): warnings.warn(("Passing in an integer for `usecols` has been " "deprecated. Please pass in a list of int from " "0 to `usecols` inclusive instead."), FutureWarning, stacklevel=2) return list(range(usecols + 1)) if isinstance(usecols, str): return _range2cols(usecols) return usecols
def get_loc(self, key, method=None, tolerance=None):
    """
    Get integer location for requested label

    The engine is tried first. On a miss, non-integer keys are parsed as a
    time string (when possible), converted to a Period with ``self.freq``
    and looked up by ordinal in the underlying int64 index.

    Returns
    -------
    loc : int

    Raises
    ------
    KeyError
        If the key is a missing integer, cannot be converted to a Period,
        or its ordinal is not found.
    """
    try:
        return self._engine.get_loc(key)
    except KeyError:
        if is_integer(key):
            raise

        try:
            parsed_key, _parsed, _reso = parse_time_string(key, self.freq)
            key = parsed_key
        except TypeError:
            pass

        try:
            key = Period(key, freq=self.freq)
        except ValueError:
            # we cannot construct the Period
            # as we have an invalid type
            raise KeyError(key)

        try:
            if key is tslib.NaT:
                ordinal = tslib.iNaT
            else:
                ordinal = key.ordinal
            if tolerance is not None:
                tolerance = self._convert_tolerance(tolerance)
            return self._int64index.get_loc(ordinal, method, tolerance)
        except KeyError:
            raise KeyError(key)
def _maybe_convert_timedelta(self, other):
    """
    Convert a timedelta-like or integer ``other`` to a multiple of
    ``self.freq``.

    Parameters
    ----------
    other : timedelta, np.timedelta64, DateOffset, np.ndarray or int

    Returns
    -------
    int or np.ndarray
        Number of ``self.freq`` units represented by ``other``; integers
        and integer arrays are returned unchanged.

    Raises
    ------
    IncompatibleFrequency
        If ``other`` is not an exact multiple of ``self.freq`` or has a
        different base frequency.
    """
    if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)):
        offset = frequencies.to_offset(self.freq.rule_code)
        if isinstance(offset, offsets.Tick):
            nanos = tslib._delta_to_nanoseconds(other)
            offset_nanos = tslib._delta_to_nanoseconds(offset)
            if nanos % offset_nanos == 0:
                return nanos // offset_nanos
    elif isinstance(other, offsets.DateOffset):
        freqstr = other.rule_code
        base = frequencies.get_base_alias(freqstr)
        if base == self.freq.rule_code:
            return other.n
        msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr)
        raise IncompatibleFrequency(msg)
    elif isinstance(other, np.ndarray):
        if is_integer_dtype(other):
            return other
        elif is_timedelta64_dtype(other):
            offset = frequencies.to_offset(self.freq)
            if isinstance(offset, offsets.Tick):
                nanos = tslib._delta_to_nanoseconds(other)
                offset_nanos = tslib._delta_to_nanoseconds(offset)
                # every element must be an exact multiple of the freq;
                # ``(nanos % offset_nanos).all() == 0`` only required one
                # of them to be
                if np.all(nanos % offset_nanos == 0):
                    return nanos // offset_nanos
    elif is_integer(other):
        # integer is passed to .shift via
        # _add_datetimelike_methods basically
        # but ufunc may pass integer to _add_delta
        return other
    # raise when input doesn't have freq
    msg = "Input has different freq from PeriodIndex(freq={0})"
    raise IncompatibleFrequency(msg.format(self.freqstr))
def random_state(state=None):
    """
    Helper function for processing random_state arguments.

    Parameters
    ----------
    state : int, np.random.RandomState, None.
        If receives an int, passes to np.random.RandomState() as seed.
        If receives an np.random.RandomState object, just returns object.
        If receives `None`, returns np.random.
        If receives anything else, raises an informative ValueError.
        Default None.

    Returns
    -------
    np.random.RandomState
    """
    if state is None:
        return np.random
    if isinstance(state, np.random.RandomState):
        return state
    if not is_integer(state):
        raise ValueError("random_state must be an integer, a numpy "
                         "RandomState, or None")
    return np.random.RandomState(state)
def __getitem__(self, key): # TODO: Document difference from Series.__getitem__, deprecate, # and remove! if is_integer(key) and key not in self.index: return self._get_val_at(key) else: return super().__getitem__(key)
def __sub__(self, other):
    """
    Subtract ``other`` from this datetime-like index.

    The operand type selects the implementation: timedelta-likes are
    negated and added, datetime-likes go to ``_sub_datelike``, integers
    shift the index and Periods go to ``_sub_period``. Unsupported Index
    operands raise TypeError; anything else returns NotImplemented.
    """
    from pandas.core.index import Index
    from pandas.core.indexes.datetimes import DatetimeIndex
    from pandas.core.indexes.timedeltas import TimedeltaIndex
    from pandas.tseries.offsets import DateOffset

    # Index operands
    if isinstance(other, TimedeltaIndex):
        return self._add_delta(-other)
    if isinstance(self, TimedeltaIndex) and isinstance(other, Index):
        if not isinstance(other, TimedeltaIndex):
            raise TypeError("cannot subtract TimedeltaIndex and {typ}"
                            .format(typ=type(other).__name__))
        return self._add_delta(-other)
    if isinstance(other, DatetimeIndex):
        return self._sub_datelike(other)
    if isinstance(other, Index):
        raise TypeError("cannot subtract {typ1} and {typ2}"
                        .format(typ1=type(self).__name__,
                                typ2=type(other).__name__))

    # scalar operands
    if isinstance(other, (DateOffset, timedelta, np.timedelta64)):
        return self._add_delta(-other)
    if is_integer(other):
        return self.shift(-other)
    if isinstance(other, (datetime, np.datetime64)):
        return self._sub_datelike(other)
    if isinstance(other, Period):
        return self._sub_period(other)

    return NotImplemented  # pragma: no cover
def _convert_scalar_indexer(self, key, kind=None):
    """
    Reject integer/float scalar keys that are not valid for datetime-like
    indexes, then defer to the parent implementation.

    Parameters
    ----------
    key : label of the slice bound
    kind : {'ix', 'loc', 'getitem', 'iloc'} or None
    """
    assert kind in ['ix', 'loc', 'getitem', 'iloc', None]

    if is_scalar(key):
        float_key = is_float(key)
        if kind == 'loc':
            # we don't allow integer/float indexing for loc
            rejected = float_key or is_integer(key)
        else:
            # we don't allow float indexing for ix/getitem
            rejected = float_key and kind in ('ix', 'getitem')
        if rejected:
            self._invalid_indexer('index', key)

    parent = super(DatetimeIndexOpsMixin, self)
    return parent._convert_scalar_indexer(key, kind=kind)
def _simple_new(cls, start, stop=None, step=None, name=None,
                dtype=None, **kwargs):
    """
    Build an instance directly from start/stop/step without validation.

    When ``start`` is not an integer the call is redirected to the
    ``RangeIndex`` constructor, falling back to ``Index`` on TypeError.
    Extra keyword arguments are set as attributes on the new object.
    """
    obj = object.__new__(cls)

    # handle passed None, non-integers
    if start is None and stop is None:
        # empty
        start, stop, step = 0, 0, 1

    if not (start is not None and is_integer(start)):
        try:
            return RangeIndex(start, stop, step, name=name, **kwargs)
        except TypeError:
            return Index(start, stop, step, name=name, **kwargs)

    obj._start = start
    obj._stop = stop or 0
    obj._step = step or 1
    obj.name = name
    for attr, value in compat.iteritems(kwargs):
        setattr(obj, attr, value)

    obj._reset_identity()
    return obj
def delete(self, loc):
    """
    Make a new TimedeltaIndex with passed location(s) deleted.

    Parameters
    ----------
    loc: int, slice or array of ints
        Indicate which sub-arrays to remove.

    Returns
    -------
    new_index : TimedeltaIndex
        Keeps ``self.freq`` when the removal is at either end of the
        index (single position or unit-step slice); otherwise the
        frequency is inferred.
    """
    remaining = np.delete(self.asi8, loc)
    n = len(self)

    keeps_freq = False
    if is_integer(loc):
        keeps_freq = loc in (0, -n, -1, n - 1)
    else:
        if is_list_like(loc):
            loc = lib.maybe_indices_to_slice(
                ensure_int64(np.array(loc)), n)
        if isinstance(loc, slice) and loc.step in (1, None):
            keeps_freq = loc.start in (0, None) or loc.stop in (n, None)

    freq = self.freq if keeps_freq else 'infer'
    return TimedeltaIndex(remaining, name=self.name, freq=freq)
def __add__(self, other):
    """
    Add ``other`` to this datetime-like index.

    The operand type selects the implementation: timedelta-likes and
    offsets go to ``_add_delta``, arrays of offsets to
    ``_add_offset_array``, integers shift the index and datetime-likes go
    to ``_add_datelike``. Returns NotImplemented for Series and for
    unsupported operands.
    """
    from pandas.core.index import Index
    from pandas.core.indexes.timedeltas import TimedeltaIndex
    from pandas.tseries.offsets import DateOffset

    other = lib.item_from_zerodim(other)

    if isinstance(other, ABCSeries):
        return NotImplemented
    if is_timedelta64_dtype(other):
        return self._add_delta(other)
    if isinstance(other, (DateOffset, timedelta)):
        return self._add_delta(other)
    if is_offsetlike(other):
        # Array/Index of DateOffset objects
        return self._add_offset_array(other)
    if isinstance(self, TimedeltaIndex) and isinstance(other, Index):
        if hasattr(other, '_add_delta'):
            return other._add_delta(self)
        raise TypeError("cannot add TimedeltaIndex and {typ}"
                        .format(typ=type(other)))
    if is_integer(other):
        return self.shift(other)
    if isinstance(other, (datetime, np.datetime64)):
        return self._add_datelike(other)
    if isinstance(other, Index):
        return self._add_datelike(other)
    if is_integer_dtype(other) and self.freq is None:
        # GH#19123
        raise NullFrequencyError("Cannot shift with no freq")

    return NotImplemented  # pragma: no cover
def get_loc(self, key, method=None, tolerance=None):
    """
    Get the integer location of ``key``.

    Integer keys with no ``method``/``tolerance`` are looked up in the
    underlying ``range`` object; every other request goes to the parent
    implementation.

    Raises
    ------
    KeyError
        If an integer key is not a member of the range.
    """
    fast_path = is_integer(key) and method is None and tolerance is None
    if not fast_path:
        return super().get_loc(key, method=method, tolerance=tolerance)

    new_key = int(key)
    try:
        return self._range.index(new_key)
    except ValueError:
        raise KeyError(key)
def __getitem__(self, item): if is_integer(item): if self._mask[item]: return self.dtype.na_value return self._data[item] return type(self)(self._data[item], mask=self._mask[item], dtype=self.dtype)
def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) if is_integer(key) or is_float(key) or key is NaT: self._invalid_indexer('slice', key) loc = self._partial_td_slice(key, freq, use_lhs=use_lhs, use_rhs=use_rhs) return loc
def __sub__(self, other):
    """
    Subtract ``other`` from this datetime-like index.

    Scalars are handled first, then array-likes, each by the matching
    private helper. A non-Index result is wrapped in ``Index`` and named
    via ``ops.get_op_result_name``. Returns NotImplemented for
    Series/DataFrame, for unsupported operands, or when a helper does.

    Raises
    ------
    TypeError
        For Index operands with no defined subtraction and for
        float-dtype operands.
    NullFrequencyError
        For integer-dtype operands when ``self.freq`` is None.
    """
    from pandas import Index

    other = lib.item_from_zerodim(other)
    if isinstance(other, (ABCSeries, ABCDataFrame)):
        return NotImplemented

    # scalar others
    elif other is NaT:
        out = self._sub_nat()
    elif isinstance(other, (Tick, timedelta, np.timedelta64)):
        out = self._add_delta(-other)
    elif isinstance(other, DateOffset):
        # specifically _not_ a Tick
        out = self._add_offset(-other)
    elif isinstance(other, (datetime, np.datetime64)):
        out = self._sub_datelike(other)
    elif is_integer(other):
        # This check must come after the check for np.timedelta64
        # as is_integer returns True for these
        out = self.shift(-other)
    elif isinstance(other, Period):
        out = self._sub_period(other)

    # array-like others
    elif is_timedelta64_dtype(other):
        # TimedeltaIndex, ndarray[timedelta64]
        out = self._add_delta(-other)
    elif is_offsetlike(other):
        # Array/Index of DateOffset objects
        out = self._addsub_offset_array(other, operator.sub)
    elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
        # DatetimeIndex, ndarray[datetime64]
        out = self._sub_datelike(other)
    elif isinstance(other, Index):
        raise TypeError("cannot subtract {cls} and {typ}"
                        .format(cls=type(self).__name__,
                                typ=type(other).__name__))
    elif is_integer_dtype(other) and self.freq is None:
        # GH#19123
        raise NullFrequencyError("Cannot shift with no freq")
    elif is_float_dtype(other):
        # Explicitly catch invalid dtypes
        raise TypeError("cannot subtract {dtype}-dtype from {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
    else:  # pragma: no cover
        return NotImplemented

    if out is NotImplemented:
        return NotImplemented
    if not isinstance(out, Index):
        # Index.__new__ will choose appropriate subclass for dtype
        out = Index(out)
    out.name = ops.get_op_result_name(self, other)
    return out
def convert(value, unit, axis):
    """
    Convert time-like values with ``time2num``.

    Strings, ``datetime.time`` objects, integers and floats are converted
    directly, an Index is mapped, and lists/tuples/ndarrays are converted
    element by element into a list. Anything else is returned unchanged.
    ``unit`` and ``axis`` are not used.
    """
    is_scalar_like = (isinstance(value, (str, pydt.time)) or
                      is_integer(value) or is_float(value))
    if is_scalar_like:
        return time2num(value)

    if isinstance(value, Index):
        return value.map(time2num)

    if isinstance(value, (list, tuple, np.ndarray)):
        return [time2num(item) for item in value]

    return value
def _maybe_cast_indexed(self, key): """ we need to cast the key, which could be a scalar or an array-like to the type of our subtype """ if isinstance(key, IntervalIndex): return key subtype = self.dtype.subtype if is_float_dtype(subtype): if is_integer(key): key = float(key) elif isinstance(key, (np.ndarray, Index)): key = key.astype('float64') elif is_integer_dtype(subtype): if is_integer(key): key = int(key) return key
def integer_arithmetic_method(self, other):
    """
    Apply the arithmetic ``op`` from the enclosing scope to this masked
    integer array and ``other``.

    ``other`` may be an IntegerArray, a zero-dimensional object, a
    list-like or a numeric scalar. Returns NotImplemented for Series/Index
    operands; for ``divmod`` a tuple of two results is returned.

    Raises
    ------
    NotImplementedError
        If ``other`` has more than one dimension.
    TypeError
        If ``other`` is not numeric.
    """

    op_name = op.__name__
    # mask contributed by ``other``; stays None unless it is an IntegerArray
    mask = None

    if isinstance(other, (ABCSeries, ABCIndexClass)):
        # Rely on pandas to unbox and dispatch to us.
        return NotImplemented

    if getattr(other, 'ndim', 0) > 1:
        raise NotImplementedError(
            "can only perform ops with 1-d structures")

    if isinstance(other, IntegerArray):
        # split the operand into raw values and its own mask
        other, mask = other._data, other._mask

    elif getattr(other, 'ndim', None) == 0:
        # zero-dimensional operand: unwrap to a scalar
        other = other.item()

    elif is_list_like(other):
        other = np.asarray(other)
        if not other.ndim:
            other = other.item()
        elif other.ndim == 1:
            if not (is_float_dtype(other) or is_integer_dtype(other)):
                raise TypeError(
                    "can only perform ops with numeric values")
    else:
        if not (is_float(other) or is_integer(other)):
            raise TypeError("can only perform ops with numeric values")

    # nans propagate
    if mask is None:
        mask = self._mask
    else:
        mask = self._mask | mask

    # 1 ** np.nan is 1. So we have to unmask those.
    if op_name == 'pow':
        mask = np.where(self == 1, False, mask)

    elif op_name == 'rpow':
        mask = np.where(other == 1, False, mask)

    # numpy warnings are silenced while the raw op is applied
    with np.errstate(all='ignore'):
        result = op(self._data, other)

    # divmod returns a tuple
    if op_name == 'divmod':
        div, mod = result
        return (self._maybe_mask_result(div, mask, other, 'floordiv'),
                self._maybe_mask_result(mod, mask, other, 'mod'))

    return self._maybe_mask_result(result, mask, other, op_name)
def take(self, indices, axis=0, allow_fill=True,
         fill_value=None, **kwargs):
    """
    Sparse-compatible version of ndarray.take

    Parameters
    ----------
    indices : int or array-like of int
        An integer returns the scalar ``self[indices]``.
    axis : int, default 0
        Must be 0.
    allow_fill : bool, default True
    fill_value : object, default None
        When ``allow_fill`` is True and ``fill_value`` is not None, -1 is
        accepted as an index and negative indices are not wrapped.

    Returns
    -------
    taken : scalar, or a new object built with ``self._simple_new``

    Raises
    ------
    ValueError
        If ``axis`` is non-zero, or an index below -1 is given in fill
        mode.
    IndexError
        If an index is out of bounds.
    """
    nv.validate_take(tuple(), kwargs)

    if axis:
        raise ValueError("axis must be 0, input was {axis}"
                         .format(axis=axis))

    if is_integer(indices):
        # return scalar
        return self[indices]

    indices = _ensure_platform_int(indices)
    n = len(self)
    fill_mode = bool(allow_fill and fill_value is not None)

    if fill_mode:
        # allow -1 to indicate self.fill_value,
        # self.fill_value may not be NaN
        if (indices < -1).any():
            msg = ('When allow_fill=True and fill_value is not None, '
                   'all indices must be >= -1')
            raise ValueError(msg)
        elif (n <= indices).any():
            msg = 'index is out of bounds for size {size}'.format(size=n)
            raise IndexError(msg)
    else:
        if ((indices < -n) | (n <= indices)).any():
            msg = 'index is out of bounds for size {size}'.format(size=n)
            raise IndexError(msg)

    indices = indices.astype(np.int32)
    if not fill_mode:
        # wrap negative positions on a private copy
        indices = indices.copy()
        indices[indices < 0] += n

    locs = self.sp_index.lookup_array(indices)
    found = locs != -1
    if found.any():
        indexer = np.arange(len(locs), dtype=np.int32)[found]
        new_values = self.sp_values.take(locs[found])
    else:
        indexer = np.empty(shape=(0, ), dtype=np.int32)
        new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

    sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
    return self._simple_new(new_values, sp_index, self.fill_value)
def get_datevalue(date, freq):
    """
    Convert ``date`` to a Period ordinal for frequency ``freq``.

    Parameters
    ----------
    date : Period, str, datetime-like, number, size-1 ndarray/Index or None
    freq : frequency passed to ``Period`` / ``Period.asfreq``

    Returns
    -------
    The ordinal for Period, string and datetime-like input; numbers and
    size-1 arrays are returned unchanged; ``None`` stays ``None``.

    Raises
    ------
    ValueError
        If ``date`` is none of the recognized types.
    """
    if isinstance(date, Period):
        return date.asfreq(freq).ordinal
    elif isinstance(date, (compat.string_types, datetime,
                           pydt.date, pydt.time)):
        return Period(date, freq).ordinal
    elif (is_integer(date) or is_float(date) or
          (isinstance(date, (np.ndarray, Index)) and (date.size == 1))):
        return date
    elif date is None:
        return None
    # str.format is used instead of ``%`` so that a tuple ``date`` is
    # rendered in the message rather than unpacked as format arguments
    # (which raised TypeError instead of this ValueError)
    raise ValueError("Unrecognizable date '{date}'".format(date=date))
def get_datevalue(date, freq):
    """
    Convert ``date`` to a Period ordinal for frequency ``freq``.

    Period, string and datetime-like input is converted to an ordinal;
    numbers and size-1 ndarray/Index objects are returned unchanged;
    ``None`` stays ``None``.

    Raises
    ------
    ValueError
        If ``date`` is none of the recognized types.
    """
    if date is None:
        return None

    if isinstance(date, Period):
        return date.asfreq(freq).ordinal

    datelike = (str, datetime, pydt.date, pydt.time, np.datetime64)
    if isinstance(date, datelike):
        return Period(date, freq).ordinal

    if is_integer(date) or is_float(date):
        return date
    if isinstance(date, (np.ndarray, Index)) and date.size == 1:
        return date

    raise ValueError("Unrecognizable date '{date}'".format(date=date))
def _get_ind(self, y): if self.ind is None: # np.nanmax() and np.nanmin() ignores the missing values sample_range = np.nanmax(y) - np.nanmin(y) ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, np.nanmax(y) + 0.5 * sample_range, 1000) elif is_integer(self.ind): sample_range = np.nanmax(y) - np.nanmin(y) ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, np.nanmax(y) + 0.5 * sample_range, self.ind) else: ind = self.ind return ind
def __add__(self, other):
    """
    Add ``other`` to this datetime-like index.

    Scalars are handled first, then array-likes, each by the matching
    private helper. A non-Index result is wrapped in ``Index`` and named
    via ``ops.get_op_result_name``; the datetime64 array branch returns
    the helper's result directly. Returns NotImplemented for
    Series/DataFrame, categorical or unsupported operands, or when a
    helper does.

    Raises
    ------
    TypeError
        For float-dtype and period-dtype operands.
    """
    other = lib.item_from_zerodim(other)
    if isinstance(other, (ABCSeries, ABCDataFrame)):
        return NotImplemented

    # scalar others
    elif other is NaT:
        out = self._add_nat()
    elif isinstance(other, (Tick, timedelta, np.timedelta64)):
        out = self._add_delta(other)
    elif isinstance(other, DateOffset):
        # specifically _not_ a Tick
        out = self._add_offset(other)
    elif isinstance(other, (datetime, np.datetime64)):
        out = self._add_datelike(other)
    elif is_integer(other):
        # This check must come after the check for np.timedelta64
        # as is_integer returns True for these
        out = self.shift(other)

    # array-like others
    elif is_timedelta64_dtype(other):
        # TimedeltaIndex, ndarray[timedelta64]
        out = self._add_delta(other)
    elif is_offsetlike(other):
        # Array/Index of DateOffset objects
        out = self._addsub_offset_array(other, operator.add)
    elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
        # DatetimeIndex, ndarray[datetime64]
        return self._add_datelike(other)
    elif is_integer_dtype(other):
        out = self._addsub_int_array(other, operator.add)
    elif is_float_dtype(other) or is_period_dtype(other):
        # Explicitly catch invalid dtypes
        raise TypeError("cannot add {dtype}-dtype to {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
    elif is_categorical_dtype(other):
        # Categorical op will raise; defer explicitly
        return NotImplemented
    else:  # pragma: no cover
        return NotImplemented

    if out is NotImplemented:
        return NotImplemented
    if not isinstance(out, Index):
        # Index.__new__ will choose appropriate subclass for dtype
        out = Index(out)
    out.name = ops.get_op_result_name(self, other)
    return out
def test_is_integer(self): assert is_integer(1) assert is_integer(np.int64(1)) assert not is_integer(True) assert not is_integer(1.1) assert not is_integer(1 + 3j) assert not is_integer(np.bool(False)) assert not is_integer(np.bool_(False)) assert not is_integer(np.float64(1.1)) assert not is_integer(np.complex128(1 + 3j)) assert not is_integer(np.nan) assert not is_integer(None) assert not is_integer('x') assert not is_integer(datetime(2011, 1, 1)) assert not is_integer(np.datetime64('2011-01-01')) assert not is_integer(Timestamp('2011-01-01')) assert not is_integer(Timestamp('2011-01-01', tz='US/Eastern')) assert not is_integer(timedelta(1000)) assert not is_integer(Timedelta('1 days')) # questionable assert is_integer(np.timedelta64(1, 'D'))
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                utc=None, box=True, format=None, exact=True,
                unit=None, infer_datetime_format=False, origin='unix'):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series

        .. versionadded:: 0.18.1

           or DataFrame/dict-like

    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaT
        - If 'ignore', then invalid parsing will return the input
    dayfirst : boolean, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    yearfirst : boolean, default False
        Specify a date parse order if `arg` is str or its list-likes.

        - If True parses dates with the year first, eg 10/11/12 is parsed as
          2010-11-12.
        - If both dayfirst and yearfirst are True, yearfirst is preceded (same
          as dateutil).

        Warning: yearfirst=True is not strict, but will prefer to parse
        with year first (this is a known bug, based on dateutil behavior).

        .. versionadded:: 0.16.1

    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well).
    box : boolean, default True

        - If True returns a DatetimeIndex
        - If False returns ndarray of values.
    format : string, default None
        strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
    exact : boolean, True by default

        - If True, require an exact format match.
        - If False, allow the format to match anywhere in the target string.

    unit : string, default 'ns'
        unit of the arg (D,s,ms,us,ns) denote the unit, which is an
        integer or float number. This will be based off the origin.
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : boolean, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.
    origin : scalar, default is 'unix'
        Define the reference date. The numeric values would be parsed as number
        of units (defined by `unit`) since this reference date.

        - If 'unix' (or POSIX) time; origin is set to 1970-01-01.
        - If 'julian', unit must be 'D', and origin is set to beginning of
          Julian Calendar. Julian day number 0 is assigned to the day starting
          at noon on January 1, 4713 BC.
        - If Timestamp convertible, origin is set to Timestamp identified by
          origin.

        .. versionadded:: 0.20.0

    Returns
    -------
    ret : datetime if parsing succeeded.
        Return type depends on input:

        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

        In case when it is not possible to return designated types (e.g. when
        any element of input is before Timestamp.min or after Timestamp.max)
        return will have datetime.datetime type (or corresponding
        array/Series).

    Examples
    --------

    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns']) or plurals of the same

    >>> df = pd.DataFrame({'year': [2015, 2016],
                           'month': [2, 3],
                           'day': [4, 5]})
    >>> pd.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]

    If a date does not meet the `timestamp limitations
    <http://pandas.pydata.org/pandas-docs/stable/timeseries.html
    #timeseries-timestamp-limits>`_, passing errors='ignore'
    will return the original input instead of raising any exception.

    Passing errors='coerce' will force an out-of-bounds date to NaT,
    in addition to forcing non-dates (or non-parseable dates) to NaT.

    >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
    datetime.datetime(1300, 1, 1, 0, 0)
    >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
    NaT

    Passing infer_datetime_format=True can often-times speedup a parsing
    if its not an ISO8601 format exactly, but in a regular format.

    >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000)

    >>> s.head()
    0    3/11/2000
    1    3/12/2000
    2    3/13/2000
    3    3/11/2000
    4    3/12/2000
    dtype: object

    >>> %timeit pd.to_datetime(s,infer_datetime_format=True)
    100 loops, best of 3: 10.4 ms per loop

    >>> %timeit pd.to_datetime(s,infer_datetime_format=False)
    1 loop, best of 3: 471 ms per loop

    Using a unix epoch time

    >>> pd.to_datetime(1490195805, unit='s')
    Timestamp('2017-03-22 15:16:45')
    >>> pd.to_datetime(1490195805433502912, unit='ns')
    Timestamp('2017-03-22 15:16:45.433502912')

    .. warning:: For float arg, precision rounding might happen. To prevent
        unexpected behavior use a fixed-width exact type.

    Using a non-unix epoch origin

    >>> pd.to_datetime([1, 2, 3], unit='D',
                       origin=pd.Timestamp('1960-01-01'))
    0    1960-01-02
    1    1960-01-03
    2    1960-01-04

    See also
    --------
    pandas.DataFrame.astype : Cast argument to a specified dtype.
    pandas.to_timedelta : Convert argument to timedelta.
    """
    from pandas.core.indexes.datetimes import DatetimeIndex

    tz = 'utc' if utc else None

    def _convert_listlike(arg, box, format, name=None, tz=tz):
        # Convert a list-like ``arg``; ``unit``, ``errors`` and the parsing
        # flags are read from the enclosing call.

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # these are shortcutable
        if is_datetime64tz_dtype(arg):
            if not isinstance(arg, DatetimeIndex):
                return DatetimeIndex(arg, tz=tz, name=name)
            if utc:
                arg = arg.tz_convert(None).tz_localize('UTC')
            return arg

        elif is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz=tz, name=name)
                except ValueError:
                    pass

            return arg

        elif unit is not None:
            if format is not None:
                raise ValueError("cannot specify both format and unit")
            arg = getattr(arg, 'values', arg)
            result = tslib.array_with_unit_to_datetime(arg, unit,
                                                       errors=errors)
            if box:
                if errors == 'ignore':
                    from pandas import Index
                    return Index(result)

                return DatetimeIndex(result, tz=tz, name=name)
            return result
        elif getattr(arg, 'ndim', 1) > 1:
            raise TypeError('arg must be a string, datetime, list, tuple, '
                            '1-d array, or Series')

        arg = _ensure_object(arg)
        require_iso8601 = False

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg,
                                                      dayfirst=dayfirst)

        if format is not None:
            # There is a special fast-path for iso8601 formatted
            # datetime strings, so in those cases don't use the inferred
            # format because this path makes process slower in this
            # special case
            format_is_iso8601 = _format_is_iso(format)
            if format_is_iso8601:
                require_iso8601 = not infer_datetime_format
                format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg, errors=errors)
                    except Exception:
                        # ordinary failures are reported as ValueError;
                        # KeyboardInterrupt / SystemExit now propagate
                        raise ValueError("cannot convert the input to "
                                         "'%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(arg, format,
                                                      exact=exact,
                                                      errors=errors)
                    except tslib.OutOfBoundsDatetime:
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # if format was inferred, try falling back
                        # to array_to_datetime - terminate here
                        # for specified formats
                        if not infer_datetime_format:
                            if errors == 'raise':
                                raise
                            result = arg

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(
                    arg,
                    errors=errors,
                    utc=utc,
                    dayfirst=dayfirst,
                    yearfirst=yearfirst,
                    require_iso8601=require_iso8601
                )

            if is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz=tz, name=name)
            return result

        except ValueError as e:
            # last resort: treat ``arg`` as an array of datetime objects
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return None

    # handle origin
    if origin == 'julian':

        original = arg
        j0 = tslib.Timestamp(0).to_julian_date()
        if unit != 'D':
            raise ValueError("unit must be 'D' for origin='julian'")
        try:
            arg = arg - j0
        except Exception:
            # ordinary failures are reported as ValueError;
            # KeyboardInterrupt / SystemExit now propagate
            raise ValueError("incompatible 'arg' type for given "
                             "'origin'='julian'")

        # preemptively check this for a nice range
        j_max = tslib.Timestamp.max.to_julian_date() - j0
        j_min = tslib.Timestamp.min.to_julian_date() - j0
        if np.any(arg > j_max) or np.any(arg < j_min):
            raise tslib.OutOfBoundsDatetime(
                "{original} is Out of Bounds for "
                "origin='julian'".format(original=original))

    elif origin not in ['unix', 'julian']:

        # arg must be a numeric
        original = arg
        if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or
                is_numeric_dtype(np.asarray(arg))):
            raise ValueError(
                "'{arg}' is not compatible with origin='{origin}'; "
                "it must be numeric with a unit specified ".format(
                    arg=arg,
                    origin=origin))

        # we are going to offset back to unix / epoch time
        try:
            offset = tslib.Timestamp(origin)
        except tslib.OutOfBoundsDatetime:
            raise tslib.OutOfBoundsDatetime(
                "origin {origin} is Out of Bounds".format(origin=origin))
        except ValueError:
            raise ValueError("origin {origin} cannot be converted "
                             "to a Timestamp".format(origin=origin))

        if offset.tz is not None:
            raise ValueError(
                "origin offset {} must be tz-naive".format(offset))
        offset -= tslib.Timestamp(0)

        # convert the offset to the unit of the arg
        # this should be lossless in terms of precision
        offset = offset // tslib.Timedelta(1, unit=unit)

        # scalars & ndarray-like can handle the addition
        if is_list_like(arg) and not isinstance(
                arg, (ABCSeries, ABCIndexClass, np.ndarray)):
            arg = np.asarray(arg)
        arg = arg + offset

    if isinstance(arg, tslib.Timestamp):
        result = arg
    elif isinstance(arg, ABCSeries):
        from pandas import Series
        values = _convert_listlike(arg._values, True, format)
        result = Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (ABCDataFrame, MutableMapping)):
        result = _assemble_from_unit_mappings(arg, errors=errors)
    elif isinstance(arg, ABCIndexClass):
        result = _convert_listlike(arg, box, format, name=arg.name)
    elif is_list_like(arg):
        result = _convert_listlike(arg, box, format)
    else:
        result = _convert_listlike(np.array([arg]), box, format)[0]

    return result
def integer_arithmetic_method(self, other):
    """
    Apply the enclosing scope's binary ``op`` to this masked integer array.

    ``op`` and ``op_name`` are free variables supplied by the surrounding
    scope; they are not defined in this block.

    Parameters
    ----------
    other : IntegerArray, list-like, int, float or libmissing.NA
        Right-hand operand. List-likes are converted with ``np.asarray`` and
        must be 1-d, the same length as ``self`` and of float/integer dtype.

    Returns
    -------
    The value of ``self._maybe_mask_result`` for the computed data and mask,
    or a 2-tuple of such values when ``op_name == "divmod"``.

    Raises
    ------
    NotImplementedError
        If ``other`` has more than one dimension.
    ValueError
        If a list-like ``other`` differs in length from ``self``.
    TypeError
        If ``other`` is not numeric (and is not ``libmissing.NA``).
    """
    not_1d_msg = "can only perform ops with 1-d structures"
    non_numeric_msg = "can only perform ops with numeric values"

    if getattr(other, "ndim", 0) > 1:
        raise NotImplementedError(not_1d_msg)

    # Split a masked operand into raw values and its mask; validate the rest.
    other_mask = None
    if isinstance(other, IntegerArray):
        other_mask = other._mask
        other = other._data
    elif is_list_like(other):
        other = np.asarray(other)
        if other.ndim > 1:
            raise NotImplementedError(not_1d_msg)
        if len(self) != len(other):
            raise ValueError("Lengths must match")
        if not is_float_dtype(other) and not is_integer_dtype(other):
            raise TypeError(non_numeric_msg)
    elif not (is_float(other) or is_integer(other)
              or other is libmissing.NA):
        raise TypeError(non_numeric_msg)

    other_is_na = other is libmissing.NA

    # Combine the missing-value masks of both operands.
    if other_mask is not None:
        mask = self._mask | other_mask
    else:
        mask = self._mask.copy()
        if other_is_na:
            # A scalar NA operand makes every position missing.
            mask |= True

    # Power operations have results that are known even if one side is NA.
    if op_name == "pow":
        # 1 ** x is 1.
        mask = np.where((self._data == 1) & ~self._mask, False, mask)
        # x ** 0 is 1.
        if other_mask is not None:
            mask = np.where((other == 0) & ~other_mask, False, mask)
        elif not other_is_na:
            mask = np.where(other == 0, False, mask)
    elif op_name == "rpow":
        # 1 ** x is 1.
        if other_mask is not None:
            mask = np.where((other == 1) & ~other_mask, False, mask)
        elif not other_is_na:
            mask = np.where(other == 1, False, mask)
        # x ** 0 is 1.
        mask = np.where((self._data == 0) & ~self._mask, False, mask)

    if other_is_na:
        # Placeholder data of ones; the op itself is not applied to NA.
        result = np.ones_like(self._data)
    else:
        with np.errstate(all="ignore"):
            result = op(self._data, other)

    if op_name != "divmod":
        return self._maybe_mask_result(result, mask, other, op_name)

    # divmod returns a tuple
    quotient, remainder = result
    return (
        self._maybe_mask_result(quotient, mask, other, "floordiv"),
        self._maybe_mask_result(remainder, mask, other, "mod"),
    )
def maybe_promote(dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
    fill_value : scalar, default np.nan

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.

    Raises
    ------
    ValueError
        If ``fill_value`` is not a scalar and ``dtype`` is not object dtype.

    Notes
    -----
    Every branch that replaces ``dtype`` assigns an ``np.dtype`` instance
    (never a bare scalar class such as ``np.float64``), so callers can rely
    on attributes like ``.kind`` on the returned dtype.
    """
    if not is_scalar(fill_value) and not is_object_dtype(dtype):
        # with object dtype there is nothing to promote, and the user can
        #  pass pretty much any weird fill_value they like
        raise ValueError("fill_value must be a scalar")

    # if we passed an array here, determine the fill value by dtype
    if isinstance(fill_value, np.ndarray):
        if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
            fill_value = fill_value.dtype.type("NaT", "ns")
        else:
            # we need to change to object type as our
            # fill_value is of object type
            if fill_value.dtype == np.object_:
                dtype = np.dtype(np.object_)
            fill_value = np.nan

    if dtype == np.object_ or dtype.kind in ["U", "S"]:
        # We treat string-like dtypes as object, and _always_ fill
        #  with np.nan
        fill_value = np.nan
        dtype = np.dtype(np.object_)

    # returns tuple of (dtype, fill_value)
    if issubclass(dtype.type, np.datetime64):
        if isinstance(fill_value, datetime) and fill_value.tzinfo is not None:
            # Trying to insert tzaware into tznaive, have to cast to object
            dtype = np.dtype(np.object_)
        elif is_integer(fill_value) or (is_float(fill_value)
                                        and not isna(fill_value)):
            dtype = np.dtype(np.object_)
        else:
            try:
                fill_value = tslibs.Timestamp(fill_value).to_datetime64()
            except (TypeError, ValueError):
                dtype = np.dtype(np.object_)
    elif issubclass(dtype.type, np.timedelta64):
        if (is_integer(fill_value)
                or (is_float(fill_value) and not np.isnan(fill_value))
                or isinstance(fill_value, str)):
            # TODO: What about str that can be a timedelta?
            dtype = np.dtype(np.object_)
        else:
            try:
                fv = tslibs.Timedelta(fill_value)
            except ValueError:
                dtype = np.dtype(np.object_)
            else:
                if fv is NaT:
                    # NaT has no `to_timedelta64` method
                    fill_value = np.timedelta64("NaT", "ns")
                else:
                    fill_value = fv.to_timedelta64()
    elif is_datetime64tz_dtype(dtype):
        if isna(fill_value):
            fill_value = NaT
        elif not isinstance(fill_value, datetime):
            dtype = np.dtype(np.object_)
        elif fill_value.tzinfo is None:
            dtype = np.dtype(np.object_)
        elif not tz_compare(fill_value.tzinfo, dtype.tz):
            # TODO: sure we want to cast here?
            dtype = np.dtype(np.object_)

    elif is_extension_array_dtype(dtype) and isna(fill_value):
        fill_value = dtype.na_value

    elif is_float(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            dtype = np.dtype(np.float64)

        elif dtype.kind == "f":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.float64 and dtype is np.float32
                dtype = mst

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

    elif is_bool(fill_value):
        if not issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

    elif is_integer(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            if not np.can_cast(fill_value, dtype):
                # upcast to prevent overflow
                mst = np.min_scalar_type(fill_value)
                dtype = np.promote_types(dtype, mst)
                if dtype.kind == "f":
                    # Case where we disagree with numpy
                    dtype = np.dtype(np.object_)

    elif is_complex(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, (np.integer, np.floating)):
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.complex128 and dtype is np.complex64
                dtype = mst

    elif fill_value is None:
        if is_float_dtype(dtype) or is_complex_dtype(dtype):
            fill_value = np.nan
        elif is_integer_dtype(dtype):
            # Use an np.dtype instance (not the np.float64 scalar class) so
            # the returned dtype has the same type as in all other branches.
            dtype = np.dtype(np.float64)
            fill_value = np.nan
        elif is_datetime_or_timedelta_dtype(dtype):
            fill_value = dtype.type("NaT", "ns")
        else:
            dtype = np.dtype(np.object_)
            fill_value = np.nan
    else:
        dtype = np.dtype(np.object_)

    # in case we have a string that looked like a number
    if is_extension_array_dtype(dtype):
        pass
    elif issubclass(np.dtype(dtype).type, (bytes, str)):
        dtype = np.dtype(np.object_)

    fill_value = _ensure_dtype_type(fill_value, dtype)
    return dtype, fill_value
def parse(self, sheet_name=0, header=0, names=None, index_col=None,
          usecols=None, squeeze=False, converters=None, true_values=None,
          false_values=None, skiprows=None, nrows=None, na_values=None,
          parse_dates=False, date_parser=None, thousands=None, comment=None,
          skipfooter=0, convert_float=True, mangle_dupe_cols=True, **kwds):
    """
    Parse specified sheet(s) into a DataFrame

    Equivalent to read_excel(ExcelFile, ...)  See the read_excel
    docstring for more info on accepted parameters.

    The deprecated ``sheetname`` keyword is honoured (with a FutureWarning)
    only while ``sheet_name`` still has its default of 0; supplying both is
    a TypeError. ``chunksize`` is rejected with NotImplementedError. All
    remaining arguments are forwarded to ``self._reader.parse``.
    """
    # Can't use _deprecate_kwarg since sheetname=None has a special meaning
    if 'sheetname' in kwds:
        sheet_name_is_default = is_integer(sheet_name) and sheet_name == 0
        if not sheet_name_is_default:
            raise TypeError("Cannot specify both `sheet_name` "
                            "and `sheetname`. Use just `sheet_name`")
        warnings.warn(
            "The `sheetname` keyword is deprecated, use "
            "`sheet_name` instead", FutureWarning, stacklevel=2)
        sheet_name = kwds.pop("sheetname")

    if 'chunksize' in kwds:
        raise NotImplementedError("chunksize keyword of read_excel "
                                  "is not implemented")

    reader = self._reader
    return reader.parse(
        sheet_name=sheet_name,
        header=header,
        names=names,
        index_col=index_col,
        usecols=usecols,
        squeeze=squeeze,
        converters=converters,
        true_values=true_values,
        false_values=false_values,
        skiprows=skiprows,
        nrows=nrows,
        na_values=na_values,
        parse_dates=parse_dates,
        date_parser=date_parser,
        thousands=thousands,
        comment=comment,
        skipfooter=skipfooter,
        convert_float=convert_float,
        mangle_dupe_cols=mangle_dupe_cols,
        **kwds)
def _get_string_slice(self, key): if is_integer(key) or is_float(key) or key is NaT: self._invalid_indexer('slice', key) loc = self._partial_td_slice(key) return loc
def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    **kwds,
):
    """
    Read one or more sheets and convert each to a DataFrame via TextParser.

    Parameters
    ----------
    sheet_name : str, int, list or None, default 0
        A single sheet (looked up by name when a str, otherwise by index),
        a list of sheets, or None for every name in ``self.sheet_names``.
    header, index_col, skiprows
        Used here to forward-fill MultiIndex header rows and index columns
        in the raw sheet data before it is handed to ``TextParser``.
    verbose : bool, default False
        Print the sheet being read.
    convert_float
        Forwarded to ``self.get_sheet_data``.
    **kwds and all remaining keyword parameters
        Forwarded to ``TextParser``.

    Returns
    -------
    The parsed result for a single requested sheet, or an OrderedDict keyed
    by sheet when ``sheet_name`` is a list or None. Sheets with no data (or
    that raise ``EmptyDataError``) map to an empty DataFrame.
    """
    _validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(OrderedDict.fromkeys(sheets).keys())

    output = OrderedDict()

    for asheetname in sheets:
        if verbose:
            print("Reading sheet {sheet}".format(sheet=asheetname))

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = _maybe_convert_usecols(usecols)

        if not data:
            output[asheetname] = DataFrame()
            continue

        # A one-element header list is treated as a plain header row.
        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = _fill_mi_header(
                    data[row], control_row)

                if index_col is not None:
                    header_name, _ = _pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                # No header row: data starts at the first row.
                # (Previously ``1 + header`` raised TypeError here.)
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def __init__(self, data, x, y, C=None, **kwargs):
    """
    Initialise the parent with ``data``, ``x``, ``y`` and store ``C``.

    An integer ``C`` is translated to the column label at that position
    unless ``self.data.columns.holds_integer()`` is true.
    """
    super().__init__(data, x, y, **kwargs)
    if is_integer(C):
        columns = self.data.columns
        if not columns.holds_integer():
            # Positional reference: swap it for the actual column label.
            C = columns[C]
    self.C = C
def get_loc(self, key, method=None, tolerance=None):
    """
    Get integer location for requested label.

    Parameters
    ----------
    key : Period, NaT, str, or datetime
        String or datetime key must be parsable as Period.
    method, tolerance
        Forwarded to ``Index.get_loc``.

    Returns
    -------
    loc : int or ndarray[int64]

    Raises
    ------
    KeyError
        Key is not present in the index.
    TypeError
        If key is listlike or otherwise not hashable.
    """
    requested = key

    if not is_scalar(key):
        raise InvalidIndexError(key)

    if isinstance(key, str):
        # First attempt: treat the string as a partial-string slice.
        try:
            return self._get_string_slice(key)
        except (TypeError, ValueError):
            pass

        try:
            asdt, reso = parse_time_string(key, self.freq)
        except DateParseError as err:
            # A string with invalid format
            raise KeyError(f"Cannot interpret '{key}' as period") from err

        reso = Resolution.from_attrname(reso)
        grp = reso.freq_group
        freqn = get_freq_group(self.dtype.dtype_code)

        # _get_string_slice will handle cases where grp < freqn
        assert grp >= freqn

        if grp == freqn:
            # Same resolution as the index: look the Period up directly.
            key = Period(asdt, freq=self.freq)
            return self.get_loc(key, method=method, tolerance=tolerance)
        if method is None:
            raise KeyError(key)
        key = asdt

    elif is_integer(key):
        # Period constructor will cast to string, which we dont want
        raise KeyError(key)

    try:
        key = Period(key, freq=self.freq)
    except ValueError as err:
        # we cannot construct the Period
        raise KeyError(requested) from err

    try:
        return Index.get_loc(self, key, method, tolerance)
    except KeyError as err:
        # Report the key as the caller supplied it.
        raise KeyError(requested) from err
def read_excel(io, sheet_name=0, header=0, names=None, index_col=None,
               parse_cols=None, usecols=None, squeeze=False, dtype=None,
               engine=None, converters=None, true_values=None,
               false_values=None, skiprows=None, nrows=None, na_values=None,
               keep_default_na=True, verbose=False, parse_dates=False,
               date_parser=None, thousands=None, comment=None, skip_footer=0,
               skipfooter=0, convert_float=True, mangle_dupe_cols=True,
               **kwds):
    """
    Open ``io`` as an ``ExcelFile`` (unless it already is one) and parse it.

    The deprecated ``sheetname`` keyword replaces ``sheet_name`` (with a
    FutureWarning) while ``sheet_name`` still has its default of 0. A
    ``sheet`` keyword is rejected with TypeError. ``parse_cols`` and
    ``skip_footer`` are accepted but not used in this body.
    """
    # Can't use _deprecate_kwarg since sheetname=None has a special meaning
    legacy_keyword_given = 'sheetname' in kwds
    if legacy_keyword_given and is_integer(sheet_name) and sheet_name == 0:
        warnings.warn(
            "The `sheetname` keyword is deprecated, use "
            "`sheet_name` instead", FutureWarning, stacklevel=2)
        sheet_name = kwds.pop("sheetname")

    if 'sheet' in kwds:
        raise TypeError("read_excel() got an unexpected keyword argument "
                        "`sheet`")

    workbook = io if isinstance(io, ExcelFile) else ExcelFile(io, engine=engine)

    return workbook.parse(
        sheet_name=sheet_name,
        header=header,
        names=names,
        index_col=index_col,
        usecols=usecols,
        squeeze=squeeze,
        dtype=dtype,
        converters=converters,
        true_values=true_values,
        false_values=false_values,
        skiprows=skiprows,
        nrows=nrows,
        na_values=na_values,
        keep_default_na=keep_default_na,
        verbose=verbose,
        parse_dates=parse_dates,
        date_parser=date_parser,
        thousands=thousands,
        comment=comment,
        skipfooter=skipfooter,
        convert_float=convert_float,
        mangle_dupe_cols=mangle_dupe_cols,
        **kwds)
def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other):
    """
    A safe version of putmask that potentially upcasts the result.

    The result is replaced with the first N elements of other,
    where N is the number of True values in mask.
    If the length of other is shorter than N, other will be repeated.

    Parameters
    ----------
    result : ndarray
        The destination array. This will be mutated in-place if no upcasting is
        necessary.
    mask : boolean ndarray
    other : scalar
        The source value.

    Returns
    -------
    result : ndarray
    changed : bool
        Set to true if the result array was upcasted.

    Raises
    ------
    ValueError
        If ``result`` is not an ndarray, or ``other`` is not a scalar.

    Examples
    --------
    >>> result, _ = maybe_upcast_putmask(np.arange(1,6),
    np.array([False, True, False, True, True]), np.arange(21,23))
    >>> result
    array([1, 21, 3, 22, 21])
    """
    # NOTE(review): the summary and example above pass an array as `other`,
    # but the check below raises ValueError whenever `is_scalar(other)` is
    # false, so they look stale -- confirm and update.
    if not isinstance(result, np.ndarray):
        raise ValueError("The result input must be a ndarray.")
    if not is_scalar(other):
        # We _could_ support non-scalar other, but until we have a compelling
        #  use case, we assume away the possibility.
        raise ValueError("other must be a scalar")

    # Nothing to write when the mask selects no positions.
    if mask.any():
        # Two conversions for date-like dtypes that can't be done automatically
        # in np.place:
        #   NaN -> NaT
        #   integer or integer array -> date-like array
        if result.dtype.kind in ["m", "M"]:
            if is_scalar(other):
                if isna(other):
                    other = result.dtype.type("nat")
                elif is_integer(other):
                    other = np.array(other, dtype=result.dtype)
            elif is_integer_dtype(other):
                other = np.array(other, dtype=result.dtype)

        def changeit():
            # Fallback writer: returns (array, changed), where changed is
            # True when a new, upcast array had to be produced.

            # try to directly set by expanding our array to full
            # length of the boolean
            try:
                om = other[mask]
            except (IndexError, TypeError):
                # IndexError occurs in test_upcast when we have a boolean
                #  mask of the wrong shape
                # TypeError occurs in test_upcast when `other` is a bool
                pass
            else:
                om_at = om.astype(result.dtype)
                if (om == om_at).all():
                    # NOTE(review): `result` was checked to be an np.ndarray
                    # above and plain ndarrays have no `.values`; this line
                    # presumably raises AttributeError if reached -- confirm.
                    new_result = result.values.copy()
                    new_result[mask] = om_at
                    result[:] = new_result
                    return result, False

            # we are forced to change the dtype of the result as the input
            # isn't compatible
            r, _ = maybe_upcast(result, fill_value=other, copy=True)
            np.place(r, mask, other)

            return r, True

        # we want to decide whether place will work
        # if we have nans in the False portion of our mask then we need to
        # upcast (possibly), otherwise we DON't want to upcast (e.g. if we
        # have values, say integers, in the success portion then it's ok to not
        # upcast)
        new_dtype, _ = maybe_promote(result.dtype, other)
        if new_dtype != result.dtype:

            # we have a scalar or len 0 ndarray
            # and its nan and we are changing some values
            if is_scalar(other) or (isinstance(other, np.ndarray)
                                    and other.ndim < 1):
                if isna(other):
                    return changeit()

            # we have an ndarray and the masking has nans in it
            else:

                if isna(other).any():
                    return changeit()

        # Try the in-place write first; only upcast when numpy refuses.
        try:
            np.place(result, mask, other)
        except TypeError:
            # e.g. int-dtype result and float-dtype other
            return changeit()

    return result, False
def __init__(self, kwds):
    """
    Populate parser state from the ``kwds`` option mapping and validate
    the ``header`` / ``index_col`` / ``prefix`` combination.

    Some options are removed from ``kwds`` with ``pop`` while others are
    only read with ``get``; ``kwds["usecols"]`` must be present.

    Raises
    ------
    ValueError
        For a non-integer or negative header, for ``usecols`` or ``names``
        combined with a multi-row header, for a non-integer ``index_col``
        with a multi-row header, or for ``prefix`` combined with a header.
    """
    self.names = kwds.get("names")
    self.orig_names: Optional[List] = None
    self.prefix = kwds.pop("prefix", None)

    self.index_col = kwds.get("index_col", None)
    self.unnamed_cols: Set = set()
    self.index_names: Optional[List] = None
    self.col_names = None

    raw_parse_dates = kwds.pop("parse_dates", False)
    self.parse_dates = _validate_parse_dates_arg(raw_parse_dates)
    self.date_parser = kwds.pop("date_parser", None)
    self.dayfirst = kwds.pop("dayfirst", False)
    self.keep_date_col = kwds.pop("keep_date_col", False)

    self.na_values = kwds.get("na_values")
    self.na_fvalues = kwds.get("na_fvalues")
    self.na_filter = kwds.get("na_filter", False)
    self.keep_default_na = kwds.get("keep_default_na", True)

    self.true_values = kwds.get("true_values")
    self.false_values = kwds.get("false_values")
    self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
    self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
    self.cache_dates = kwds.pop("cache_dates", True)

    self._date_conv = _make_date_converter(
        date_parser=self.date_parser,
        dayfirst=self.dayfirst,
        infer_datetime_format=self.infer_datetime_format,
        cache_dates=self.cache_dates,
    )

    # validate header options for mi
    header = kwds.get("header")
    self.header = header
    sequence_types = (list, tuple, np.ndarray)

    if isinstance(header, sequence_types):
        if not all(is_integer(level) for level in header):
            raise ValueError("header must be integer or list of integers")
        if any(level < 0 for level in header):
            raise ValueError(
                "cannot specify multi-index header with negative integers")
        if kwds.get("usecols"):
            raise ValueError(
                "cannot specify usecols when specifying a multi-index header"
            )
        if kwds.get("names"):
            raise ValueError(
                "cannot specify names when specifying a multi-index header"
            )

        # validate index_col that only contains integers
        index_col = self.index_col
        if index_col is not None:
            all_int_sequence = isinstance(index_col, sequence_types) and all(
                is_integer(col) for col in index_col)
            if not all_int_sequence and not is_integer(index_col):
                raise ValueError("index_col must only contain row numbers "
                                 "when specifying a multi-index header")
    elif header is not None:
        # GH 27394
        if self.prefix is not None:
            raise ValueError(
                "Argument prefix must be None if argument header is not None"
            )
        # GH 16338
        if not is_integer(header):
            raise ValueError("header must be integer or list of integers")
        # GH 27779
        if header < 0:
            raise ValueError(
                "Passing negative integer to header is invalid. "
                "For no header, use header=None instead")

    self._name_processed = False

    self._first_chunk = True

    self.usecols, self.usecols_dtype = self._validate_usecols_arg(
        kwds["usecols"])

    self.handles: Optional[IOHandles] = None
def interval_range(
    start=None, end=None, periods=None, freq=None, name=None, closed="right"
):
    """
    Build an IntervalIndex whose intervals all share one fixed length.

    Parameters
    ----------
    start : numeric or datetime-like, default None
        Left bound of the first interval.
    end : numeric or datetime-like, default None
        Right bound of the last interval.
    periods : int, default None
        How many intervals to produce.
    freq : numeric, str, or DateOffset, default None
        Length of every interval. It has to agree with the type of start
        and end, e.g. 2 for numeric, or '5H' for datetime-like. When left
        out it defaults to 1 for numeric and 'D' for datetime-like input.
    name : str, default None
        Name given to the resulting IntervalIndex.
    closed : {'left', 'right', 'both', 'neither'}, default 'right'
        Which side(s) of each interval are closed.

    Returns
    -------
    IntervalIndex

    Raises
    ------
    ValueError
        If not exactly three of start/end/periods/freq are specified, if an
        endpoint is neither numeric nor datetime-like, or if freq cannot be
        converted to a DateOffset.
    TypeError
        If periods is not a number, or start/end/freq are not type
        compatible.

    See Also
    --------
    IntervalIndex : An Index of intervals that are all closed on the same side.

    Notes
    -----
    Exactly three of ``start``, ``end``, ``periods``, and ``freq`` must be
    given. Without ``freq`` the result holds ``periods`` linearly spaced
    elements between ``start`` and ``end``, inclusively.

    To learn more about datetime-like frequency strings, please see `this link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.

    Examples
    --------
    Numeric ``start`` and ``end`` is supported.

    >>> pd.interval_range(start=0, end=5)
    IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
                  closed='right', dtype='interval[int64]')

    Additionally, datetime-like input is also supported.

    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
    ...                   end=pd.Timestamp('2017-01-04'))
    IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03],
                   (2017-01-03, 2017-01-04]],
                  closed='right', dtype='interval[datetime64[ns]]')

    The ``freq`` parameter specifies the frequency between the left and right.
    endpoints of the individual intervals within the ``IntervalIndex``.  For
    numeric ``start`` and ``end``, the frequency must also be numeric.

    >>> pd.interval_range(start=0, periods=4, freq=1.5)
    IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
                  closed='right', dtype='interval[float64]')

    Similarly, for datetime-like ``start`` and ``end``, the frequency must be
    convertible to a DateOffset.

    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
    ...                   periods=3, freq='MS')
    IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01],
                   (2017-03-01, 2017-04-01]],
                  closed='right', dtype='interval[datetime64[ns]]')

    Specify ``start``, ``end``, and ``periods``; the frequency is generated
    automatically (linearly spaced).

    >>> pd.interval_range(start=0, end=6, periods=4)
    IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
              closed='right',
              dtype='interval[float64]')

    The ``closed`` parameter specifies which endpoints of the individual
    intervals within the ``IntervalIndex`` are closed.

    >>> pd.interval_range(end=5, periods=4, closed='both')
    IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]],
                  closed='both', dtype='interval[int64]')
    """
    start = com.maybe_box_datetimelike(start)
    end = com.maybe_box_datetimelike(end)
    # Whichever endpoint is available decides numeric vs datetime-like mode.
    anchor = end if start is None else start

    if freq is None and com.any_none(periods, start, end):
        freq = 1 if is_number(anchor) else "D"

    if com.count_not_none(start, end, periods, freq) != 3:
        raise ValueError(
            "Of the four parameters: start, end, periods, and "
            "freq, exactly three must be specified"
        )

    if not _is_valid_endpoint(start):
        raise ValueError(f"start must be numeric or datetime-like, got {start}")
    if not _is_valid_endpoint(end):
        raise ValueError(f"end must be numeric or datetime-like, got {end}")

    if is_float(periods):
        periods = int(periods)
    elif periods is not None and not is_integer(periods):
        raise TypeError(f"periods must be a number, got {periods}")

    if not (freq is None or is_number(freq)):
        try:
            freq = to_offset(freq)
        except ValueError as err:
            raise ValueError(
                f"freq must be numeric or convertible to DateOffset, got {freq}"
            ) from err

    # verify type compatibility
    compatibility = [
        _is_type_compatible(start, end),
        _is_type_compatible(start, freq),
        _is_type_compatible(end, freq),
    ]
    if not all(compatibility):
        raise TypeError("start, end, freq need to be type compatible")

    # +1 to convert interval count to breaks count (n breaks = n-1 intervals)
    if periods is not None:
        periods = periods + 1

    if not is_number(anchor):
        # delegate to the appropriate range function
        range_func = date_range if isinstance(anchor, Timestamp) else timedelta_range
        breaks = range_func(start=start, end=end, periods=periods, freq=freq)
        return IntervalIndex.from_breaks(breaks, name=name, closed=closed)

    # force consistency between start/end/freq (lower end if freq skips it)
    if com.all_not_none(start, end, freq):
        end -= (end - start) % freq

    # compute the period/start/end if unspecified (at most one)
    if periods is None:
        periods = int((end - start) // freq) + 1
    elif start is None:
        start = end - (periods - 1) * freq
    elif end is None:
        end = start + (periods - 1) * freq

    breaks = np.linspace(start, end, periods)
    if all(is_integer(value) for value in com.not_none(start, end, freq)):
        # np.linspace always produces float output
        breaks = maybe_downcast_to_dtype(breaks, "int64")

    return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
    """Set one or more values inplace.

    Parameters
    ----------
    key : int, ndarray, or slice
        When called from, e.g. ``Series.__setitem__``, ``key`` will be
        one of

        * scalar int
        * ndarray of integers.
        * boolean ndarray
        * slice object

    value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
        value or values to be set of ``key``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a scalar ``key`` is paired with a non-scalar value, if a scalar
        value is neither NA nor ``str``, or if the indexer and the values
        differ in length.
    IndexError
        If a scalar ``key`` is outside ``[-len(self), len(self))``.
    """
    key = check_array_indexer(self, key)

    if is_integer(key):
        key = cast(int, key)

        if not is_scalar(value):
            raise ValueError("Must pass scalars with scalar indexer")
        elif isna(value):
            value = None
        elif not isinstance(value, str):
            raise ValueError("Scalar must be NA or str")

        # Normalize the position before slicing. Without this, key == -1
        # makes ``self._data[(key + 1):]`` equal ``self._data[0:]`` and the
        # whole array gets appended again, while an out-of-range key would
        # silently grow the array instead of failing.
        size = len(self)
        if not -size <= key < size:
            raise IndexError(
                f"index {key} is out of bounds for axis 0 with size {size}"
            )
        if key < 0:
            key += size

        # Slice data and insert in-between
        new_data = [
            *self._data[0:key].chunks,
            pa.array([value], type=pa.string()),
            *self._data[(key + 1):].chunks,
        ]
        self._data = pa.chunked_array(new_data)
    else:
        # Convert to integer indices and iteratively assign.
        # TODO: Make a faster variant of this in Arrow upstream.
        #       This is probably extremely slow.

        # Convert all possible input key types to an array of integers
        if isinstance(key, slice):
            key_array = np.array(range(len(self))[key])
        elif is_bool_dtype(key):
            # TODO(ARROW-9430): Directly support setitem(booleans)
            key_array = np.argwhere(key).flatten()
        else:
            # TODO(ARROW-9431): Directly support setitem(integers)
            key_array = np.asanyarray(key)

        if is_scalar(value):
            value = np.broadcast_to(value, len(key_array))
        else:
            value = np.asarray(value)

        if len(key_array) != len(value):
            raise ValueError("Length of indexer and values mismatch")

        # Each element goes through the scalar branch above.
        for k, v in zip(key_array, value):
            self[k] = v
def __getitem__(
    self: ArrowStringArray, item: PositionalIndexer
) -> ArrowStringArray | ArrowStringScalarOrNAT:
    """Select a subset of self.

    Parameters
    ----------
    item : int, slice, or ndarray
        * int: The position in 'self' to get.
        * slice: A slice object, where 'start', 'stop', and 'step' are
          integers or None
        * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

    Returns
    -------
    item : scalar or ExtensionArray

    Raises
    ------
    IndexError
        For an ndarray that is neither integer nor boolean dtype, or for a
        scalar that is not an integer.

    Notes
    -----
    For scalar ``item``, return a scalar value suitable for the array's
    type. This should be an instance of ``self.dtype.type``.
    For slice ``key``, return an instance of ``ExtensionArray``, even
    if the slice is length 0 or 1.
    For a boolean mask, return an instance of ``ExtensionArray``, filtered
    to the values where ``item`` is True.
    """
    item = check_array_indexer(self, item)
    array_type = type(self)

    # Array indexers are resolved here and never reach the pyarrow lookup.
    if isinstance(item, np.ndarray):
        if not len(item):
            return array_type(pa.chunked_array([], type=pa.string()))
        if is_integer_dtype(item.dtype):
            return self.take(item)
        if is_bool_dtype(item.dtype):
            return array_type(self._data.filter(item))
        raise IndexError("Only integers, slices and integer or "
                         "boolean arrays are valid indices.")

    if isinstance(item, tuple):
        item = unpack_tuple_and_ellipses(item)

    # error: Non-overlapping identity check (left operand type:
    # "Union[Union[int, integer[Any]], Union[slice, List[int],
    # ndarray[Any, Any]]]", right operand type: "ellipsis")
    if item is Ellipsis:  # type: ignore[comparison-overlap]
        # TODO: should be handled by pyarrow?
        item = slice(None)

    if is_scalar(item) and not is_integer(item):
        # e.g. "foo" or 2.5
        # exception message copied from numpy
        raise IndexError(
            r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
            r"(`None`) and integer or boolean arrays are valid indices")

    # We are not an array indexer, so maybe e.g. a slice or integer
    # indexer. We dispatch to pyarrow.
    selected = self._data[item]
    if not isinstance(selected, pa.ChunkedArray):
        return self._as_pandas_scalar(selected)
    return array_type(selected)
def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Split a variable into equal-sized buckets by rank or by sample
    quantiles. For example 1000 values for 10 quantiles would produce a
    Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to return the (bins, labels) or not. Can be useful if bins
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    raw_input = x
    prepared = _preprocess_for_cut(x)
    prepared, dtype = _coerce_to_type(prepared)

    # An integer q means "q equally spaced quantiles"; anything else is
    # used as the quantile list directly.
    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
    edges = algos.quantile(prepared, quantiles)

    codes, edges = _bins_to_cuts(
        prepared,
        edges,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(codes, edges, retbins, dtype, raw_input)
def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
            periods=None, copy=False, name=None,
            closed=None, verify_integrity=True, **kwargs):
    """
    Construct the index from existing data or from start/end/periods/freq.

    Parameters
    ----------
    data : array-like, optional
        Values to build the index from. When None, the index is produced by
        ``cls._generate`` from ``start``/``end``/``periods``/``freq``.
    unit : optional
        Passed to ``to_timedelta`` when converting ``data``.
    freq : DateOffset, str or None
        The string 'infer' requests inference from the data; any other
        non-DateOffset value is passed through ``to_offset``.
    start, end, periods, closed
        Passed to ``cls._generate`` when ``data`` is None.
    copy : bool, default False
    name : object, optional
    verify_integrity : bool, default True
        Check that non-empty data conforms to the passed ``freq``.
    **kwargs
        Accepted but not used in this body.

    Raises
    ------
    TypeError
        If ``periods`` is given and is neither a float nor an integer.
    ValueError
        If both ``data`` and ``freq`` are None, if ``data`` is a scalar, or
        if the data does not conform to the passed ``freq``.
    """
    # Fast path: an existing TimedeltaIndex with nothing to override.
    if isinstance(data, TimedeltaIndex) and freq is None and name is None:
        if copy:
            return data.copy()
        else:
            return data._shallow_copy()

    freq_infer = False
    if not isinstance(freq, DateOffset):

        # if a passed freq is None, don't infer automatically
        if freq != 'infer':
            freq = to_offset(freq)
        else:
            freq_infer = True
            freq = None

    if periods is not None:
        if is_float(periods):
            periods = int(periods)
        elif not is_integer(periods):
            msg = 'periods must be a number, got {periods}'
            raise TypeError(msg.format(periods=periods))

    if data is None and freq is None:
        raise ValueError("Must provide freq argument if no data is "
                         "supplied")

    # No data: generate the values from the range parameters.
    if data is None:
        return cls._generate(start, end, periods, name, freq,
                             closed=closed)

    if unit is not None:
        data = to_timedelta(data, unit=unit, box=False)

    # Only scalars are rejected here; other non-array inputs continue to the
    # conversion below.
    if not isinstance(data, (np.ndarray, Index, ABCSeries)):
        if is_scalar(data):
            raise ValueError('TimedeltaIndex() must be called with a '
                             'collection of some kind, %s was passed'
                             % repr(data))

    # convert if not already
    if getattr(data, 'dtype', None) != _TD_DTYPE:
        data = to_timedelta(data, unit=unit, box=False)
    elif copy:
        data = np.array(data, copy=True)

    # check that we are matching freqs
    if verify_integrity and len(data) > 0:
        if freq is not None and not freq_infer:
            index = cls._simple_new(data, name=name)
            inferred = index.inferred_freq
            if inferred != freq.freqstr:
                # Inferred freq differs: regenerate from the first value
                # with the passed freq and compare the asi8 values.
                on_freq = cls._generate(
                    index[0], None, len(index), name, freq)
                if not np.array_equal(index.asi8, on_freq.asi8):
                    raise ValueError('Inferred frequency {0} from passed '
                                     'timedeltas does not conform to '
                                     'passed frequency {1}'.format(
                                         inferred, freq.freqstr))
            index.freq = freq
            return index

    if freq_infer:
        index = cls._simple_new(data, name=name)
        inferred = index.inferred_freq
        # freq stays unset when nothing could be inferred.
        if inferred:
            index.freq = to_offset(inferred)
        return index

    return cls._simple_new(data, name=name, freq=freq)
def interval_range(start=None, end=None, periods=None, freq=None,
                   name=None, closed='right'):
    """
    Return a fixed frequency IntervalIndex

    Parameters
    ----------
    start : numeric or datetime-like, default None
        Left bound for generating intervals
    end : numeric or datetime-like, default None
        Right bound for generating intervals
    periods : integer, default None
        Number of periods to generate
    freq : numeric, string, or DateOffset, default None
        The length of each interval. Must be consistent with the type of start
        and end, e.g. 2 for numeric, or '5H' for datetime-like.  Default is 1
        for numeric and 'D' (calendar daily) for datetime-like.
    name : string, default None
        Name of the resulting IntervalIndex
    closed : string, default 'right'
        options are: 'left', 'right', 'both', 'neither'

    Notes
    -----
    Of the three parameters: ``start``, ``end``, and ``periods``, exactly two
    must be specified.

    Returns
    -------
    rng : IntervalIndex

    Raises
    ------
    ValueError
        If the number of specified ``start``/``end``/``periods`` arguments is
        not exactly two, if ``start`` or ``end`` is not a valid endpoint, or
        if a non-numeric ``freq`` cannot be converted to a DateOffset.
    TypeError
        If ``periods`` is not a number, or if ``start``, ``end`` and ``freq``
        are not type compatible.

    Examples
    --------
    Numeric ``start`` and  ``end`` is supported.

    >>> pd.interval_range(start=0, end=5)
    IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]]
                  closed='right', dtype='interval[int64]')

    Additionally, datetime-like input is also supported.

    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
    ...                   end=pd.Timestamp('2017-01-04'))
    IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03],
                   (2017-01-03, 2017-01-04]]
                  closed='right', dtype='interval[datetime64[ns]]')

    The ``freq`` parameter specifies the frequency between the left and right
    endpoints of the individual intervals within the ``IntervalIndex``.  For
    numeric ``start`` and ``end``, the frequency must also be numeric.

    >>> pd.interval_range(start=0, periods=4, freq=1.5)
    IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]]
                  closed='right', dtype='interval[float64]')

    Similarly, for datetime-like ``start`` and ``end``, the frequency must be
    convertible to a DateOffset.

    >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
    ...                   periods=3, freq='MS')
    IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01],
                   (2017-03-01, 2017-04-01]]
                  closed='right', dtype='interval[datetime64[ns]]')

    The ``closed`` parameter specifies which endpoints of the individual
    intervals within the ``IntervalIndex`` are closed.

    >>> pd.interval_range(end=5, periods=4, closed='both')
    IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]]
                  closed='both', dtype='interval[int64]')

    See Also
    --------
    IntervalIndex : an Index of intervals that are all closed on the same side.
    """
    if com._count_not_none(start, end, periods) != 2:
        raise ValueError('Of the three parameters: start, end, and periods, '
                         'exactly two must be specified')

    start = com._maybe_box_datetimelike(start)
    end = com._maybe_box_datetimelike(end)
    # the first endpoint that was actually supplied; its type selects the
    # numeric / Timestamp / fallback branch further down
    endpoint = next(com._not_none(start, end))

    if not _is_valid_endpoint(start):
        msg = 'start must be numeric or datetime-like, got {start}'
        raise ValueError(msg.format(start=start))

    if not _is_valid_endpoint(end):
        msg = 'end must be numeric or datetime-like, got {end}'
        raise ValueError(msg.format(end=end))

    if is_float(periods):
        periods = int(periods)
    elif not is_integer(periods) and periods is not None:
        msg = 'periods must be a number, got {periods}'
        raise TypeError(msg.format(periods=periods))

    # NOTE: any falsy freq (None, but also 0) is replaced by the default
    freq = freq or (1 if is_number(endpoint) else 'D')
    if not is_number(freq):
        try:
            freq = to_offset(freq)
        except ValueError:
            raise ValueError('freq must be numeric or convertible to '
                             'DateOffset, got {freq}'.format(freq=freq))

    # verify type compatibility
    if not all([_is_type_compatible(start, end),
                _is_type_compatible(start, freq),
                _is_type_compatible(end, freq)]):
        raise TypeError("start, end, freq need to be type compatible")

    if is_number(endpoint):
        if periods is None:
            periods = int((end - start) // freq)

        if start is None:
            start = end - periods * freq

        # force end to be consistent with freq (lower if freq skips over end)
        end = start + periods * freq

        # end + freq for inclusive endpoint
        breaks = np.arange(start, end + freq, freq)
    elif isinstance(endpoint, Timestamp):
        # add one to account for interval endpoints (n breaks = n-1 intervals)
        if periods is not None:
            periods += 1
        breaks = date_range(start=start, end=end, periods=periods, freq=freq)
    else:
        # add one to account for interval endpoints (n breaks = n-1 intervals)
        if periods is not None:
            periods += 1
        breaks = timedelta_range(start=start, end=end, periods=periods,
                                 freq=freq)

    return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
def infer_dtype_from_scalar(val, pandas_dtype: bool = False):
    """
    Interpret the dtype from a scalar.

    Parameters
    ----------
    val : object
        The scalar (or 0-dimensional ndarray) to inspect.
    pandas_dtype : bool, default False
        whether to infer dtype including pandas extension types.
        If False, scalar belongs to pandas extension types is inferred as
        object

    Returns
    -------
    tuple
        ``(dtype, val)``.  ``val`` may have been converted along the way:
        0-dim arrays are unwrapped with ``.item()``, datetimes/timedeltas
        become their integer ``.value`` and, with ``pandas_dtype=True``,
        Periods become their ordinal.

    Raises
    ------
    ValueError
        If ``val`` is an ndarray with ``ndim != 0``.
    """
    dtype = np.object_

    # a 1-element ndarray
    if isinstance(val, np.ndarray):
        msg = "invalid ndarray passed to infer_dtype_from_scalar"
        if val.ndim != 0:
            raise ValueError(msg)

        dtype = val.dtype
        val = val.item()

    elif isinstance(val, str):

        # If we create an empty array using a string to infer
        # the dtype, NumPy will only allocate one character per entry
        # so this is kind of bad. Alternately we could use np.repeat
        # instead of np.empty (but then you still don't want things
        # coming out as np.str_!

        dtype = np.object_

    elif isinstance(val, (np.datetime64, datetime)):
        val = tslibs.Timestamp(val)
        if val is tslibs.NaT or val.tz is None:
            dtype = np.dtype("M8[ns]")
        else:
            if pandas_dtype:
                dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
            else:
                # return datetimetz as object
                return np.object_, val
        val = val.value

    elif isinstance(val, (np.timedelta64, timedelta)):
        val = tslibs.Timedelta(val).value
        dtype = np.dtype("m8[ns]")

    # bool must be tested before integer
    elif is_bool(val):
        dtype = np.bool_

    elif is_integer(val):
        # keep the exact width of numpy integers; plain ints map to int64
        if isinstance(val, np.integer):
            dtype = type(val)
        else:
            dtype = np.int64

    elif is_float(val):
        # keep the exact width of numpy floats; plain floats map to float64
        if isinstance(val, np.floating):
            dtype = type(val)
        else:
            dtype = np.float64

    elif is_complex(val):
        # np.complex_ was an alias of complex128 and was removed in
        # NumPy 2.0; use the canonical name so this works on every version.
        dtype = np.complex128

    elif pandas_dtype:
        if lib.is_period(val):
            dtype = PeriodDtype(freq=val.freq)
            val = val.ordinal
        elif lib.is_interval(val):
            # subtype comes from the left endpoint only
            subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
            dtype = IntervalDtype(subtype=subtype)

    return dtype, val
def _arith_method(self, other, op):
    """
    Apply a binary arithmetic operation, keeping a start/stop/step result
    when the outcome can still be described that way.

    Parameters
    ----------
    other : Any
        The other operand.
    op : callable that accepts 2 params
        perform the binary op

    Returns
    -------
    ``NotImplemented`` for a TimedeltaIndex operand, the parent class
    result for operations that are delegated, otherwise a new instance of
    ``type(self)`` (cast to float64 when any of start/stop/step is not an
    integer).
    """
    if isinstance(other, ABCTimedeltaIndex):
        # Defer to TimedeltaIndex implementation
        return NotImplemented

    # GH#19333 is_integer evaluated True on timedelta64, so timedelta
    # scalars are caught explicitly; anything else with a timedelta64
    # dtype must be an np.ndarray (GH#22390).
    if (
        isinstance(other, (timedelta, np.timedelta64))
        or is_timedelta64_dtype(other)
    ):
        return super()._arith_method(other, op)

    delegated_ops = (
        operator.pow,
        ops.rpow,
        operator.mod,
        ops.rmod,
        operator.floordiv,
        ops.rfloordiv,
        divmod,
        ops.rdivmod,
    )
    if op in delegated_ops:
        return super()._arith_method(other, op)

    # Only these operations also rescale the step.
    scaling_ops = (operator.mul, ops.rmul, operator.truediv, ops.rtruediv)
    step = op if op in scaling_ops else None

    # TODO: if other is a RangeIndex we may have more efficient options
    right = extract_array(other, extract_numpy=True, extract_range=True)
    left = self

    try:
        if step:
            with np.errstate(all="ignore"):
                new_step = step(left.step, right)

            # a non-integer or zero step is not representable,
            # so bail out to the base implementation
            if not (is_integer(new_step) and new_step):
                raise ValueError
        else:
            new_step = left.step

        with np.errstate(all="ignore"):
            new_start = op(left.start, right)
            new_stop = op(left.stop, right)

        result = type(self)(
            new_start,
            new_stop,
            new_step,
            name=ops.get_op_result_name(self, other),
        )

        # for compat with numpy / Int64Index: float-like descriptors give
        # a float64 result even if the range itself is representable
        if any(not is_integer(x) for x in [new_start, new_stop, new_step]):
            result = result.astype("float64")

        return result

    except (ValueError, TypeError, ZeroDivisionError):
        # Defer to Int64Index implementation
        # test_arithmetic_explicit_conversions
        return super()._arith_method(other, op)
def is_int_or_none(val):
    """Return True when ``val`` is None or passes ``is_integer``."""
    if val is None:
        return True
    return is_integer(val)
def _clean_options(self, options, engine):
    """
    Normalise parser options and settle on the engine that will be used.

    Parameters
    ----------
    options : dict
        Raw parser options; copied, never mutated.
    engine : str
        Requested engine name.

    Returns
    -------
    tuple
        ``(result, engine)`` where ``result`` is the cleaned copy of
        ``options`` and ``engine`` is the possibly downgraded engine name.

    Raises
    ------
    ValueError
        If a fallback to the 'python' engine is needed but the engine was
        explicitly specified, if the fallback would silently drop an option
        set to a non-default value, or if ``index_col`` is True.
    TypeError
        If ``converters`` is given but is not a dict.

    Warns
    -----
    ParserWarning
        When falling back to the 'python' engine.
    FutureWarning
        When a deprecated argument is set to a non-default value.
    """
    result = options.copy()

    fallback_reason = None

    # C engine not supported yet
    if engine == "c":
        if options["skipfooter"] > 0:
            fallback_reason = "the 'c' engine does not support skipfooter"
            engine = "python"

    sep = options["delimiter"]
    delim_whitespace = options["delim_whitespace"]

    if sep is None and not delim_whitespace:
        if engine == "c":
            fallback_reason = (
                "the 'c' engine does not support "
                "sep=None with delim_whitespace=False"
            )
            engine = "python"
    elif sep is not None and len(sep) > 1:
        if engine == "c" and sep == r"\s+":
            result["delim_whitespace"] = True
            del result["delimiter"]
        elif engine not in ("python", "python-fwf"):
            # wait until regex engine integrated
            fallback_reason = (
                "the 'c' engine does not support "
                "regex separators (separators > 1 char and "
                r"different from '\s+' are interpreted as regex)"
            )
            engine = "python"
    elif delim_whitespace:
        if "python" in engine:
            result["delimiter"] = r"\s+"
    elif sep is not None:
        encodeable = True
        encoding = sys.getfilesystemencoding() or "utf-8"
        try:
            if len(sep.encode(encoding)) > 1:
                encodeable = False
        except UnicodeError:
            # str.encode raises UnicodeEncodeError, which the previous
            # ``except UnicodeDecodeError`` never caught; UnicodeError is
            # the common base class of both.
            encodeable = False
        if not encodeable and engine not in ("python", "python-fwf"):
            fallback_reason = (
                f"the separator encoded in {encoding} "
                "is > 1 char long, and the 'c' engine "
                "does not support such separators"
            )
            engine = "python"

    quotechar = options["quotechar"]
    if quotechar is not None and isinstance(quotechar, (str, bytes)):
        if (
            len(quotechar) == 1
            and ord(quotechar) > 127
            and engine not in ("python", "python-fwf")
        ):
            fallback_reason = (
                "ord(quotechar) > 127, meaning the "
                "quotechar is larger than one byte, "
                "and the 'c' engine does not support such quotechars"
            )
            engine = "python"

    # An explicitly requested engine is never downgraded silently.
    if fallback_reason and self._engine_specified:
        raise ValueError(fallback_reason)

    if engine == "c":
        for arg in _c_unsupported:
            del result[arg]

    if "python" in engine:
        for arg in _python_unsupported:
            if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                raise ValueError(
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}, but this causes {repr(arg)} to be "
                    "ignored as it is not supported by the 'python' engine."
                )
            del result[arg]

    if fallback_reason:
        warnings.warn(
            (
                "Falling back to the 'python' engine because "
                f"{fallback_reason}; you can avoid this warning by specifying "
                "engine='python'."
            ),
            ParserWarning,
            stacklevel=5,
        )

    index_col = options["index_col"]
    names = options["names"]
    converters = options["converters"]
    na_values = options["na_values"]
    skiprows = options["skiprows"]

    validate_header_arg(options["header"])

    for arg in _deprecated_args:
        parser_default = _c_parser_defaults[arg]
        depr_default = _deprecated_defaults[arg]
        if result.get(arg, depr_default) != depr_default:
            msg = (
                f"The {arg} argument has been deprecated and will be "
                "removed in a future version.\n\n"
            )
            warnings.warn(msg, FutureWarning, stacklevel=2)
        else:
            result[arg] = parser_default

    if index_col is True:
        raise ValueError("The value of index_col couldn't be 'True'")
    if is_index_col(index_col):
        # normalise a single index column to a one-element list
        if not isinstance(index_col, (list, tuple, np.ndarray)):
            index_col = [index_col]
    result["index_col"] = index_col

    names = list(names) if names is not None else names

    # type conversion-related
    if converters is not None:
        if not isinstance(converters, dict):
            raise TypeError(
                "Type converters must be a dict or subclass, "
                f"input was a {type(converters).__name__}"
            )
    else:
        converters = {}

    # Converting values to NA
    keep_default_na = options["keep_default_na"]
    na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

    # handle skiprows; this is internally handled by the
    # c-engine, so only need for python parsers
    if engine != "c":
        if is_integer(skiprows):
            skiprows = list(range(skiprows))
        if skiprows is None:
            skiprows = set()
        elif not callable(skiprows):
            skiprows = set(skiprows)

    # put stuff back
    result["names"] = names
    result["converters"] = converters
    result["na_values"] = na_values
    result["na_fvalues"] = na_fvalues
    result["skiprows"] = skiprows

    return result, engine
def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
            periods=None, copy=False, name=None, tz=None, dtype=None,
            **kwargs):
    """
    Construct the index from ordinals, a generated range, or existing data.

    Parameters
    ----------
    data : array-like, optional
        Values to build the index from.
    ordinal : array-like of int, optional
        Used only when ``data`` is None.
    freq : optional
        Frequency; converted with ``Period._maybe_convert_freq`` when truthy.
    start, end : optional
        Forwarded to ``cls._generate_range`` when ``data`` and ``ordinal``
        are both None.
    periods : int or float, optional
        Floats are truncated with ``int``; any other non-integer raises.
    copy : bool, default False
        Not referenced in this body.
    name : object, optional
        Defaults to ``data.name`` when ``data`` has one.
    tz : optional
        Forwarded to ``dt64arr_to_periodarr`` for datetime64 input.
    dtype : optional
        Must resolve to a period dtype; supplies ``freq`` when none is given.
    **kwargs
        Forwarded to ``cls._generate_range``.

    Raises
    ------
    TypeError
        If ``periods`` is not a number, or the data is inferred as floating.
    ValueError
        If ``dtype`` is not a period dtype.
    IncompatibleFrequency
        If ``freq`` and ``dtype.freq`` disagree.
    """

    if periods is not None:
        if is_float(periods):
            periods = int(periods)
        elif not is_integer(periods):
            msg = 'periods must be a number, got {periods}'
            raise TypeError(msg.format(periods=periods))

    if name is None and hasattr(data, 'name'):
        name = data.name

    if dtype is not None:
        dtype = pandas_dtype(dtype)
        if not is_period_dtype(dtype):
            raise ValueError('dtype must be PeriodDtype')
        if freq is None:
            freq = dtype.freq
        elif freq != dtype.freq:
            msg = 'specified freq and dtype are different'
            raise IncompatibleFrequency(msg)

    # coerce freq to freq object, otherwise it can be coerced elementwise
    # which is slow
    if freq:
        freq = Period._maybe_convert_freq(freq)

    if data is None:
        if ordinal is not None:
            data = np.asarray(ordinal, dtype=np.int64)
        else:
            data, freq = cls._generate_range(start, end, periods,
                                             freq, kwargs)
        return cls._from_ordinals(data, name=name, freq=freq)

    if isinstance(data, PeriodIndex):
        if freq is None or freq == data.freq:  # no freq change
            freq = data.freq
            data = data._values
        else:
            # convert the stored values from the source freq to the
            # requested one
            base1, _ = _gfc(data.freq)
            base2, _ = _gfc(freq)
            data = period.period_asfreq_arr(data._values,
                                            base1, base2, 1)
        return cls._simple_new(data, name=name, freq=freq)

    # not array / index
    if not isinstance(data, (np.ndarray, PeriodIndex,
                             DatetimeIndex, Int64Index)):
        if is_scalar(data) or isinstance(data, Period):
            # presumably raises for scalar input -- confirm against
            # _scalar_data_error, which is defined elsewhere
            cls._scalar_data_error(data)

        # other iterable of some kind
        if not isinstance(data, (list, tuple)):
            data = list(data)

        data = np.asarray(data)

    # datetime other than period
    if is_datetime64_dtype(data.dtype):
        data = dt64arr_to_periodarr(data, freq, tz)
        return cls._from_ordinals(data, name=name, freq=freq)

    # check not floats
    if infer_dtype(data) == 'floating' and len(data) > 0:
        raise TypeError("PeriodIndex does not allow "
                        "floating point in construction")

    # anything else, likely an array of strings or periods
    data = _ensure_object(data)
    # fall back to the frequency extracted from the data when none is set
    freq = freq or period.extract_freq(data)
    data = period.extract_ordinals(data, freq)
    return cls._from_ordinals(data, name=name, freq=freq)
def test_is_integer(self): assert is_integer(1) assert is_integer(np.int64(1)) assert not is_integer(True) assert not is_integer(1.1) assert not is_integer(1 + 3j) assert not is_integer(np.bool(False)) assert not is_integer(np.bool_(False)) assert not is_integer(np.float64(1.1)) assert not is_integer(np.complex128(1 + 3j)) assert not is_integer(np.nan) assert not is_integer(None) assert not is_integer("x") assert not is_integer(datetime(2011, 1, 1)) assert not is_integer(np.datetime64("2011-01-01")) assert not is_integer(Timestamp("2011-01-01")) assert not is_integer(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_integer(timedelta(1000)) assert not is_integer(Timedelta("1 days")) assert not is_integer(np.timedelta64(1, "D"))
def calculate_center_offset(window):
    """
    Return the centering offset ``int((size - 1) / 2.0)`` for a window.

    ``window`` is either the integer size itself or a sized object, in
    which case its length is used.
    """
    size = window if is_integer(window) else len(window)
    return int((size - 1) / 2.0)
def get_loc(self, key, method=None, tolerance=None):
    """
    Get integer location for requested label.

    Parameters
    ----------
    key : Period, NaT, str, or datetime
        String or datetime key must be parsable as Period.
    method : optional
        Forwarded unchanged to ``Index.get_loc``.
    tolerance : optional
        Forwarded unchanged to ``Index.get_loc``.

    Returns
    -------
    loc : int or ndarray[int64]

    Raises
    ------
    KeyError
        Key is not present in the index.
    TypeError
        If key is listlike or otherwise not hashable.
    """
    # Keep the caller's key so that every KeyError reports what was asked
    # for rather than an intermediate conversion.
    orig_key = key

    self._check_indexing_error(key)
    if isinstance(key, str):

        # First try the string-slice helper; if it raises TypeError or
        # ValueError, fall through to parsing the string.
        try:
            loc = self._get_string_slice(key)
            return loc
        except (TypeError, ValueError):
            pass

        try:
            asdt, reso_str = parse_time_string(key, self.freq)
        except (ValueError, DateParseError) as err:
            # A string with invalid format
            raise KeyError(f"Cannot interpret '{key}' as period") from err

        reso = Resolution.from_attrname(reso_str)
        grp = reso.freq_group.value
        freqn = self.dtype.freq_group_code

        # _get_string_slice will handle cases where grp < freqn
        assert grp >= freqn

        # BusinessDay is a bit strange. It has a *lower* code, but we never parse
        # a string as "BusinessDay" resolution, just Day.
        if grp == freqn or (
            reso == Resolution.RESO_DAY and self.dtype.freq.name == "B"
        ):
            # Same resolution as the index: retry with a proper Period.
            key = Period(asdt, freq=self.freq)
            loc = self.get_loc(key, method=method, tolerance=tolerance)
            return loc
        elif method is None:
            raise KeyError(key)
        else:
            # A fill method was requested; continue with the parsed datetime.
            key = asdt

    elif is_integer(key):
        # Period constructor will cast to string, which we dont want
        raise KeyError(key)
    elif isinstance(key, Period) and key.freq != self.freq:
        raise KeyError(key)

    try:
        key = Period(key, freq=self.freq)
    except ValueError as err:
        # we cannot construct the Period
        raise KeyError(orig_key) from err

    try:
        return Index.get_loc(self, key, method, tolerance)
    except KeyError as err:
        raise KeyError(orig_key) from err
def __getitem__(self, item):
    """
    Return one element or a new masked container.

    An integer ``item`` yields ``self.dtype.na_value`` when the mask is set
    at that position and the stored value otherwise.  Any other ``item`` is
    applied to both the data and the mask, and the two results are wrapped
    in a new instance of ``type(self)``.
    """
    if not is_integer(item):
        return type(self)(self._data[item], self._mask[item])

    return self.dtype.na_value if self._mask[item] else self._data[item]
def _get_empty_meta(self, columns, index_col, index_names, dtype: Optional[DtypeArg] = None):
    """
    Build the empty index and empty per-column Series for a parse result
    that contains no rows.

    Parameters
    ----------
    columns : iterable
        Column labels; copied into a new list.
    index_col : list of int, None or False
        Positions of the index columns.  When an index is built, this list
        is sorted in place.
    index_names : list or None
        Names of the index levels.
    dtype : dict-like or single dtype, optional
        Either one dtype for every column, or a mapping keyed by column
        name or by integer position.

    Returns
    -------
    tuple
        ``(index, columns, col_dict)`` where ``columns`` no longer contains
        the index columns and ``col_dict`` maps each remaining column to an
        empty Series of the resolved dtype.
    """
    columns = list(columns)

    # Turn ``dtype`` into a defaultdict so that ``dtype[col_name]`` can be
    # used below without worrying about KeyError.
    if is_dict_like(dtype):
        dtype = cast(dict, dtype)
        # Integer keys are positions and are translated to column labels.
        by_label = {}
        for key, value in dtype.items():
            by_label[columns[key] if is_integer(key) else key] = value
        dtype = defaultdict(lambda: object, by_label)
    else:
        # if dtype == None, default will be object.
        default_dtype = dtype or object
        # mypy cannot match the Optional[DtypeArg] union against the
        # defaultdict factory signature, hence the ignore.
        dtype = defaultdict(
            lambda: default_dtype  # type: ignore[arg-type, return-value]
        )

    # Even without data the index may still be e.g. an empty MultiIndex,
    # so both index_col (positions) and index_names (labels) must be
    # available; otherwise a generic empty Index is created.
    no_index = index_col is None or index_col is False or index_names is None
    if no_index:
        index = Index([])
    else:
        empty_levels = [Series([], dtype=dtype[name]) for name in index_names]
        index = ensure_index_from_sequences(empty_levels, names=index_names)
        index_col.sort()

        # Each removal shifts later positions left by one.
        for shift, position in enumerate(index_col):
            columns.pop(position - shift)

    col_dict = {}
    for col_name in columns:
        col_dict[col_name] = Series([], dtype=dtype[col_name])

    return index, columns, col_dict
def _adjust_to_origin(arg, origin, unit):
    """
    Helper function for to_datetime.
    Shift the input so that it is expressed relative to the given origin.

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be adjusted
    origin : 'julian' or Timestamp
        origin offset for the arg
    unit : string
        passed unit from to_datetime, must be 'D' when origin is 'julian'

    Returns
    -------
    ndarray or scalar of adjusted date(s)

    Raises
    ------
    ValueError
        For a non-'D' unit with a julian origin, a non-numeric ``arg``, an
        origin that cannot be converted, or a tz-aware origin.
    OutOfBoundsDatetime
        If the julian values or the origin fall outside the Timestamp range.
    """
    if origin == 'julian':
        raw_arg = arg
        epoch_julian = Timestamp(0).to_julian_date()
        if unit != 'D':
            raise ValueError("unit must be 'D' for origin='julian'")
        try:
            arg = arg - epoch_julian
        except TypeError:
            raise ValueError("incompatible 'arg' type for given "
                             "'origin'='julian'")

        # check the representable range up front for a clearer error
        upper = Timestamp.max.to_julian_date() - epoch_julian
        lower = Timestamp.min.to_julian_date() - epoch_julian
        if np.any(arg > upper) or np.any(arg < lower):
            raise tslibs.OutOfBoundsDatetime(
                "{original} is Out of Bounds for "
                "origin='julian'".format(original=raw_arg))
        return arg

    # arg must be numeric
    numeric_scalar = is_scalar(arg) and (is_integer(arg) or is_float(arg))
    if not (numeric_scalar or is_numeric_dtype(np.asarray(arg))):
        raise ValueError(
            "'{arg}' is not compatible with origin='{origin}'; "
            "it must be numeric with a unit specified ".format(
                arg=arg,
                origin=origin))

    # the result is offset back to unix / epoch time
    try:
        offset = Timestamp(origin)
    except tslibs.OutOfBoundsDatetime:
        raise tslibs.OutOfBoundsDatetime(
            "origin {origin} is Out of Bounds".format(origin=origin))
    except ValueError:
        raise ValueError("origin {origin} cannot be converted "
                         "to a Timestamp".format(origin=origin))

    if offset.tz is not None:
        raise ValueError(
            "origin offset {} must be tz-naive".format(offset))
    offset -= Timestamp(0)

    # express the offset as a whole number of ``unit`` steps
    offset = offset // tslibs.Timedelta(1, unit=unit)

    # scalars & ndarray-like can handle the addition directly; other
    # list-likes are converted to an ndarray first
    array_like = (ABCSeries, ABCIndexClass, np.ndarray)
    if is_list_like(arg) and not isinstance(arg, array_like):
        arg = np.asarray(arg)
    return arg + offset
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', limit_area=None, fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Parameters
    ----------
    xvalues, yvalues : array-like
        Positions and values; missing entries of ``yvalues`` are filled.
    method : str, default 'linear'
        'linear', 'time', 'index' and 'values' are handled with
        ``np.interp``; the names listed in ``sp_methods`` below are
        delegated to ``_interpolate_scipy_wrapper``.
    limit : int, optional
        Must be an integer >= 1 when given; passed to ``_interp_limit``.
    limit_direction : {'forward', 'backward', 'both'}
        Compared case-insensitively.
    limit_area : {'inside', 'outside'}, optional
        Compared case-insensitively.
    fill_value, bounds_error, order, **kwargs
        Forwarded to ``_interpolate_scipy_wrapper`` only.

    Returns
    -------
    An all-NaN float64 array when no value is valid, ``yvalues`` itself
    when nothing is missing, otherwise a filled copy of the values.  If
    ``method`` matches neither group of methods the function falls off the
    end and returns None.

    Raises
    ------
    ValueError
        For method 'time' when ``xvalues.is_all_dates`` is missing or falsy,
        for an invalid ``limit_direction`` or ``limit_area``, or for a
        ``limit`` that is not an integer >= 1.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        # from here on 'time' is handled exactly like 'values'
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        msg = ('Invalid limit_direction: expecting one of {valid!r}, '
               'got {invalid!r}.')
        raise ValueError(msg.format(valid=valid_limit_directions,
                                    invalid=limit_direction))

    if limit_area is not None:
        valid_limit_areas = ['inside', 'outside']
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError('Invalid limit_area: expecting one of {}, got '
                             '{}.'.format(valid_limit_areas, limit_area))

    # default limit is unlimited GH #16282
    if limit is None:
        # limit = len(xvalues)
        pass
    elif not is_integer(limit):
        raise ValueError('Limit must be an integer')
    elif limit < 1:
        raise ValueError('Limit must be greater than 0')

    # imported here rather than at module level
    from pandas import Series
    ys = Series(yvalues)

    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == 'forward':
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == 'backward':
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == 'inside':
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == 'outside':
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    # unwrap pandas containers to their underlying values where available
    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        # re-blank every position that the limit settings say must stay NaN
        result[preserve_nans] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(
            inds[valid], yvalues[valid], inds[invalid], method=method,
            fill_value=fill_value, bounds_error=bounds_error, order=order,
            **kwargs)
        # re-blank every position that the limit settings say must stay NaN
        result[preserve_nans] = np.nan
        return result