def compare(s, name): a = getattr(s.dt, prop) b = get_expected(s, prop) if not (is_list_like(a) and is_list_like(b)): self.assertEqual(a, b) else: tm.assert_series_equal(a, b)
def compare(s, name): a = getattr(s.dt, prop) b = get_expected(s, prop) if not (is_list_like(a) and is_list_like(b)): self.assertEqual(a, b) else: tm.assert_series_equal(a, b)
def cartesian_product(X): """ Numpy version of itertools.product or pandas.compat.product. Sometimes faster (for large inputs)... Parameters ---------- X : list-like of list-likes Returns ------- product : list of ndarrays Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), array([1, 2, 1, 2, 1, 2])] See also -------- itertools.product : Cartesian product of input iterables. Equivalent to nested for-loops. pandas.compat.product : An alias for itertools.product. """ msg = "Input must be a list-like of list-likes" if not is_list_like(X): raise TypeError(msg) for x in X: if not is_list_like(x): raise TypeError(msg) if len(X) == 0: return [] lenX = np.fromiter((len(x) for x in X), dtype=int) cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) a[0] = 1 if cumprodX[-1] != 0: b = cumprodX[-1] / cumprodX else: # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) return [ np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]), np.product(a[i])) for i, x in enumerate(X) ]
def cartesian_product(X): """ Numpy version of itertools.product or pandas.compat.product. Sometimes faster (for large inputs)... Parameters ---------- X : list-like of list-likes Returns ------- product : list of ndarrays Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), array([1, 2, 1, 2, 1, 2])] See also -------- itertools.product : Cartesian product of input iterables. Equivalent to nested for-loops. pandas.compat.product : An alias for itertools.product. """ msg = "Input must be a list-like of list-likes" if not is_list_like(X): raise TypeError(msg) for x in X: if not is_list_like(x): raise TypeError(msg) if len(X) == 0: return [] lenX = np.fromiter((len(x) for x in X), dtype=int) cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) a[0] = 1 if cumprodX[-1] != 0: b = cumprodX[-1] / cumprodX else: # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]), np.product(a[i])) for i, x in enumerate(X)]
def isin(comps, values): """ Compute the isin boolean array Parameters ---------- comps: array-like values: array-like Returns ------- boolean array same length as comps """ if not is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__)) comps = np.asarray(comps) if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__)) if not isinstance(values, np.ndarray): values = list(values) # GH11232 # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: f = lambda x, y: np.in1d(x, np.asarray(list(y))) else: f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing if is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view('i8') comps = comps.view('i8') elif is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view('i8') comps = comps.view('i8') elif is_int64_dtype(comps): pass else: f = lambda x, y: lib.ismember(x, set(values)) return f(comps, values)
def isin(comps, values): """ Compute the isin boolean array Parameters ---------- comps: array-like values: array-like Returns ------- boolean array same length as comps """ if not is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__)) comps = np.asarray(comps) if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__)) if not isinstance(values, np.ndarray): values = list(values) # GH11232 # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: f = lambda x, y: np.in1d(x, np.asarray(list(y))) else: f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing if is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view('i8') comps = comps.view('i8') elif is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view('i8') comps = comps.view('i8') elif is_int64_dtype(comps): pass else: f = lambda x, y: lib.ismember(x, set(values)) return f(comps, values)
def _delegate_property_get(self, name): from pandas import Series result = getattr(self.values, name) # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result result = np.asarray(result) # blow up if we operate on categories if self.orig is not None: result = take_1d(result, self.orig.cat.codes) # return the result as a Series, which is by definition a copy result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a property of a datetimelike " "object are not supported and are discarded. " "Change values on the original.") return result
def _is_dtype_compat(self, other): """ *this is an internal non-public method* provide a comparison between the dtype of self and other (coercing if needed) Raises ------ TypeError if the dtypes are not compatible """ if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): raise TypeError("categories must match existing categories " "when appending") else: values = other if not is_list_like(values): values = [values] other = CategoricalIndex(self._create_categorical( self, other, categories=self.categories, ordered=self.ordered)) if not other.isin(values).all(): raise TypeError("cannot append a non-category item to a " "CategoricalIndex") return other
def _is_offset(self, arr_or_obj): """ check if obj or all elements of list-like is DateOffset """ if isinstance(arr_or_obj, pd.DateOffset): return True elif is_list_like(arr_or_obj) and len(arr_or_obj): return all(isinstance(x, pd.DateOffset) for x in arr_or_obj) return False
def _evaluate_compare(self, other, op): """ We have been called because a comparison between 8 aware arrays. numpy >= 1.11 will now warn about NaT comparisons """ # coerce to a similar object if not isinstance(other, type(self)): if not is_list_like(other): # scalar other = [other] elif is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] other = type(self)(other) # compare result = op(self.asi8, other.asi8) # technically we could support bool dtyped Index # for now just return the indexing array directly mask = (self._isnan) | (other._isnan) if is_bool_dtype(result): result[mask] = False return result try: result[mask] = tslib.iNaT return Index(result) except TypeError: return result
def _evaluate_compare(self, other, op): """ We have been called because a comparison between 8 aware arrays. numpy >= 1.11 will now warn about NaT comparisons """ # coerce to a similar object if not isinstance(other, type(self)): if not is_list_like(other): # scalar other = [other] elif is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] other = type(self)(other) # compare result = op(self.asi8, other.asi8) # technically we could support bool dtyped Index # for now just return the indexing array directly mask = (self._isnan) | (other._isnan) if is_bool_dtype(result): result[mask] = False return result try: result[mask] = tslib.iNaT return Index(result) except TypeError: return result
def wrapper(self, other): msg = "cannot compare a TimedeltaIndex with type {0}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is tslib.NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(type(other))) result = func(other) if isnull(other): result.fill(nat_result) else: if not is_list_like(other): raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values result = func(other) result = _values_from_object(result) if isinstance(other, Index): o_mask = other.values.view('i8') == tslib.iNaT else: o_mask = other.view('i8') == tslib.iNaT if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result # support of bool dtype indexers if is_bool_dtype(result): return result return Index(result)
def _is_dtype_compat(self, other): """ *this is an internal non-public method* provide a comparison between the dtype of self and other (coercing if needed) Raises ------ TypeError if the dtypes are not compatible """ if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): raise TypeError("categories must match existing categories " "when appending") else: values = other if not is_list_like(values): values = [values] other = CategoricalIndex( self._create_categorical(self, other, categories=self.categories, ordered=self.ordered)) if not other.isin(values).all(): raise TypeError("cannot append a non-category item to a " "CategoricalIndex") return other
def conform(self, rhs): """ inplace conform rhs """ if not is_list_like(rhs): rhs = [rhs] if isinstance(rhs, np.ndarray): rhs = rhs.ravel() return rhs
def delete(self, loc): """ Make a new DatetimeIndex with passed location(s) deleted. Parameters ---------- loc: int, slice or array of ints Indicate which sub-arrays to remove. Returns ------- new_index : TimedeltaIndex """ new_tds = np.delete(self.asi8, loc) freq = 'infer' if is_integer(loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: if is_list_like(loc): loc = lib.maybe_indices_to_slice( _ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq return TimedeltaIndex(new_tds, name=self.name, freq=freq)
def conform(self, rhs): """ inplace conform rhs """ if not is_list_like(rhs): rhs = [rhs] if isinstance(rhs, np.ndarray): rhs = rhs.ravel() return rhs
def _get_skiprows(skiprows): """Get an iterator given an integer, slice or container. Parameters ---------- skiprows : int, slice, container The iterator to use to skip rows; can also be a slice. Raises ------ TypeError * If `skiprows` is not a slice, integer, or Container Returns ------- it : iterable A proper iterator to use to skip rows of a DataFrame. """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): return skiprows elif skiprows is None: return 0 raise TypeError('%r is not a valid type for skipping rows' % type(skiprows).__name__)
def _gotitem(self, key, ndim, subset=None): """ sub-classes to define return a sliced object Parameters ---------- key : string / list of selections ndim : 1,2 requested ndim of result subset : object, default None subset to act on """ # create a new object to prevent aliasing if subset is None: subset = self.obj # we need to make a shallow copy of ourselves # with the same groupby kwargs = dict([(attr, getattr(self, attr)) for attr in self._attributes]) self = self.__class__(subset, groupby=self._groupby[key], parent=self, **kwargs) self._reset_cache() if subset.ndim == 2: if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self
def _sanitize_values(arr): """ return an ndarray for our input, in a platform independent manner """ if hasattr(arr, 'values'): arr = arr.values else: # scalar if is_scalar(arr): arr = [arr] # ndarray if isinstance(arr, np.ndarray): pass elif is_list_like(arr) and len(arr) > 0: arr = _possibly_convert_platform(arr) else: arr = np.asarray(arr) return arr
def _delegate_property_get(self, name): from pandas import Series result = getattr(self.values, name) # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result # blow up if we operate on categories if self.orig is not None: result = take_1d(result, self.orig.cat.codes) # return the result as a Series, which is by definition a copy result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a property of a datetimelike " "object are not supported and are discarded. " "Change values on the original.") return result
def _get_skiprows(skiprows): """Get an iterator given an integer, slice or container. Parameters ---------- skiprows : int, slice, container The iterator to use to skip rows; can also be a slice. Raises ------ TypeError * If `skiprows` is not a slice, integer, or Container Returns ------- it : iterable A proper iterator to use to skip rows of a DataFrame. """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): return skiprows elif skiprows is None: return 0 raise TypeError('%r is not a valid type for skipping rows' % type(skiprows).__name__)
def _gotitem(self, key, ndim, subset=None): """ sub-classes to define return a sliced object Parameters ---------- key : string / list of selections ndim : 1,2 requested ndim of result subset : object, default None subset to act on """ # create a new object to prevent aliasing if subset is None: subset = self.obj # we need to make a shallow copy of ourselves # with the same groupby kwargs = dict([(attr, getattr(self, attr)) for attr in self._attributes]) self = self.__class__(subset, groupby=self._groupby[key], parent=self, **kwargs) self._reset_cache() if subset.ndim == 2: if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self
def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name)
def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name)
def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError(length_msg.format(name, len(item), len(columns_to_encode)))
def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError(length_msg.format(name, len(item), len(columns_to_encode)))
def _not_in(x, y): """Compute the vectorized membership of ``x not in y`` if possible, otherwise use Python. """ try: return ~x.isin(y) except AttributeError: if is_list_like(x): try: return ~y.isin(x) except AttributeError: pass return x not in y
def _not_in(x, y): """Compute the vectorized membership of ``x not in y`` if possible, otherwise use Python. """ try: return ~x.isin(y) except AttributeError: if is_list_like(x): try: return ~y.isin(x) except AttributeError: pass return x not in y
def _ixs(self, i, axis=0): """ for compat as we don't support Block Manager here i : int, slice, or sequence of integers axis : int """ key = self._get_axis(axis)[i] # xs cannot handle a non-scalar key, so just reindex here if is_list_like(key): return self.reindex(**{self._get_axis_name(axis): key}) return self.xs(key, axis=axis)
def hash_tuples(vals, encoding='utf8', hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently .. versionadded:: 0.20.0 Parameters ---------- vals : MultiIndex, list-of-tuples, or single tuple encoding : string, default 'utf8' hash_key : string key to encode, default to _default_hash_key Returns ------- ndarray of hashed values array """ is_tuple = False if isinstance(vals, tuple): vals = [vals] is_tuple = True elif not is_list_like(vals): raise TypeError("must be convertible to a list-of-tuples") if not isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) # create a list-of-ndarrays def get_level_values(num): unique = vals.levels[num] # .values labels = vals.labels[num] filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) return filled vals = [get_level_values(level) for level in range(vals.nlevels)] # hash the list-of-ndarrays hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) for l in vals) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] return h
def _delegate_method(self, name, *args, **kwargs): from pandas import Series method = getattr(self.values, name) result = method(*args, **kwargs) if not is_list_like(result): return result result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a method of a datetimelike object " "are not supported and are discarded. Change " "values on the original.") return result
def _delegate_method(self, name, *args, **kwargs): from pandas import Series method = getattr(self.values, name) result = method(*args, **kwargs) if not is_list_like(result): return result result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a method of a datetimelike object " "are not supported and are discarded. Change " "values on the original.") return result
def hash_tuples(vals, encoding='utf8', hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently .. versionadded:: 0.20.0 Parameters ---------- vals : MultiIndex, list-of-tuples, or single tuple encoding : string, default 'utf8' hash_key : string key to encode, default to _default_hash_key Returns ------- ndarray of hashed values array """ is_tuple = False if isinstance(vals, tuple): vals = [vals] is_tuple = True elif not is_list_like(vals): raise TypeError("must be convertible to a list-of-tuples") if not isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) # create a list-of-Categoricals vals = [Categorical(vals.labels[level], vals.levels[level], ordered=False, fastpath=True) for level in range(vals.nlevels)] # hash the list-of-ndarrays hashes = (_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] return h
def hash_tuples(vals, encoding='utf8', hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently .. versionadded:: 0.20.0 Parameters ---------- vals : MultiIndex, list-of-tuples, or single tuple encoding : string, default 'utf8' hash_key : string key to encode, default to _default_hash_key Returns ------- ndarray of hashed values array """ is_tuple = False if isinstance(vals, tuple): vals = [vals] is_tuple = True elif not is_list_like(vals): raise TypeError("must be convertible to a list-of-tuples") if not isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) # create a list-of-Categoricals vals = [ Categorical(vals.labels[level], vals.levels[level], ordered=False, fastpath=True) for level in range(vals.nlevels) ] # hash the list-of-ndarrays hashes = (_hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] return h
def select_n_frame(frame, columns, n, method, keep): """Implement n largest/smallest for pandas DataFrame Parameters ---------- frame : pandas.DataFrame object columns : list or str n : int keep : {'first', 'last'}, default 'first' method : str, {'nlargest', 'nsmallest'} Returns ------- nordered : DataFrame """ from pandas.core.series import Series if not is_list_like(columns): columns = [columns] columns = list(columns) ser = getattr(frame[columns[0]], method)(n, keep=keep) if isinstance(ser, Series): ser = ser.to_frame() return ser.merge(frame, on=columns[0], left_index=True)[frame.columns]
def select_n_frame(frame, columns, n, method, keep): """Implement n largest/smallest for pandas DataFrame Parameters ---------- frame : pandas.DataFrame object columns : list or str n : int keep : {'first', 'last'}, default 'first' method : str, {'nlargest', 'nsmallest'} Returns ------- nordered : DataFrame """ from pandas.core.series import Series if not is_list_like(columns): columns = [columns] columns = list(columns) ser = getattr(frame[columns[0]], method)(n, keep=keep) if isinstance(ser, Series): ser = ser.to_frame() return ser.merge(frame, on=columns[0], left_index=True)[frame.columns]
def get_value(self, series, key): if com.is_bool_indexer(key): loc = key elif is_list_like(key): loc = self.get_indexer(key) elif isinstance(key, slice): if not (key.step is None or key.step == 1): raise ValueError("cannot support not-default " "step in a slice") try: loc = self.get_loc(key) except TypeError: # we didn't find exact intervals # or are non-unique raise ValueError("unable to slice with " "this key: {}".format(key)) else: loc = self.get_loc(key) return series.iloc[loc]
def _validate_where(w): """ Validate that the where statement is of the right type. The type may either be String, Expr, or list-like of Exprs. Parameters ---------- w : String term expression, Expr, or list-like of Exprs. Returns ------- where : The original where clause if the check was successful. Raises ------ TypeError : An invalid data type was passed in for w (e.g. dict). """ if not (isinstance(w, (Expr, string_types)) or is_list_like(w)): raise TypeError("where must be passed as a string, Expr, " "or list-like of Exprs") return w
def get_data_fred(name, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): """ Get data for the given name from the St. Louis FED (FRED). Date format is datetime Returns a DataFrame. If multiple names are passed for "series" then the index of the DataFrame is the outer join of the indicies of each series. """ start, end = _sanitize_dates(start, end) if not is_list_like(name): names = [name] else: names = name urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for n in names] def fetch_data(url, name): with urlopen(url) as resp: data = read_csv(resp, index_col=0, parse_dates=True, header=None, skiprows=1, names=["DATE", name], na_values='.') try: return data.truncate(start, end) except KeyError: if data.ix[3].name[7:12] == 'Error': raise IOError("Failed to get the data. Check that {0!r} is " "a valid FRED series.".format(name)) raise df = concat([fetch_data(url, n) for url, n in zip(urls, names)], axis=1, join='outer') return df
def _validate_where(w): """ Validate that the where statement is of the right type. The type may either be String, Expr, or list-like of Exprs. Parameters ---------- w : String term expression, Expr, or list-like of Exprs. Returns ------- where : The original where clause if the check was successful. Raises ------ TypeError : An invalid data type was passed in for w (e.g. dict). """ if not (isinstance(w, (Expr, string_types)) or is_list_like(w)): raise TypeError("where must be passed as a string, Expr, " "or list-like of Exprs") return w
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function or list of functions, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame See also -------- DataFrame.pivot : pivot without aggregation that can handle non-numeric data """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError): pass values = list(values) grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [ agged.index.names[i] or i for i in range(len(index), len(keys)) ] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notnull().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name) # discard the top level if values_passed and not values_multi and not table.empty: table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T return table
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop('_axis', None) if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 warnings.warn(("using a dict with renaming " "is deprecated and will be removed in a future " "version"), FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: raise SpecificationError('cannot perform renaming ' 'for {0} with a nested ' 'dictionary'.format(k)) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(compat.iterkeys(arg)) if (isinstance(obj, ABCDataFrame) and len(obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() from pandas.tools.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg( arg, lambda fname, agg_how: _agg_1dim( self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series(): # return a boolean if we have *any* nested series return any([ isinstance(r, ABCSeries) for r in compat.itervalues(result) ]) def is_any_frame(): # return a boolean if we have *any* nested series return any([ isinstance(r, ABCDataFrame) for r in compat.itervalues(result) ]) if isinstance(result, list): return concat(result, keys=keys, axis=1), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, 'name', None)) return result, True elif is_list_like(arg) and arg not in compat.string_types: # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None f = self._is_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function or list of functions, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] else: values = list(data.columns.drop(keys)) if values_passed: to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [agged.index.names[i] or i for i in range(len(index), len(keys))] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notnull().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name) # discard the top level if values_passed and not values_multi and not table.empty: table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T return table
def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, freq=None, infer_datetime_format=False): """ Same as to_datetime, but accept freq for DatetimeIndex internal construction """ from pandas.tseries.index import DatetimeIndex def _convert_listlike(arg, box, format, name=None): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, name=name) except ValueError: pass return arg elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz='utc' if utc else None) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz='utc' if utc else None, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime( arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, require_iso8601=require_iso8601 ) if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, tslib.Timestamp): return arg elif isinstance(arg, ABCSeries): from pandas import Series values = _convert_listlike(arg._values, False, format) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): return _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): return _convert_listlike(arg, box, format) return _convert_listlike(np.array([arg]), box, format)[0]
def to_timedelta(arg, unit='ns', box=True, errors='raise', coerce=None): """ Convert argument to timedelta Parameters ---------- arg : string, timedelta, list, tuple, 1-d array, or Series unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an integer/float number box : boolean, default True - If True returns a Timedelta/TimedeltaIndex of the results - if False returns a np.timedelta64 or ndarray of values of dtype timedelta64[ns] errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input Returns ------- ret : timedelta64/arrays of timedelta64 if parsing succeeded Examples -------- Parsing a single string to a Timedelta: >>> pd.to_timedelta('1 days 06:05:01.00003') Timedelta('1 days 06:05:01.000030') >>> pd.to_timedelta('15.5us') Timedelta('0 days 00:00:00.000015') Parsing a list or array of strings: >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015', NaT], dtype='timedelta64[ns]', freq=None) Converting numbers by specifying the `unit` keyword argument: >>> pd.to_timedelta(np.arange(5), unit='s') TimedeltaIndex(['00:00:00', '00:00:01', '00:00:02', '00:00:03', '00:00:04'], dtype='timedelta64[ns]', freq=None) >>> pd.to_timedelta(np.arange(5), unit='d') TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ unit = _validate_timedelta_unit(unit) if errors not in ('ignore', 'raise', 'coerce'): raise ValueError("errors must be one of 'ignore', " "'raise', or 'coerce'}") if arg is None: return arg elif isinstance(arg, ABCSeries): from pandas import Series values = _convert_listlike(arg._values, unit=unit, box=False, errors=errors) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, unit=unit, box=box, errors=errors, name=arg.name) elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1: return _convert_listlike(arg, unit=unit, box=box, errors=errors) elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, timedelta, list, tuple, ' '1-d array, or Series') # ...so it must be a scalar value. Return scalar. return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors)
def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): r""" Wide panel to long format. Less flexible but more user-friendly than melt. With stubnames ['A', 'B'], this function expects to find one or more group of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... You specify what you want to call this suffix in the resulting long format with `j` (for example `j='year'`) Each row of these wide variables are assumed to be uniquely identified by `i` (can be a single column name or a list of column names) All remaining variables in the data frame are left intact. Parameters ---------- df : DataFrame The wide-format DataFrame stubnames : str or list-like The stub name(s). The wide format variables are assumed to start with the stub names. i : str or list-like Column(s) to use as id variable(s) j : str The name of the subobservation variable. What you wish to name your suffix in the long format. sep : str, default "" A character indicating the separation of the variable names in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you can strip the hypen by specifying `sep='-'` .. versionadded:: 0.20.0 suffix : str, default '\\d+' A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. You can also further disambiguate suffixes, for example, if your wide variables are of the form Aone, Btwo,.., and you have an unrelated column Arating, you can ignore the last one by specifying `suffix='(!?one|two)'` .. versionadded:: 0.20.0 Returns ------- DataFrame A DataFrame that contains each stub name as a variable, with new index (i, j) Examples -------- >>> import pandas as pd >>> import numpy as np >>> np.random.seed(123) >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, ... "X" : dict(zip(range(3), np.random.randn(3))) ... }) >>> df["id"] = df.index >>> df A1970 A1980 B1970 B1980 X id 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") X A B id year 0 1970 -1.085631 a 2.5 1 1970 0.997345 b 1.2 2 1970 0.282978 c 0.7 0 1980 -1.085631 d 3.2 1 1980 0.997345 e 1.3 2 1980 0.282978 f 0.1 With multuple id columns >>> df = pd.DataFrame({ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] ... }) >>> df birth famid ht1 ht2 0 1 1 2.8 3.4 1 2 1 2.9 3.8 2 3 1 2.2 2.9 3 1 2 2.0 3.2 4 2 2 1.8 2.8 5 3 2 1.9 2.4 6 1 3 2.2 3.3 7 2 3 2.3 3.4 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') >>> l ht famid birth age 1 1 1 2.8 2 3.4 2 1 2.9 2 3.8 3 1 2.2 2 2.9 2 1 1 2.0 2 3.2 2 1 1.8 2 2.8 3 1 1.9 2 2.4 3 1 1 2.2 2 3.3 2 1 2.3 2 3.4 3 1 2.1 2 2.9 Going from long back to wide just takes some creative use of `unstack` >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() >>> w.columns = pd.Index(w.columns).str.join('') >>> w.reset_index() famid birth ht1 ht2 0 1 1 2.8 3.4 1 1 2 2.9 3.8 2 1 3 2.2 2.9 3 2 1 2.0 3.2 4 2 2 1.8 2.8 5 2 3 1.9 2.4 6 3 1 2.2 3.3 7 3 2 2.3 3.4 8 3 3 2.1 2.9 Less wieldy column names are also handled >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), ... 'A(quarterly)-2011': np.random.rand(3), ... 'B(quarterly)-2010': np.random.rand(3), ... 'B(quarterly)-2011': np.random.rand(3), ... 'X' : np.random.randint(3, size=3)}) >>> df['id'] = df.index >>> df A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011 0 0.531828 0.724455 0.322959 0.293714 1 0.634401 0.611024 0.361789 0.630976 2 0.849432 0.722443 0.228263 0.092105 \ X id 0 0 0 1 1 1 2 2 2 >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', j='year', sep='-') X A(quarterly) B(quarterly) id year 0 2010 0 0.531828 0.322959 1 2010 2 0.634401 0.361789 2 2010 2 0.849432 0.228263 0 2011 0 0.724455 0.293714 1 2011 2 0.611024 0.630976 2 2011 2 0.722443 0.092105 If we have many columns, we could also use a regex to find our stubnames and pass that list on to wide_to_long >>> stubnames = set([match[0] for match in df.columns.str.findall('[A-B]\(.*\)').values if match != [] ]) >>> list(stubnames) ['B(quarterly)', 'A(quarterly)'] Notes ----- All extra variables are left untouched. This simply uses `pandas.melt` under the hood, but is hard-coded to "do the right thing" in a typicaly case. """ def get_var_names(df, stub, sep, suffix): regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix) return df.filter(regex=regex).columns.tolist() def melt_stub(df, stub, i, j, value_vars, sep): newdf = melt(df, id_vars=i, value_vars=value_vars, value_name=stub.rstrip(sep), var_name=j) newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") return newdf.set_index(i + [j]) if any(map(lambda s: s in df.columns.tolist(), stubnames)): raise ValueError("stubname can't be identical to a column name") if not is_list_like(stubnames): stubnames = [stubnames] else: stubnames = list(stubnames) if not is_list_like(i): i = [i] else: i = list(i) value_vars = list( map(lambda stub: get_var_names(df, stub, sep, suffix), stubnames)) value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) melted = [] for s, v in zip(stubnames, value_vars): melted.append(melt_stub(df, s, i, j, v, sep)) melted = melted[0].join(melted[1:], how='outer') if len(i) == 1: new = df[id_vars].set_index(i).join(melted) return new new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) return new
def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): r""" Wide panel to long format. Less flexible but more user-friendly than melt. With stubnames ['A', 'B'], this function expects to find one or more group of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... You specify what you want to call this suffix in the resulting long format with `j` (for example `j='year'`) Each row of these wide variables are assumed to be uniquely identified by `i` (can be a single column name or a list of column names) All remaining variables in the data frame are left intact. Parameters ---------- df : DataFrame The wide-format DataFrame stubnames : str or list-like The stub name(s). The wide format variables are assumed to start with the stub names. i : str or list-like Column(s) to use as id variable(s) j : str The name of the subobservation variable. What you wish to name your suffix in the long format. sep : str, default "" A character indicating the separation of the variable names in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you can strip the hypen by specifying `sep='-'` .. versionadded:: 0.20.0 suffix : str, default '\\d+' A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. You can also further disambiguate suffixes, for example, if your wide variables are of the form Aone, Btwo,.., and you have an unrelated column Arating, you can ignore the last one by specifying `suffix='(!?one|two)'` .. versionadded:: 0.20.0 Returns ------- DataFrame A DataFrame that contains each stub name as a variable, with new index (i, j) Examples -------- >>> import pandas as pd >>> import numpy as np >>> np.random.seed(123) >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, ... "X" : dict(zip(range(3), np.random.randn(3))) ... }) >>> df["id"] = df.index >>> df A1970 A1980 B1970 B1980 X id 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") X A B id year 0 1970 -1.085631 a 2.5 1 1970 0.997345 b 1.2 2 1970 0.282978 c 0.7 0 1980 -1.085631 d 3.2 1 1980 0.997345 e 1.3 2 1980 0.282978 f 0.1 With multuple id columns >>> df = pd.DataFrame({ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] ... }) >>> df birth famid ht1 ht2 0 1 1 2.8 3.4 1 2 1 2.9 3.8 2 3 1 2.2 2.9 3 1 2 2.0 3.2 4 2 2 1.8 2.8 5 3 2 1.9 2.4 6 1 3 2.2 3.3 7 2 3 2.3 3.4 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') >>> l ht famid birth age 1 1 1 2.8 2 3.4 2 1 2.9 2 3.8 3 1 2.2 2 2.9 2 1 1 2.0 2 3.2 2 1 1.8 2 2.8 3 1 1.9 2 2.4 3 1 1 2.2 2 3.3 2 1 2.3 2 3.4 3 1 2.1 2 2.9 Going from long back to wide just takes some creative use of `unstack` >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() >>> w.columns = pd.Index(w.columns).str.join('') >>> w.reset_index() famid birth ht1 ht2 0 1 1 2.8 3.4 1 1 2 2.9 3.8 2 1 3 2.2 2.9 3 2 1 2.0 3.2 4 2 2 1.8 2.8 5 2 3 1.9 2.4 6 3 1 2.2 3.3 7 3 2 2.3 3.4 8 3 3 2.1 2.9 Less wieldy column names are also handled >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), ... 'A(quarterly)-2011': np.random.rand(3), ... 'B(quarterly)-2010': np.random.rand(3), ... 'B(quarterly)-2011': np.random.rand(3), ... 'X' : np.random.randint(3, size=3)}) >>> df['id'] = df.index >>> df A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011 0 0.531828 0.724455 0.322959 0.293714 1 0.634401 0.611024 0.361789 0.630976 2 0.849432 0.722443 0.228263 0.092105 \ X id 0 0 0 1 1 1 2 2 2 >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', j='year', sep='-') X A(quarterly) B(quarterly) id year 0 2010 0 0.531828 0.322959 1 2010 2 0.634401 0.361789 2 2010 2 0.849432 0.228263 0 2011 0 0.724455 0.293714 1 2011 2 0.611024 0.630976 2 2011 2 0.722443 0.092105 If we have many columns, we could also use a regex to find our stubnames and pass that list on to wide_to_long >>> stubnames = set([match[0] for match in df.columns.str.findall('[A-B]\(.*\)').values if match != [] ]) >>> list(stubnames) ['B(quarterly)', 'A(quarterly)'] Notes ----- All extra variables are left untouched. This simply uses `pandas.melt` under the hood, but is hard-coded to "do the right thing" in a typicaly case. """ def get_var_names(df, stub, sep, suffix): regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix) return df.filter(regex=regex).columns.tolist() def melt_stub(df, stub, i, j, value_vars, sep): newdf = melt(df, id_vars=i, value_vars=value_vars, value_name=stub.rstrip(sep), var_name=j) newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") return newdf.set_index(i + [j]) if any(map(lambda s: s in df.columns.tolist(), stubnames)): raise ValueError("stubname can't be identical to a column name") if not is_list_like(stubnames): stubnames = [stubnames] else: stubnames = list(stubnames) if not is_list_like(i): i = [i] else: i = list(i) value_vars = list(map(lambda stub: get_var_names(df, stub, sep, suffix), stubnames)) value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) melted = [] for s, v in zip(stubnames, value_vars): melted.append(melt_stub(df, s, i, j, v, sep)) melted = melted[0].join(melted[1:], how='outer') if len(i) == 1: new = df[id_vars].set_index(i).join(melted) return new new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) return new
def to_timedelta(arg, unit='ns', box=True, errors='raise'): """ Convert argument to timedelta Parameters ---------- arg : string, timedelta, list, tuple, 1-d array, or Series unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an integer/float number box : boolean, default True - If True returns a Timedelta/TimedeltaIndex of the results - if False returns a np.timedelta64 or ndarray of values of dtype timedelta64[ns] errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input Returns ------- ret : timedelta64/arrays of timedelta64 if parsing succeeded Examples -------- Parsing a single string to a Timedelta: >>> pd.to_timedelta('1 days 06:05:01.00003') Timedelta('1 days 06:05:01.000030') >>> pd.to_timedelta('15.5us') Timedelta('0 days 00:00:00.000015') Parsing a list or array of strings: >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015', NaT], dtype='timedelta64[ns]', freq=None) Converting numbers by specifying the `unit` keyword argument: >>> pd.to_timedelta(np.arange(5), unit='s') TimedeltaIndex(['00:00:00', '00:00:01', '00:00:02', '00:00:03', '00:00:04'], dtype='timedelta64[ns]', freq=None) >>> pd.to_timedelta(np.arange(5), unit='d') TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ unit = _validate_timedelta_unit(unit) if errors not in ('ignore', 'raise', 'coerce'): raise ValueError("errors must be one of 'ignore', " "'raise', or 'coerce'}") if arg is None: return arg elif isinstance(arg, ABCSeries): from pandas import Series values = _convert_listlike(arg._values, unit=unit, box=False, errors=errors) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, unit=unit, box=box, errors=errors, name=arg.name) elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1: return _convert_listlike(arg, unit=unit, box=box, errors=errors) elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, timedelta, list, tuple, ' '1-d array, or Series') # ...so it must be a scalar value. Return scalar. return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors)
def to_time(arg, format=None, infer_time_format=False, errors='raise'): """ Parse time strings to time objects using fixed strptime formats ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p") Use infer_time_format if all the strings are in the same format to speed up conversion. Parameters ---------- arg : string in time format, datetime.time, list, tuple, 1-d array, Series format : str, default None Format used to convert arg into a time object. If None, fixed formats are used. infer_time_format: bool, default False Infer the time format based on the first non-NaN element. If all strings are in the same format, this will speed up conversion. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as None - If 'ignore', then invalid parsing will return the input Returns ------- datetime.time """ from pandas.core.series import Series def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) if infer_time_format and format is None: format = _guess_time_format_for_array(arg) times = [] if format is not None: for element in arg: try: times.append(datetime.strptime(element, format).time()) except (ValueError, TypeError): if errors == 'raise': raise ValueError("Cannot convert %s to a time with " "given format %s" % (element, format)) elif errors == 'ignore': return arg else: times.append(None) else: formats = _time_formats[:] format_found = False for element in arg: time_object = None for time_format in formats: try: time_object = datetime.strptime(element, time_format).time() if not format_found: # Put the found format in front fmt = formats.pop(formats.index(time_format)) formats.insert(0, fmt) format_found = True break except (ValueError, TypeError): continue if time_object is not None: times.append(time_object) elif errors == 'raise': raise ValueError("Cannot convert arg {arg} to " "a time".format(arg=arg)) elif errors == 'ignore': return arg else: times.append(None) return times if arg is None: return arg elif isinstance(arg, time): return arg elif isinstance(arg, Series): values = _convert_listlike(arg._values, format) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) elif is_list_like(arg): return _convert_listlike(arg, format) return _convert_listlike(np.array([arg]), format)[0]
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False): """ Convert argument to datetime. Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series .. versionadded: 0.18.1 or DataFrame/dict-like errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input dayfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). yearfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. - If True parses dates with the year first, eg 10/11/12 is parsed as 2010-11-12. - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil beahavior). .. versionadded: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). box : boolean, default True - If True returns a DatetimeIndex - If False returns ndarray of values. format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. exact : boolean, True by default - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : string, default 'ns' unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. Returns ------- ret : datetime if parsing succeeded. Return type depends on input: - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or correspoding array/Series). Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 dtype: datetime64[ns] If a date does not meet the `timestamp limitations <http://pandas.pydata.org/pandas-docs/stable/timeseries.html #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. Passing errors='coerce' will force an out-of-bounds date to NaT, in addition to forcing non-dates (or non-parseable dates) to NaT. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT Passing infer_datetime_format=True can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000) >>> s.head() 0 3/11/2000 1 3/12/2000 2 3/13/2000 3 3/11/2000 4 3/12/2000 dtype: object >>> %timeit pd.to_datetime(s,infer_datetime_format=True) 100 loops, best of 3: 10.4 ms per loop >>> %timeit pd.to_datetime(s,infer_datetime_format=False) 1 loop, best of 3: 471 ms per loop """ from pandas.tseries.index import DatetimeIndex tz = 'utc' if utc else None def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601 ) if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, tslib.Timestamp): return arg elif isinstance(arg, ABCSeries): from pandas import Series values = _convert_listlike(arg._values, False, format) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): return _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): return _convert_listlike(arg, box, format) return _convert_listlike(np.array([arg]), box, format)[0]
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False): """ Convert argument to datetime. Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series .. versionadded: 0.18.1 or DataFrame/dict-like errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input dayfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). yearfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. - If True parses dates with the year first, eg 10/11/12 is parsed as 2010-11-12. - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil beahavior). .. versionadded: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). box : boolean, default True - If True returns a DatetimeIndex - If False returns ndarray of values. format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. exact : boolean, True by default - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : string, default 'ns' unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. Returns ------- ret : datetime if parsing succeeded. Return type depends on input: - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or correspoding array/Series). Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 dtype: datetime64[ns] If a date does not meet the `timestamp limitations <http://pandas.pydata.org/pandas-docs/stable/timeseries.html #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. Passing errors='coerce' will force an out-of-bounds date to NaT, in addition to forcing non-dates (or non-parseable dates) to NaT. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT Passing infer_datetime_format=True can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000) >>> s.head() 0 3/11/2000 1 3/12/2000 2 3/13/2000 3 3/11/2000 4 3/12/2000 dtype: object >>> %timeit pd.to_datetime(s,infer_datetime_format=True) 100 loops, best of 3: 10.4 ms per loop >>> %timeit pd.to_datetime(s,infer_datetime_format=False) 1 loop, best of 3: 471 ms per loop """ from pandas.tseries.index import DatetimeIndex tz = 'utc' if utc else None def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601) if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, tslib.Timestamp): return arg elif isinstance(arg, ABCSeries): from pandas import Series values = _convert_listlike(arg._values, False, format) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): return _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): return _convert_listlike(arg, box, format) return _convert_listlike(np.array([arg]), box, format)[0]
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") values = np.array(values, copy=False) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, string_types) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return _ensure_object(np.concatenate([nums, strs])) sorter = None if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = _ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = _get_data_algo(values, _hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = _ensure_platform_int(t.lookup(ordered)) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = (labels < -len(values)) | (labels >= len(values)) | \ (labels == na_sentinel) # (Out of bound indices will be masked with `na_sentinel` next, so we may # deal with them here without performance loss using `mode='wrap'`.) new_labels = reverse_indexer.take(labels, mode='wrap') np.putmask(new_labels, mask, na_sentinel) return ordered, _ensure_platform_int(new_labels)
def to_time(arg, format=None, infer_time_format=False, errors='raise'): """ Parse time strings to time objects using fixed strptime formats ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p") Use infer_time_format if all the strings are in the same format to speed up conversion. Parameters ---------- arg : string in time format, datetime.time, list, tuple, 1-d array, Series format : str, default None Format used to convert arg into a time object. If None, fixed formats are used. infer_time_format: bool, default False Infer the time format based on the first non-NaN element. If all strings are in the same format, this will speed up conversion. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as None - If 'ignore', then invalid parsing will return the input Returns ------- datetime.time """ from pandas.core.series import Series def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) if infer_time_format and format is None: format = _guess_time_format_for_array(arg) times = [] if format is not None: for element in arg: try: times.append(datetime.strptime(element, format).time()) except (ValueError, TypeError): if errors == 'raise': raise ValueError("Cannot convert %s to a time with " "given format %s" % (element, format)) elif errors == 'ignore': return arg else: times.append(None) else: formats = _time_formats[:] format_found = False for element in arg: time_object = None for time_format in formats: try: time_object = datetime.strptime(element, time_format).time() if not format_found: # Put the found format in front fmt = formats.pop(formats.index(time_format)) formats.insert(0, fmt) format_found = True break except (ValueError, TypeError): continue if time_object is not None: times.append(time_object) elif errors == 'raise': raise ValueError("Cannot convert arg {arg} to " "a time".format(arg=arg)) elif errors == 'ignore': return arg else: times.append(None) return times if arg is None: return arg elif isinstance(arg, time): return arg elif isinstance(arg, Series): values = _convert_listlike(arg._values, format) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) elif is_list_like(arg): return _convert_listlike(arg, format) return _convert_listlike(np.array([arg]), format)[0]
def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): """ "Unpivots" a DataFrame from wide format to long format, optionally leaving identifier variables set. This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other columns, considered measured variables (`value_vars`), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. Parameters ---------- frame : DataFrame id_vars : tuple, list, or ndarray, optional Column(s) to use as identifier variables. value_vars : tuple, list, or ndarray, optional Column(s) to unpivot. If not specified, uses all columns that are not set as `id_vars`. var_name : scalar Name to use for the 'variable' column. If None it uses ``frame.columns.name`` or 'variable'. value_name : scalar, default 'value' Name to use for the 'value' column. col_level : int or string, optional If columns are a MultiIndex then use this level to melt. See also -------- pivot_table DataFrame.pivot Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, ... 'B': {0: 1, 1: 3, 2: 5}, ... 'C': {0: 2, 1: 4, 2: 6}}) >>> df A B C 0 a 1 2 1 b 3 4 2 c 5 6 >>> pd.melt(df, id_vars=['A'], value_vars=['B']) A variable value 0 a B 1 1 b B 3 2 c B 5 >>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C']) A variable value 0 a B 1 1 b B 3 2 c B 5 3 a C 2 4 b C 4 5 c C 6 The names of 'variable' and 'value' columns can be customized: >>> pd.melt(df, id_vars=['A'], value_vars=['B'], ... var_name='myVarname', value_name='myValname') A myVarname myValname 0 a B 1 1 b B 3 2 c B 5 If you have multi-index columns: >>> df.columns = [list('ABC'), list('DEF')] >>> df A B C D E F 0 a 1 2 1 b 3 4 2 c 5 6 >>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B']) A variable value 0 a B 1 1 b B 3 2 c B 5 >>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')]) (A, D) variable_0 variable_1 value 0 a B E 1 1 b B E 3 2 c B E 5 """ # TODO: what about the existing index? if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif (isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list)): raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') else: id_vars = list(id_vars) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif (isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list)): raise ValueError('value_vars must be a list of tuples when' ' columns are a MultiIndex') else: value_vars = list(value_vars) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, MultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = [ 'variable_%s' % i for i in range(len(frame.columns.names)) ] else: var_name = [ frame.columns.name if frame.columns.name is not None else 'variable' ] if isinstance(var_name, compat.string_types): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: mdata[col] = np.tile(frame.pop(col).values, K) mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray( frame.columns._get_level_values(i)).repeat(N) return DataFrame(mdata, columns=mcolumns)
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") values = np.array(values, copy=False) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, string_types) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return _ensure_object(np.concatenate([nums, strs])) sorter = None if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = _ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = _get_data_algo(values, _hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = _ensure_platform_int(t.lookup(ordered)) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = (labels < -len(values)) | (labels >= len(values)) | \ (labels == na_sentinel) # (Out of bound indices will be masked with `na_sentinel` next, so we may # deal with them here without performance loss using `mode='wrap'`.) new_labels = reverse_indexer.take(labels, mode='wrap') np.putmask(new_labels, mask, na_sentinel) return ordered, _ensure_platform_int(new_labels)
def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None): import matplotlib.pyplot as plt if color is None and colormap is not None: if isinstance(colormap, compat.string_types): import matplotlib.cm as cm cmap = colormap colormap = cm.get_cmap(colormap) if colormap is None: raise ValueError("Colormap {0} is not recognized".format(cmap)) colors = lmap(colormap, np.linspace(0, 1, num=num_colors)) elif color is not None: if colormap is not None: warnings.warn("'color' and 'colormap' cannot be used " "simultaneously. Using 'color'") colors = list(color) if is_list_like(color) else color else: if color_type == 'default': # need to call list() on the result to copy so we don't # modify the global rcParams below try: colors = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ] except KeyError: colors = list( plt.rcParams.get('axes.color_cycle', list('bgrcmyk'))) if isinstance(colors, compat.string_types): colors = list(colors) elif color_type == 'random': import random def random_color(column): random.seed(column) return [random.random() for _ in range(3)] colors = lmap(random_color, lrange(num_colors)) else: raise ValueError("color_type must be either 'default' or 'random'") if isinstance(colors, compat.string_types): import matplotlib.colors conv = matplotlib.colors.ColorConverter() def _maybe_valid_colors(colors): try: [conv.to_rgba(c) for c in colors] return True except ValueError: return False # check whether the string can be convertable to single color maybe_single_color = _maybe_valid_colors([colors]) # check whether each character can be convertable to colors maybe_color_cycle = _maybe_valid_colors(list(colors)) if maybe_single_color and maybe_color_cycle and len(colors) > 1: # Special case for single str 'CN' match and convert to hex # for supporting matplotlib < 2.0.0 if re.match(r'\AC[0-9]\Z', colors) and _mpl_ge_2_0_0(): hex_color = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ] colors = [hex_color[int(colors[1])]] else: # this may no longer be required msg = ("'{0}' can be parsed as both single color and " "color cycle. Specify each color using a list " "like ['{0}'] or {1}") raise ValueError(msg.format(colors, list(colors))) elif maybe_single_color: colors = [colors] else: # ``colors`` is regarded as color cycle. # mpl will raise error any of them is invalid pass if len(colors) != num_colors: try: multiple = num_colors // len(colors) - 1 except ZeroDivisionError: raise ValueError("Invalid color argument: ''") mod = num_colors % len(colors) colors += multiple * colors colors += colors[:mod] return colors