def _get_object_index(self):
    boxed_values = _dt_box_array(self.asi8, self.offset, self.tz)
    return Index(boxed_values, dtype=object)
def asobject(self):
    from pandas.core.index import Index
    return Index(self._box_values(self.asi8), name=self.name, dtype=object)
def _get_fresh_axis(self):
    return Index(np.arange(len(self._get_concat_axis())))
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : deprecated
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    if order is not None:
        warn("order is deprecated. "
             "See https://github.com/pydata/pandas/issues/6926",
             FutureWarning)

    from pandas.core.index import Index
    from pandas.core.series import Series
    vals = np.asarray(values)

    is_datetime = com.is_datetime64_dtype(vals)
    is_timedelta = com.is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for i, e in enumerate(uniques) if f(e)],
                                 dtype=object))
                for f in [lambda x: not isinstance(x, string_types),
                          lambda x: isinstance(x, string_types)]
            ])
            sorter = com._ensure_platform_int(
                t.lookup(com._ensure_object(ordered)))

        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
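# Usage sketch (not part of the original source): how the public
# pd.factorize entry point documented above behaves; values are illustrative.
#
# import numpy as np
# import pandas as pd
#
# labels, uniques = pd.factorize(
#     np.array(['b', 'b', 'a', 'c', 'b'], dtype=object), sort=True)
# labels  -> array([1, 1, 0, 2, 1])  : positions into the sorted uniques
# uniques -> array(['a', 'b', 'c'], dtype=object)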
def get_grouper(
    obj: FrameOrSeries,
    key=None,
    axis: int = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    mutated: bool = False,
    validate: bool = True,
) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]":
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError(
                        "multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj.index.name != level:
                    raise ValueError(
                        "level name {level} is not the name of the "
                        "index".format(level=level))
            elif level > 0 or level < -1:
                raise ValueError(
                    "level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, [key.key], obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, [], obj

    # In the future, a tuple key will always mean an actual key,
    # not an iterable of keys. In the meantime, we attempt to provide
    # a warning. We can assume that the user wanted a list of keys when
    # the key is not in the index. We just have to be careful with
    # unhashable elements of `key`. Any unhashable elements implies that
    # they wanted a list of keys.
    # https://github.com/pandas-dev/pandas/issues/18314
    if isinstance(key, tuple):
        all_hashable = is_hashable(key)
        if (
            all_hashable and key not in obj and set(key).issubset(obj)
        ) or not all_hashable:
            # column names ('a', 'b') -> ['a', 'b']
            # arrays like (a, b) -> [a, b]
            msg = (
                "Interpreting tuple 'by' as a list of keys, rather than "
                "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
                "the future, a tuple will always mean a single key."
            )
            warnings.warn(msg, FutureWarning, stacklevel=5)
            key = list(key)

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []  # type: List[Grouping]
    exclusions = []  # type: List[Hashable]

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            items = obj._data.items
            try:
                items.get_loc(key)
            except (KeyError, TypeError):
                # TypeError shows up here if we pass e.g. Int64Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError):
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)

        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None

        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                ("Length of grouper ({len_gpr}) and axis ({len_axis})"
                 " must be same length".format(
                     len_gpr=len(gpr), len_axis=obj.shape[axis])))

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                name=name,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    elif len(groupings) == 0:
        groupings.append(
            Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort,
                              mutated=mutated)
    return grouper, exclusions, obj
def _setitem_with_indexer(self, indexer, value):
    self._has_valid_setitem_indexer(indexer)

    # also has the side effect of consolidating in-place
    from pandas import Panel, DataFrame, Series

    # maybe partial set
    take_split_path = self.obj._is_mixed_type

    if isinstance(indexer, tuple):
        nindexer = []
        for i, idx in enumerate(indexer):
            if isinstance(idx, dict):

                # reindex the axis to the new value
                # and set inplace
                key, _ = _convert_missing_indexer(idx)

                # if this is the items axes, then take the main missing
                # path first; this correctly sets the dtype and avoids
                # cache issues. essentially this separates out the block
                # that is needed to possibly be modified
                if self.ndim > 1 and i == self.obj._info_axis_number:

                    # add the new item, and set the value
                    new_indexer = _convert_from_missing_indexer_tuple(
                        indexer)
                    self.obj[key] = np.nan
                    self.obj.loc[new_indexer] = value
                    return self.obj

                # reindex the axis
                index = self.obj._get_axis(i)
                labels = _safe_append_to_index(index, key)
                self.obj._data = self.obj.reindex_axis(labels, i)._data
                nindexer.append(labels.get_loc(key))

            else:
                nindexer.append(idx)

        indexer = tuple(nindexer)
    else:

        indexer, missing = _convert_missing_indexer(indexer)

        if missing:

            # reindex the axis to the new value
            # and set inplace
            if self.ndim == 1:
                index = self.obj.index
                if len(index) == 0:
                    new_index = Index([indexer])
                else:
                    new_index = _safe_append_to_index(index, indexer)

                new_values = np.concatenate([self.obj.values, [value]])
                self.obj._data = self.obj._constructor(
                    new_values, index=new_index, name=self.obj.name)
                return self.obj

            elif self.ndim == 2:
                index = self.obj._get_axis(0)
                labels = _safe_append_to_index(index, indexer)
                self.obj._data = self.obj.reindex_axis(labels, 0)._data
                return getattr(self.obj, self.name).__setitem__(indexer,
                                                                value)

            # set using setitem (Panel and > dims)
            elif self.ndim >= 3:
                return self.obj.__setitem__(indexer, value)

    # align and set the values
    if take_split_path:

        if not isinstance(indexer, tuple):
            indexer = self._tuplify(indexer)

        if isinstance(value, ABCSeries):
            value = self._align_series(indexer, value)

        info_axis = self.obj._info_axis_number
        info_idx = indexer[info_axis]
        if com.is_integer(info_idx):
            info_idx = [info_idx]
        plane_indexer = indexer[:info_axis] + indexer[info_axis + 1:]
        item_labels = self.obj._get_axis(info_axis)

        def setter(item, v):
            s = self.obj[item]
            pi = plane_indexer[0] if len(
                plane_indexer) == 1 else plane_indexer

            # set the item, possibly having a dtype change
            s = s.copy()
            s._data = s._data.setitem(pi, v)
            self.obj[item] = s

        labels = item_labels[info_idx]

        if _is_list_like(value):

            # we have an equal len Frame
            if isinstance(value, ABCDataFrame) and value.ndim > 1:

                for item in labels:

                    # align to
                    if item in value:
                        v = value[item]
                        v = v.reindex(self.obj[item].index & v.index)
                        setter(item, v.values)
                    else:
                        setter(item, np.nan)

            # we have an equal len ndarray to our labels
            elif isinstance(value, np.ndarray) and value.ndim == 2:
                if len(labels) != value.shape[1]:
                    raise ValueError('Must have equal len keys and value '
                                     'when setting with an ndarray')

                for i, item in enumerate(labels):
                    setter(item, value[:, i])

            # we have an equal len list/ndarray
            elif len(labels) == 1 and (
                    len(self.obj[labels[0]]) == len(value) or
                    len(plane_indexer[0]) == len(value)):
                setter(labels[0], value)

            # per label values
            else:
                for item, v in zip(labels, value):
                    setter(item, v)

        else:

            # scalar
            for item in labels:
                setter(item, value)

    else:
        if isinstance(indexer, tuple):
            indexer = _maybe_convert_ix(*indexer)

        if isinstance(value, ABCSeries):
            value = self._align_series(indexer, value)
        elif isinstance(value, ABCDataFrame):
            value = self._align_frame(indexer, value)

        if isinstance(value, ABCPanel):
            value = self._align_panel(indexer, value)

        self.obj._data = self.obj._data.setitem(indexer, value)
def _getitem_iterable(self, key, axis=0):
    labels = self.obj._get_axis(axis)

    def _reindex(keys, level=None):
        try:
            return self.obj.reindex_axis(keys, axis=axis, level=level)
        except AttributeError:
            # Series
            if axis != 0:
                raise AssertionError('axis must be 0')
            return self.obj.reindex(keys, level=level)

    if com._is_bool_indexer(key):
        key = _check_bool_indexer(labels, key)
        inds, = key.nonzero()
        return self.obj.take(inds, axis=axis, convert=False)
    else:
        if isinstance(key, Index):
            # want Index objects to pass through untouched
            keyarr = key
        else:
            # asarray can be unsafe, NumPy strings are weird
            keyarr = _asarray_tuplesafe(key)

        if _is_integer_dtype(keyarr):
            if labels.inferred_type != 'integer':
                keyarr = np.where(keyarr < 0,
                                  len(labels) + keyarr, keyarr)

            if labels.inferred_type == 'mixed-integer':
                indexer = labels.get_indexer(keyarr)
                if (indexer >= 0).all():
                    # the flattened original dropped this return
                    return self.obj.take(indexer, axis=axis, convert=True)
                else:
                    return self.obj.take(keyarr, axis=axis)
            elif not labels.inferred_type == 'integer':
                return self.obj.take(keyarr, axis=axis)

        # this is not the most robust, but...
        if (isinstance(labels, MultiIndex) and
                not isinstance(keyarr[0], tuple)):
            level = 0
        else:
            level = None

        keyarr_is_unique = Index(keyarr).is_unique

        # existing labels are unique and indexer is unique
        if labels.is_unique and keyarr_is_unique:
            return _reindex(keyarr, level=level)

        else:
            indexer, missing = labels.get_indexer_non_unique(keyarr)
            check = indexer != -1
            result = self.obj.take(indexer[check], axis=axis,
                                   convert=False)

            # need to merge the result labels and the missing labels
            if len(missing):
                l = np.arange(len(indexer))

                missing = com._ensure_platform_int(missing)
                missing_labels = keyarr.take(missing)
                missing_indexer = com._ensure_int64(l[~check])
                cur_labels = result._get_axis(axis).values
                cur_indexer = com._ensure_int64(l[check])

                new_labels = np.empty(tuple([len(indexer)]), dtype=object)
                new_labels[cur_indexer] = cur_labels
                new_labels[missing_indexer] = missing_labels

                # a unique indexer
                if keyarr_is_unique:
                    new_indexer = (Index(cur_indexer) +
                                   Index(missing_indexer)).values
                    new_indexer[missing_indexer] = -1

                # we have a non_unique selector, need to use the original
                # indexer here
                else:
                    new_indexer = indexer

                # reindex with the specified axis
                ndim = self.obj.ndim
                if axis + 1 > ndim:
                    raise AssertionError(
                        "invalid indexing error with non-unique index")

                result = result._reindex_with_indexers(
                    {axis: [new_labels, new_indexer]}, copy=True)

            return result
def _default_index(n):
    from pandas.core.index import Index
    return Index(np.arange(n))
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
                is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
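# Usage sketch (not part of the original source): the public Series method
# backed by the helper above; values are illustrative.
#
# import pandas as pd
#
# s = pd.Series([2.0, 2.0, 1.0, None])
#
# s.value_counts()                 # NaN excluded by default (dropna=True)
# s.value_counts(dropna=False)     # NaN counted as its own key
# s.value_counts(normalize=True)   # relative frequencies instead of counts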
def empty(types, size, cats=None, cols=None, index_types=None,
          index_names=None, timezones=None):
    """
    Create empty DataFrame to assign into

    In the simplest case, will return a Pandas dataframe of the given size,
    with columns of the given names and types. The second return value
    `views` is a dictionary of numpy arrays into which you can assign values
    that show up in the dataframe.

    For categorical columns, you get two views to assign into: if the
    column name is "col", you get both "col" (the category codes) and
    "col-catdef" (the category labels).

    For a single categorical index, you should use the `.set_categories`
    method of the appropriate "-catdef" columns, passing an Index of values

    ``views['index-catdef'].set_categories(pd.Index(newvalues),
                                           fastpath=True)``

    Multi-indexes work a lot like categoricals, even if the types of each
    index are not themselves categories, and will also have "-catdef"
    entries in the views. However, these will be Dummy instances, providing
    only a ``.set_categories`` method, to be used as above.

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string or None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary',
        'mo']} will create column index 1 (inserted amongst the numerical
        columns) with two possible values. If labels is an integer,
        `{'col': 5}`, will generate temporary labels using range. If None,
        or column name is missing, will assume 16-bit integers (a
        reasonable default).
    cols: list of labels
        assigned column names, including categorical ones.
    index_types: list of str
        For one or more index columns, make them have this type. See
        general description, above, for caveats about multi-indexing. If
        None, the index will be the default RangeIndex.
    index_names: list of str
        Names of the index column(s), if using
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas
        series; the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - list of numpy views, in order, of the columns of the dataframe.
      Assign to this.
    """
    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[six.text_type(col)] = Categorical([], categories=cat(col),
                                                 fastpath=True)
        else:
            d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
            df[six.text_type(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            index._data._codes = vals
            views[col] = vals
            views[col + '-catdef'] = index._data
        else:
            d = np.empty(size, dtype=t)
            index = Index(d)
            views[col] = index.values
    else:
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        index._levels = list()
        index._labels = list()
        for i, col in enumerate(index_names):
            index._levels.append(Index([None]))

            def set_cats(values, i=i, col=col, **kwargs):
                values.name = col
                index._levels[i] = values

            x = Dummy()
            x._set_categories = set_cats

            d = np.zeros(size, dtype=int)
            index._labels.append(d)
            views[col] = d
            views[col + '-catdef'] = x

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            new_shape = (size, )
            values = np.empty(shape=new_shape,
                              dtype=block.values.values.dtype)
            new_block = block.make_block_same_class(
                values=values, dtype=block.values.dtype)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = block.values.values
            else:
                views[col] = block.values[i]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
def _default_index(n):
    from pandas.core.index import NULL_INDEX, Index
    if n == 0:
        return NULL_INDEX
    else:
        return Index(np.arange(n))
def _get_object_index(self):
    boxfunc = lambda x: Timestamp(x, offset=self.offset, tz=self.tz)
    boxed_values = lib.map_infer(self.asi8, boxfunc)
    return Index(boxed_values, dtype=object)
def _initDict(self, data, index, columns, objects, dtype):
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.

    Somehow this got outrageously complicated
    """
    # pre-filter out columns if we passed it
    if columns is not None:
        colset = set(columns)
        data = dict((k, v) for k, v in data.iteritems() if k in colset)

    index = _extract_index(data, index)

    objectDict = {}
    if objects is not None and isinstance(objects, dict):
        objectDict.update(objects)

    valueDict = {}
    for k, v in data.iteritems():
        if isinstance(v, Series):
            if v.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                v = v.reindex(index)
        else:
            if isinstance(v, dict):
                v = [v.get(i, NaN) for i in index]
            else:
                assert(len(v) == len(index))

            try:
                v = Series(v, dtype=dtype, index=index)
            except Exception:
                v = Series(v, index=index)

        if issubclass(v.dtype.type, (np.bool_, float, int)):
            valueDict[k] = v
        else:
            objectDict[k] = v

    if columns is None:
        columns = Index(_try_sort(valueDict))
        objectColumns = Index(_try_sort(objectDict))
    else:
        objectColumns = Index([c for c in columns if c in objectDict])
        columns = Index([c for c in columns if c not in objectDict])

    if len(valueDict) == 0:
        dtype = np.object_
        valueDict = objectDict
        columns = objectColumns
    else:
        dtypes = set(v.dtype for v in valueDict.values())
        if len(dtypes) > 1:
            dtype = np.float_
        else:
            dtype = list(dtypes)[0]

        if len(objectDict) > 0:
            new_objects = DataMatrix(objectDict,
                                     dtype=np.object_,
                                     index=index,
                                     columns=objectColumns)
            if isinstance(objects, DataMatrix):
                objects = objects.join(new_objects, how='left')
            else:
                objects = new_objects

    values = np.empty((len(index), len(columns)), dtype=dtype)

    for i, col in enumerate(columns):
        if col in valueDict:
            values[:, i] = valueDict[col]
        else:
            values[:, i] = np.NaN

    return index, columns, values, objects
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
             float_format=None, cols=None, header=True, index=True,
             index_label=None, mode='w', nanRep=None, encoding=None,
             compression=None, quoting=None, line_terminator='\n',
             chunksize=None, tupleize_cols=False, quotechar='"',
             date_format=None, doublequote=True, escapechar=None,
             decimal='.'):

    self.obj = obj

    if path_or_buf is None:
        path_or_buf = StringIO()

    self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
    self.sep = sep
    self.na_rep = na_rep
    self.float_format = float_format
    self.decimal = decimal

    self.header = header
    self.index = index
    self.index_label = index_label
    self.mode = mode
    self.encoding = encoding
    self.compression = compression

    if quoting is None:
        quoting = csvlib.QUOTE_MINIMAL
    self.quoting = quoting

    if quoting == csvlib.QUOTE_NONE:
        # prevents crash in _csv
        quotechar = None
    self.quotechar = quotechar

    self.doublequote = doublequote
    self.escapechar = escapechar

    self.line_terminator = line_terminator

    self.date_format = date_format

    self.tupleize_cols = tupleize_cols
    self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                           not self.tupleize_cols)

    # validate mi options
    if self.has_mi_columns:
        if cols is not None:
            raise TypeError("cannot specify cols with a MultiIndex on the "
                            "columns")

    if cols is not None:
        if isinstance(cols, Index):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)
        self.obj = self.obj.loc[:, cols]

    # update columns to include possible multiplicity of dupes
    # and make sure cols is just a list of labels
    cols = self.obj.columns
    if isinstance(cols, Index):
        cols = cols.to_native_types(na_rep=na_rep,
                                    float_format=float_format,
                                    date_format=date_format,
                                    quoting=self.quoting)
    else:
        cols = list(cols)

    # save it
    self.cols = cols

    # preallocate data 2d list
    self.blocks = self.obj._data.blocks
    ncols = sum(b.shape[0] for b in self.blocks)
    self.data = [None] * ncols

    if chunksize is None:
        chunksize = (100000 // (len(self.cols) or 1)) or 1
    self.chunksize = int(chunksize)

    self.data_index = obj.index
    if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
            date_format is not None):
        self.data_index = Index([x.strftime(date_format) if notna(x) else
                                 '' for x in self.data_index])

    self.nlevels = getattr(self.data_index, 'nlevels', 1)
    if not index:
        self.nlevels = 0
def __init__(
    self,
    data=None,
    index=None,
    columns=None,
    default_kind=None,
    default_fill_value=None,
    dtype=None,
    copy=False,
):
    if not is_scalar(default_fill_value):
        raise ValueError("'default_fill_value' must be a scalar")

    # `depr_msg` (the SparseDataFrame deprecation message) is expected to
    # be defined at module scope
    warnings.warn(depr_msg, FutureWarning, stacklevel=2)

    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, "name"):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = "block"

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if is_scipy_sparse(data):
        mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
                                  fill_value=default_fill_value)
    elif isinstance(data, dict):
        mgr = self._init_dict(data, index, columns, dtype=dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns, dtype=dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(data._data,
                             dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
    elif isinstance(data, Series):
        mgr = self._init_dict(data.to_frame(), data.index, columns=None,
                              dtype=dtype)
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif data is None:
        data = DataFrame()

        if index is None:
            index = Index([])
        else:
            index = ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            for c in columns:
                data[c] = SparseArray(
                    self._default_fill_value,
                    index=index,
                    kind=self._default_kind,
                    fill_value=self._default_fill_value,
                )
        mgr = to_manager(data, columns, index)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    else:
        msg = ('SparseDataFrame called with unknown type "{data_type}" '
               "for data argument")
        raise TypeError(msg.format(data_type=type(data).__name__))

    generic.NDFrame.__init__(self, mgr)
def __new__(cls, data, index=None, sparse_index=None, kind='block',
            fill_value=None, name=None, copy=False):

    is_sparse_array = isinstance(data, SparseArray)
    if fill_value is None:
        if is_sparse_array:
            fill_value = data.fill_value
        else:
            fill_value = nan

    if is_sparse_array:
        if isinstance(data, SparseSeries) and index is None:
            index = data.index
        elif index is not None:
            assert(len(index) == len(data))

        sparse_index = data.sp_index
        values = np.asarray(data)
    elif isinstance(data, (Series, dict)):
        if index is None:
            index = data.index

        data = Series(data)
        values, sparse_index = make_sparse(data, kind=kind,
                                           fill_value=fill_value)
    elif np.isscalar(data):  # pragma: no cover
        if index is None:
            raise Exception('must pass index!')

        values = np.empty(len(index))
        values.fill(data)

        # TODO: more efficient
        values, sparse_index = make_sparse(values, kind=kind,
                                           fill_value=fill_value)
    else:
        # array-like
        if sparse_index is None:
            values, sparse_index = make_sparse(data, kind=kind,
                                               fill_value=fill_value)
        else:
            values = data
            assert(len(values) == sparse_index.npoints)

    if index is None:
        index = Index(np.arange(sparse_index.length))
    index = _ensure_index(index)

    # Create array, do *not* copy data by default
    if copy:
        subarr = np.array(values, dtype=np.float64, copy=True)
    else:
        subarr = np.asarray(values, dtype=np.float64)

    if index.is_all_dates:
        cls = SparseTimeSeries

    # Change the class of the array to be the subclass type.
    output = subarr.view(cls)
    output.sp_index = sparse_index
    output.fill_value = np.float64(fill_value)
    output.index = index
    output.name = name
    return output
def _safe_append_to_index(index, key):
    """ a safe append to an index, if incorrect type, then catch and
    recreate
    """
    try:
        return index.insert(len(index), key)
    except Exception:
        # incompatible key type: fall back to rebuilding as object dtype
        return Index(np.concatenate([index.asobject.values,
                                     np.array([key])]))
def is_monotonic(self):
    # return if my group orderings are monotonic
    return Index(self.group_info[0]).is_monotonic
def _align_series(self, indexer, ser):
    # indexer to assign Series can be tuple or scalar
    if isinstance(indexer, tuple):

        aligners = [not _is_null_slice(idx) for idx in indexer]
        single_aligner = sum(aligners) == 1
        is_frame = self.obj.ndim == 2
        is_panel = self.obj.ndim >= 3

        # are we a single alignable value on a non-primary
        # dim (e.g. panel: 1,2, or frame: 0) ?
        # hence need to align to a single axis dimension
        # rather that find all valid dims

        # frame
        if is_frame:
            single_aligner = single_aligner and aligners[0]

        # panel
        elif is_panel:
            single_aligner = single_aligner and (aligners[1] or
                                                 aligners[2])

        obj = self.obj
        for i, idx in enumerate(indexer):
            ax = obj.axes[i]

            # multiple aligners (or null slices)
            if com._is_sequence(idx) or isinstance(idx, slice):
                if single_aligner and _is_null_slice(idx):
                    continue

                new_ix = ax[idx]
                if not is_list_like(new_ix):
                    new_ix = Index([new_ix])
                if ser.index.equals(new_ix):
                    return ser.values.copy()
                return ser.reindex(new_ix).values

            # 2 dims
            elif single_aligner and is_frame:

                # reindex along index
                ax = self.obj.axes[1]
                if ser.index.equals(ax):
                    return ser.values.copy()
                return ser.reindex(ax).values

            # >2 dims
            elif single_aligner:

                broadcast = []
                for n, labels in enumerate(self.obj._get_plane_axes(i)):

                    # reindex along the matching dimensions
                    if len(labels & ser.index):
                        ser = ser.reindex(labels)
                    else:
                        broadcast.append((n, len(labels)))

                # broadcast along other dims
                ser = ser.values.copy()
                for (axis, l) in broadcast:
                    shape = [-1] * (len(broadcast) + 1)
                    shape[axis] = l
                    ser = np.tile(ser, l).reshape(shape)

                if self.obj.ndim == 3:
                    ser = ser.T

                return ser

    elif np.isscalar(indexer):
        ax = self.obj._get_axis(1)

        if ser.index.equals(ax):
            return ser.values.copy()

        return ser.reindex(ax).values

    raise ValueError('Incompatible indexer with Series')
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : list, tuple, 1-d array, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if is_number(arg):
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = _ensure_object(values)
            coerce_numeric = False if errors in ('ignore', 'raise') else True
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)
    except Exception:
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and np.min(values) > 0:
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize < values.dtype.itemsize:
                    values = _possibly_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
def aggregate(self, func_or_funcs, *args, **kwargs):
    """
    Apply aggregation function or functions to groups, yielding most likely
    Series but in some cases DataFrame depending on the output of the
    aggregation function

    Parameters
    ----------
    func_or_funcs : function or list / dict of functions
        List/dict of functions will produce DataFrame with column names
        determined by the function names themselves (list) or the keys in
        the dict

    Notes
    -----
    agg is an alias for aggregate. Use it.

    Example
    -------
    >>> series
    bar    1.0
    baz    2.0
    qot    3.0
    qux    4.0

    >>> mapper = lambda x: x[0]  # first letter
    >>> grouped = series.groupby(mapper)

    >>> grouped.aggregate(np.sum)
    b    3.0
    q    7.0

    >>> grouped.aggregate([np.sum, np.mean, np.std])
       mean  std  sum
    b  1.5   0.5  3
    q  3.5   0.5  7

    >>> grouped.agg({'result' : lambda x: x.mean() / x.std(),
    ...              'total' : np.sum})
       result  total
    b  2.121   3
    q  4.95    7

    See also
    --------
    apply, transform

    Returns
    -------
    Series or DataFrame
    """
    if isinstance(func_or_funcs, basestring):
        return getattr(self, func_or_funcs)(*args, **kwargs)

    if hasattr(func_or_funcs, '__iter__'):
        ret = self._aggregate_multiple_funcs(func_or_funcs)
    else:
        if len(self.groupings) > 1:
            return self._python_agg_general(func_or_funcs, *args, **kwargs)

        try:
            return self._python_agg_general(func_or_funcs, *args, **kwargs)
        except Exception:
            result = self._aggregate_named(func_or_funcs, *args, **kwargs)

        index = Index(sorted(result), name=self.groupings[0].name)
        ret = Series(result, index=index)

    if not self.as_index:  # pragma: no cover
        print 'Warning, ignoring as_index=True'

    return ret
def get_chunk(self, rows=None):
    if rows is not None and self.skip_footer:
        raise ValueError('skip_footer not supported for iteration')

    try:
        content = self._get_lines(rows)
    except StopIteration:
        if self._first_chunk:
            content = []
        else:
            raise

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    if len(content) == 0:  # pragma: no cover
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = Index([], name=self.index_name)
            else:
                index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                               names=self.index_name)
        else:
            index = Index([])

        return DataFrame(index=index, columns=self.columns)

    zipped_content = list(lib.to_object_array(content).T)

    if not self._has_complex_date_col and self.index_col is not None:
        index = self._get_simple_index(zipped_content)
        index = self._agg_index(index)
    else:
        index = Index(np.arange(len(content)))

    col_len, zip_len = len(self.columns), len(zipped_content)
    if col_len != zip_len:
        row_num = -1
        for (i, l) in enumerate(content):
            if len(l) != col_len:
                break

        footers = 0
        if self.skip_footer:
            footers = self.skip_footer
        row_num = self.pos - (len(content) - i + footers)

        msg = ('Expecting %d columns, got %d in row %d' %
               (col_len, zip_len, row_num))
        raise ValueError(msg)

    data = dict((k, v) for k, v in izip(self.columns, zipped_content))

    # apply converters
    for col, f in self.converters.iteritems():
        if isinstance(col, int) and col not in self.columns:
            col = self.columns[col]
        data[col] = lib.map_infer(data[col], f)

    columns = list(self.columns)
    if self.parse_dates is not None:
        data, columns = self._process_date_conversion(data)

    data = _convert_to_ndarrays(data, self.na_values, self.verbose)

    df = DataFrame(data=data, columns=columns, index=index)
    if self._has_complex_date_col and self.index_col is not None:
        if not self._name_processed:
            self.index_name = self._get_index_name(list(columns))
            self._name_processed = True
        data = dict(((k, v) for k, v in df.iteritems()))

        index = self._get_complex_date_index(data, col_names=columns,
                                             parse_dates=False)
        index = self._agg_index(index, False)

        data = dict(((k, v.values) for k, v in data.iteritems()))
        df = DataFrame(data=data, columns=columns, index=index)

    if self.squeeze and len(df.columns) == 1:
        return df[df.columns[0]]
    return df
def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
    """
    given an object and the specifications, setup the internal grouper
    for this particular specification

    Parameters
    ----------
    obj : Series or DataFrame
    sort : bool, default False
        whether the resulting grouper should be sorted
    """
    assert obj is not None

    if self.key is not None and self.level is not None:
        raise ValueError("The Grouper cannot specify both a key and a level!")

    # Keep self.grouper value before overriding
    if self._grouper is None:
        self._grouper = self.grouper

    # the key must be a valid info item
    if self.key is not None:
        key = self.key
        # The 'on' is already defined
        if getattr(self.grouper, "name", None) == key and isinstance(
            obj, ABCSeries
        ):
            ax = self._grouper.take(obj.index)
        else:
            if key not in obj._info_axis:
                raise KeyError(
                    "The grouper name {key} is not found".format(key=key))
            ax = Index(obj[key], name=key)

    else:
        ax = obj._get_axis(self.axis)
        if self.level is not None:
            level = self.level

            # if a level is given it must be a mi level or
            # equivalent to the axis name
            if isinstance(ax, MultiIndex):
                level = ax._get_level_number(level)
                ax = Index(ax._get_level_values(level),
                           name=ax.names[level])

            else:
                if level not in (0, ax.name):
                    raise ValueError(
                        "The level {level} is not valid".format(level=level))

    # possibly sort
    if (self.sort or sort) and not ax.is_monotonic:
        # use stable sort to support first, last, nth
        indexer = self.indexer = ax.argsort(kind="mergesort")
        ax = ax.take(indexer)
        obj = obj.take(indexer, axis=self.axis, is_copy=False)

    self.obj = obj
    self.grouper = ax
    return self.grouper
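# Usage sketch (not part of the original source): pd.Grouper(key=...) routes
# through _set_grouper, turning the named column into the grouping axis.
#
# import pandas as pd
#
# df = pd.DataFrame({"key": ["a", "b", "a", "b"], "val": [1, 2, 3, 4]})
# df.groupby(pd.Grouper(key="key")).sum()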
def empty(types, size, cats=None, cols=None, index_types=None,
          index_names=None, timezones=None):
    """
    Create empty DataFrame to assign into

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string or None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary',
        'mo']} will create column index 1 (inserted amongst the numerical
        columns) with two possible values. If labels is an integer,
        `{'col': 5}`, will generate temporary labels using range. If None,
        or column name is missing, will assume 16-bit integers (a
        reasonable default).
    cols: list of labels
        assigned column names, including categorical ones.
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas
        series; the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - list of numpy views, in order, of the columns of the dataframe.
      Assign to this.
    """
    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[str(col)] = Categorical([], categories=cat(col),
                                       fastpath=True)
        else:
            d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and str(col) in timezones:
                d = Series(d).dt.tz_localize(timezones[str(col)])
            df[str(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            index._data._codes = vals
            views[col] = vals
            views[col + '-catdef'] = index._data
        else:
            d = np.empty(size, dtype=t)
            # if d.dtype.kind == "M" and str(col) in timezones:
            #     d = Series(d).dt.tz_localize(timezones[str(col)])
            index = Index(d)
            views[col] = index.values
    else:
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        index._levels = list()
        index._labels = list()
        for i, col in enumerate(index_names):
            if str(index_types[i]) == 'category':
                c = Categorical([], categories=cat(col), fastpath=True)
                z = CategoricalIndex(c)
                z._data._codes = c.categories._data
                z._set_categories = c._set_categories
                index._levels.append(z)

                vals = np.zeros(size, dtype=c.codes.dtype)
                index._labels.append(vals)

                views[col] = index._labels[i]
                views[col + '-catdef'] = index._levels[i]
            else:
                d = np.empty(size, dtype=index_types[i])
                # if d.dtype.kind == "M" and str(col) in timezones:
                #     d = Series(d).dt.tz_localize(timezones[str(col)])
                index._levels.append(Index(d))
                index._labels.append(np.arange(size, dtype=int))
                views[col] = index._levels[i]._data

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            new_shape = (size, )
            values = np.empty(shape=new_shape,
                              dtype=block.values.values.dtype)
            new_block = block.make_block_same_class(
                values=values, dtype=block.values.dtype)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = block.values.values
            else:
                views[col] = block.values[i]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
def __init__(self, data=None, index=None, columns=None, default_kind=None,
             default_fill_value=None, dtype=None, copy=False):

    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, 'name'):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = 'block'

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if is_scipy_sparse(data):
        mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
                                  fill_value=default_fill_value)
    elif isinstance(data, dict):
        mgr = self._init_dict(data, index, columns, dtype=dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns, dtype=dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(data._data,
                             dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif data is None:
        data = DataFrame()

        if index is None:
            index = Index([])
        else:
            index = _ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            for c in columns:
                data[c] = SparseArray(np.nan, index=index,
                                      kind=self._default_kind,
                                      fill_value=self._default_fill_value)
        mgr = to_manager(data, columns, index)

        if dtype is not None:
            mgr = mgr.astype(dtype)

    generic.NDFrame.__init__(self, mgr)
def match(needles, haystack):
    haystack = Index(haystack)
    needles = Index(needles)
    return haystack.get_indexer(needles)
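# Usage sketch (not part of the original source): get_indexer returns, for
# each needle, its position in the haystack, with -1 where not found.
#
# from pandas import Index
#
# haystack = Index(['a', 'b', 'c'])
# haystack.get_indexer(Index(['c', 'a', 'z']))  # -> array([ 2,  0, -1])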
def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels,
                                         xnull=False)

    if rlocs == []:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name='__placeholder__')
    else:
        dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                                 labels=rlabels + [comp_ids],
                                 names=rnames + ['__placeholder__'],
                                 verify_integrity=False)

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [v if i > v else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names, verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
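# Usage sketch (not part of the original source): unstacking several index
# levels at once goes through _unstack_multiple.
#
# import pandas as pd
#
# idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b'], ['x', 'y']],
#                                  names=['i', 'j', 'k'])
# s = pd.Series(range(8), index=idx)
# s.unstack(['j', 'k'])  # pivots the 'j' and 'k' levels into the columns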
def form_blocks(data, axes):
    # pre-filter out items if we passed it
    items = axes[0]

    if len(data) < len(items):
        extra_items = items - Index(data.keys())
    else:
        extra_items = []

    # put "leftover" items in float bucket, where else?
    # generalize?
    float_dict = {}
    complex_dict = {}
    int_dict = {}
    bool_dict = {}
    object_dict = {}
    datetime_dict = {}
    for k, v in data.iteritems():
        if issubclass(v.dtype.type, np.floating):
            float_dict[k] = v
        elif issubclass(v.dtype.type, np.complexfloating):
            complex_dict[k] = v
        elif issubclass(v.dtype.type, np.datetime64):
            datetime_dict[k] = v
        elif issubclass(v.dtype.type, np.integer):
            int_dict[k] = v
        elif v.dtype == np.bool_:
            bool_dict[k] = v
        else:
            object_dict[k] = v

    blocks = []

    if len(float_dict):
        float_block = _simple_blockify(float_dict, items, np.float64)
        blocks.append(float_block)

    if len(complex_dict):
        complex_block = _simple_blockify(complex_dict, items, np.complex128)
        blocks.append(complex_block)

    if len(int_dict):
        int_block = _simple_blockify(int_dict, items, np.int64)
        blocks.append(int_block)

    if len(datetime_dict):
        datetime_block = _simple_blockify(datetime_dict, items,
                                          np.dtype('M8[ns]'))
        blocks.append(datetime_block)

    if len(bool_dict):
        bool_block = _simple_blockify(bool_dict, items, np.bool_)
        blocks.append(bool_block)

    if len(object_dict) > 0:
        object_block = _simple_blockify(object_dict, items, np.object_)
        blocks.append(object_block)

    if len(extra_items):
        shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])

        block_values = np.empty(shape, dtype=float)
        block_values.fill(nan)

        na_block = make_block(block_values, extra_items, items,
                              do_integrity_check=True)
        blocks.append(na_block)
        blocks = _consolidate(blocks, items)

    return blocks
def form_blocks(arrays, names, axes):
    # pre-filter out items if we passed it
    items = axes[0]

    if len(arrays) < len(items):
        extra_items = items - Index(names)
    else:
        extra_items = []

    # put "leftover" items in float bucket, where else?
    # generalize?
    float_items = []
    complex_items = []
    int_items = []
    bool_items = []
    object_items = []
    datetime_items = []
    for k, v in zip(names, arrays):
        if issubclass(v.dtype.type, np.floating):
            float_items.append((k, v))
        elif issubclass(v.dtype.type, np.complexfloating):
            complex_items.append((k, v))
        elif issubclass(v.dtype.type, np.datetime64):
            if v.dtype != _NS_DTYPE:
                v = tslib.cast_to_nanoseconds(v)

            if hasattr(v, 'tz') and v.tz is not None:
                object_items.append((k, v))
            else:
                datetime_items.append((k, v))
        elif issubclass(v.dtype.type, np.integer):
            if v.dtype == np.uint64:
                # HACK #2355 definite overflow
                if (v > 2 ** 63 - 1).any():
                    object_items.append((k, v))
                    continue
            int_items.append((k, v))
        elif v.dtype == np.bool_:
            bool_items.append((k, v))
        else:
            object_items.append((k, v))

    blocks = []

    if len(float_items):
        float_block = _simple_blockify(float_items, items, np.float64)
        blocks.append(float_block)

    if len(complex_items):
        complex_block = _simple_blockify(complex_items, items, np.complex128)
        blocks.append(complex_block)

    if len(int_items):
        int_block = _simple_blockify(int_items, items, np.int64)
        blocks.append(int_block)

    if len(datetime_items):
        datetime_block = _simple_blockify(datetime_items, items, _NS_DTYPE)
        blocks.append(datetime_block)

    if len(bool_items):
        bool_block = _simple_blockify(bool_items, items, np.bool_)
        blocks.append(bool_block)

    if len(object_items) > 0:
        object_block = _simple_blockify(object_items, items, np.object_)
        blocks.append(object_block)

    if len(extra_items):
        shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])

        # empty items -> dtype object
        block_values = np.empty(shape, dtype=object)
        block_values.fill(nan)

        na_block = make_block(block_values, extra_items, items)
        blocks.append(na_block)
        blocks = _consolidate(blocks, items)

    return blocks
def __setitem__(self, key, value):
    """
    Item assignment.

    Raises
    ------
    ValueError
        If (one or more) value is not in the categories, or if an assigned
        `Categorical` does not have the same categories.
    """
    # require identical categories set
    if isinstance(value, Categorical):
        if not value.categories.equals(self.categories):
            raise ValueError("Cannot set a Categorical with another, "
                             "without identical categories")

    rvalue = value if com.is_list_like(value) else [value]
    to_add = Index(rvalue).difference(self.categories)

    # no assignments of values not in categories, but it's always ok to set
    # something to np.nan
    if len(to_add) and not isnull(to_add).all():
        raise ValueError("cannot setitem on a Categorical with a new "
                         "category, set the categories first")

    # set by position
    if isinstance(key, (int, np.integer)):
        pass

    # tuple of indexers (dataframe)
    elif isinstance(key, tuple):
        # only allow 1 dimensional slicing, but can
        # in a 2-d case be passed (slice(None),....)
        if len(key) == 2:
            if not _is_null_slice(key[0]):
                raise AssertionError("invalid slicing for a 1-ndim "
                                     "categorical")
            key = key[1]
        elif len(key) == 1:
            key = key[0]
        else:
            raise AssertionError("invalid slicing for a 1-ndim categorical")

    # slicing in Series or Categorical
    elif isinstance(key, slice):
        pass

    # Array of True/False in Series or Categorical
    else:
        # There is a bug in numpy, which does not accept a Series as a
        # indexer
        # https://github.com/pydata/pandas/issues/6168
        # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9
        # FIXME: remove when numpy 1.9 is the lowest numpy version pandas
        # accepts...
        key = np.asarray(key)

    lindexer = self.categories.get_indexer(rvalue)

    # FIXME: the following can be removed after
    # https://github.com/pydata/pandas/issues/7820 is fixed.
    # float categories do currently return -1 for np.nan, even if np.nan is
    # included in the index -> "repair" this here
    if isnull(rvalue).any() and isnull(self.categories).any():
        nan_pos = np.where(com.isnull(self.categories))[0]
        lindexer[lindexer == -1] = nan_pos

    self._codes[key] = lindexer
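# Usage sketch (not part of the original source): assignment into a
# Categorical only accepts existing categories (or NaN).
#
# import pandas as pd
#
# c = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
# c[0] = 'b'    # fine: 'b' is an existing category
# # c[0] = 'z'  # would raise ValueError: 'z' is not in the categories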