def union_many(self, others):
    """
    A bit of a hack to accelerate unioning a collection of indexes
    """
    this = self

    for other in others:
        if not isinstance(this, DatetimeIndex):
            this = Index.union(this, other)
            continue

        if not isinstance(other, DatetimeIndex):
            try:
                other = DatetimeIndex(other)
            except TypeError:
                pass

        this, other = this._maybe_utc_convert(other)

        if this._can_fast_union(other):
            this = this._fast_union(other)
        else:
            tz = this.tz
            this = Index.union(this, other)
            if isinstance(this, DatetimeIndex):
                this.tz = tz

    if this.freq is None:
        this.offset = to_offset(this.inferred_freq)
    return this
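# Illustrative sketch (not part of the source): union_many above folds a list
# of indexes together in one pass; the equivalent with the public API is a
# plain reduction with Index.union, assuming pandas is importable as pd.
import pandas as pd

parts = [pd.date_range('2011-01-01', periods=3, freq='D'),
         pd.date_range('2011-01-03', periods=3, freq='D'),
         pd.date_range('2011-01-10', periods=3, freq='D')]
combined = parts[0]
for p in parts[1:]:
    combined = combined.union(p)  # the fold that union_many accelerates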
def _reindex_columns(self, columns):
    if len(columns) == 0:
        return DataMatrix(index=self.index)

    if not isinstance(columns, Index):
        columns = Index(columns)

    if self.objects is not None:
        object_columns = columns.intersection(self.objects.columns)
        columns = columns - object_columns
        objects = self.objects._reindex_columns(object_columns)
    else:
        objects = None

    if len(columns) > 0 and len(self.columns) == 0:
        return DataMatrix(index=self.index, columns=columns,
                          objects=objects)

    indexer, mask = common.get_indexer(self.columns, columns, None)
    mat = self.values.take(indexer, axis=1)

    notmask = -mask
    if len(mask) > 0:
        if notmask.any():
            if issubclass(mat.dtype.type, np.int_):
                mat = mat.astype(float)
            elif issubclass(mat.dtype.type, np.bool_):
                mat = mat.astype(float)

            common.null_out_axis(mat, notmask, 1)

    return DataMatrix(mat, index=self.index, columns=columns,
                      objects=objects)
def union(self, other):
    """
    Specialized union for DateRange objects. When combining overlapping
    ranges that share the same DateOffset, this is much faster than
    Index.union.

    Parameters
    ----------
    other : DateRange or array-like

    Returns
    -------
    y : Index or DateRange
    """
    if not isinstance(other, DateRange) or other.offset != self.offset:
        return Index.union(self.view(Index), other)

    offset = self.offset

    # to make our life easier, "sort" the two ranges
    if self[0] <= other[0]:
        left, right = self, other
    else:
        left, right = other, self

    left_end = left[-1]
    right_start = right[0]

    # Only need to "adjoin", not overlap
    if (left_end + offset) >= right_start:
        return left._fast_union(right)
    else:
        return Index.union(self, other)
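# Hedged sketch of the fast path above (assumes the legacy DateRange API that
# predates DatetimeIndex; kept as comments because it does not exist in
# current pandas, and ``datetools.bday`` is assumed from that era):
#
#   a = DateRange('2011-01-03', '2011-01-14', offset=datetools.bday)
#   b = DateRange('2011-01-12', '2011-01-28', offset=datetools.bday)
#   a.union(b)   # same offset, ranges adjoin -> _fast_union, not Index.union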
def get_loc(self, key, method=None, tolerance=None):
    """
    Get integer location for requested label

    Returns
    -------
    loc : int
    """
    if tolerance is not None:
        # try converting tolerance now, so errors don't get swallowed by
        # the try/except clauses below
        tolerance = self._convert_tolerance(tolerance)

    if _is_convertible_to_td(key):
        key = Timedelta(key)
        return Index.get_loc(self, key, method, tolerance)

    try:
        return Index.get_loc(self, key, method, tolerance)
    except (KeyError, ValueError, TypeError):
        try:
            return self._get_string_slice(key)
        except (TypeError, KeyError, ValueError):
            pass

        try:
            stamp = Timedelta(key)
            return Index.get_loc(self, stamp, method, tolerance)
        except (KeyError, ValueError):
            raise KeyError(key)
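# Illustrative usage (not from the source); assumes the public TimedeltaIndex
# API, where strings convertible to Timedelta take the fast path above.
import pandas as pd

tdi = pd.TimedeltaIndex(['1 days', '2 days', '3 days'])
loc_str = tdi.get_loc('2 days')               # string coerced to Timedelta -> 1
loc_td = tdi.get_loc(pd.Timedelta('3 days'))  # already a Timedelta -> 2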
def test_format_with_name_time_info(self):
    # bug I fixed 12/20/2011
    inc = timedelta(hours=4)
    dates = Index([dt + inc for dt in self.dateIndex], name='something')

    formatted = dates.format(name=True)
    self.assert_(formatted[0] == 'something')
def rename_axis(self, mapper, axis=1):
    new_axis = Index([mapper(x) for x in self.axes[axis]])
    new_axis._verify_integrity()

    new_axes = list(self.axes)
    new_axes[axis] = new_axis
    return BlockManager(self.blocks, new_axes)
def test_join_non_int_index(self):
    other = Index([3, 6, 7, 8, 10], dtype=object)

    outer = self.index.join(other, how='outer')
    outer2 = other.join(self.index, how='outer')
    expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18],
                     dtype=object)
    self.assert_(outer.equals(outer2))
    self.assert_(outer.equals(expected))

    inner = self.index.join(other, how='inner')
    inner2 = other.join(self.index, how='inner')
    expected = Index([6, 8, 10], dtype=object)
    self.assert_(inner.equals(inner2))
    self.assert_(inner.equals(expected))

    left = self.index.join(other, how='left')
    self.assert_(left.equals(self.index))

    left2 = other.join(self.index, how='left')
    self.assert_(left2.equals(other))

    right = self.index.join(other, how='right')
    self.assert_(right.equals(other))

    right2 = other.join(self.index, how='right')
    self.assert_(right2.equals(self.index))
def get_loc(self, key, method=None):
    """
    Get integer location for requested label

    Returns
    -------
    loc : int
    """
    if _is_convertible_to_td(key):
        key = Timedelta(key)
        return Index.get_loc(self, key, method=method)

    try:
        return Index.get_loc(self, key, method=method)
    except (KeyError, ValueError, TypeError):
        try:
            return self._get_string_slice(key)
        except (TypeError, KeyError, ValueError):
            pass

        try:
            stamp = Timedelta(key)
            return Index.get_loc(self, stamp, method=method)
        except (KeyError, ValueError):
            raise KeyError(key)
def _make_labels(self):
    if self._was_factor:  # pragma: no cover
        raise Exception('Should not call this method grouping by level')
    else:
        values = self.grouper
        if values.dtype != np.object_:
            values = values.astype('O')

        # khash
        rizer = lib.Factorizer(len(values))
        labels, counts = rizer.factorize(values, sort=False)
        uniques = Index(rizer.uniques, name=self.name)

        if self.sort and len(counts) > 0:
            sorter = uniques.argsort()
            reverse_indexer = np.empty(len(sorter), dtype=np.int32)
            reverse_indexer.put(sorter, np.arange(len(sorter)))

            mask = labels < 0
            labels = reverse_indexer.take(labels)
            np.putmask(labels, mask, -1)

            uniques = uniques.take(sorter)
            counts = counts.take(sorter)

    self._labels = labels
    self._group_index = uniques
    self._counts = counts
def test_format_datetime_with_time(self):
    t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)])

    result = t.format()
    expected = ['2012-02-07 00:00:00', '2012-02-07 23:00:00']
    self.assert_(len(result) == 2)
    self.assertEquals(result, expected)
def test_format(self):
    self._check_method_works(Index.format)

    index = Index([datetime.now()])
    formatted = index.format()
    # Index.format returns a list of strings, so compare against a list
    expected = [str(index[0])]
    self.assertEquals(formatted, expected)
def test_intersect_str_dates(self):
    dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

    i1 = Index(dt_dates, dtype=object)
    i2 = Index(["aa"], dtype=object)
    res = i2.intersection(i1)

    self.assert_(len(res) == 0)
def test_intersection(self):
    other = Index([1, 2, 3, 4, 5])
    result = self.index.intersection(other)
    expected = np.sort(np.intersect1d(self.index.values, other.values))
    self.assert_(np.array_equal(result, expected))

    result = other.intersection(self.index)
    expected = np.sort(np.asarray(np.intersect1d(self.index.values,
                                                 other.values)))
    self.assert_(np.array_equal(result, expected))
def test_reindex_level(self):
    idx = Index(["one"])

    target, indexer = self.index.reindex(idx, level="second")
    target2, indexer2 = idx.reindex(self.index, level="second")

    exp_index = self.index.join(idx, level="second", how="left")
    self.assert_(target.equals(exp_index))
    self.assert_(target2.equals(exp_index))
def intersection(self, other):
    """
    Specialized intersection for DatetimeIndex objects. May be much faster
    than Index.intersection

    Parameters
    ----------
    other : DatetimeIndex or array-like

    Returns
    -------
    y : Index or DatetimeIndex
    """
    if not isinstance(other, DatetimeIndex):
        try:
            other = DatetimeIndex(other)
        except TypeError:
            pass
        result = Index.intersection(self, other)
        if isinstance(result, DatetimeIndex):
            if result.freq is None:
                result.offset = to_offset(result.inferred_freq)
        return result
    elif (other.offset is None or self.offset is None or
          other.offset != self.offset or
          not other.offset.isAnchored() or
          (not self.is_monotonic or not other.is_monotonic)):
        result = Index.intersection(self, other)
        if isinstance(result, DatetimeIndex):
            if result.freq is None:
                result.offset = to_offset(result.inferred_freq)
        return result

    if len(self) == 0:
        return self
    if len(other) == 0:
        return other

    # to make our life easier, "sort" the two ranges
    if self[0] <= other[0]:
        left, right = self, other
    else:
        left, right = other, self

    end = min(left[-1], right[-1])
    start = right[0]

    if end < start:
        return type(self)(data=[])
    else:
        lslice = slice(*left.slice_locs(start, end))
        left_chunk = left.values[lslice]
        return self._view_like(left_chunk)
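# Illustrative usage (not from the source): two overlapping daily ranges with
# the same anchored frequency, the situation where the slice-based fast path
# above applies instead of a generic Index.intersection.
import pandas as pd

left = pd.date_range('2011-01-01', periods=10, freq='D')
right = pd.date_range('2011-01-05', periods=10, freq='D')
overlap = left.intersection(right)  # 2011-01-05 through 2011-01-10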
def test_append_multiple(self):
    index = Index(['a', 'b', 'c', 'd', 'e', 'f'])

    foos = [index[:2], index[2:4], index[4:]]
    result = foos[0].append(foos[1:])
    self.assert_(result.equals(index))

    # empty
    result = index.append([])
    self.assert_(result.equals(index))
def test_slice_locs(self):
    idx = Index([0, 1, 2, 5, 6, 7, 9, 10])
    n = len(idx)

    self.assertEquals(idx.slice_locs(start=2), (2, n))
    self.assertEquals(idx.slice_locs(start=3), (3, n))
    self.assertEquals(idx.slice_locs(3, 8), (3, 6))
    self.assertEquals(idx.slice_locs(5, 10), (3, n))
    self.assertEquals(idx.slice_locs(end=8), (0, 6))
    self.assertEquals(idx.slice_locs(end=9), (0, 7))
def _sort_labels(uniques, left, right):
    if not isinstance(uniques, np.ndarray):
        # tuplesafe
        uniques = Index(uniques).values

    sorter = uniques.argsort()

    reverse_indexer = np.empty(len(sorter), dtype=np.int32)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    return reverse_indexer.take(left), reverse_indexer.take(right)
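# Small numpy sketch (not from the source) of the argsort / "reverse indexer"
# trick used above: relabel codes that point into unsorted uniques so that
# they point into the sorted uniques instead.
import numpy as np

uniques = np.array(['b', 'c', 'a'])
labels = np.array([0, 2, 1, 0])            # codes into the unsorted uniques
sorter = uniques.argsort()                 # order that sorts uniques -> [2, 0, 1]
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
relabelled = reverse_indexer.take(labels)  # [1, 0, 2, 1], codes into ['a', 'b', 'c']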
def test_union_noncomparable(self):
    from datetime import datetime, timedelta
    # corner case, non-Int64Index
    now = datetime.now()
    other = Index([now + timedelta(i) for i in xrange(4)])
    result = self.index.union(other)
    expected = np.concatenate((self.index, other))
    self.assert_(np.array_equal(result, expected))

    result = other.union(self.index)
    expected = np.concatenate((other, self.index))
    self.assert_(np.array_equal(result, expected))
def test_append_empty_preserve_name(self):
    left = Index([], name="foo")
    right = Index([1, 2, 3], name="foo")
    result = left.append(right)
    self.assert_(result.name == "foo")

    left = Index([], name="foo")
    right = Index([1, 2, 3], name="bar")
    result = left.append(right)
    self.assert_(result.name is None)
def rename_items(self, mapper, copydata=True):
    new_items = Index([mapper(x) for x in self.items])
    new_items._verify_integrity()

    new_blocks = []
    for block in self.blocks:
        newb = block.copy(deep=copydata)
        newb.set_ref_items(new_items, maybe_rename=True)
        new_blocks.append(newb)
    new_axes = list(self.axes)
    new_axes[0] = new_items
    return BlockManager(new_blocks, new_axes)
def union(self, other):
    if isinstance(other, DateRange) and other.offset == self.offset:
        # overlap condition
        if self[-1] >= other[0] or other[-1] >= self[0]:
            start = min(self[0], other[0])
            end = max(self[-1], other[-1])

            return DateRange(start, end, offset=self.offset)
        else:
            return Index.union(self, other)
    else:
        return Index.union(self, other)
def _set_grouper(self, obj, sort=False):
    """
    given an object and the specifications, setup the internal grouper for
    this particular specification

    Parameters
    ----------
    obj : the subject object
    """
    # NOTE: the following code is based on the base Grouper class with
    # additional hook to specify custom sorter

    if self.key is not None and self.level is not None:
        raise ValueError(
            "The Grouper cannot specify both a key and a level!")

    # the key must be a valid info item
    if self.key is not None:
        key = self.key
        if key not in obj._info_axis:
            raise KeyError("The grouper name {0} is not found".format(key))
        ax = Index(obj[key], name=key)
    else:
        ax = obj._get_axis(self.axis)
        if self.level is not None:
            level = self.level

            # if a level is given it must be a mi level or
            # equivalent to the axis name
            if isinstance(ax, MultiIndex):
                level = ax._get_level_number(level)
                ax = Index(ax.get_level_values(level),
                           name=ax.names[level])
            else:
                if level not in (0, ax.name):
                    raise ValueError(
                        "The level {0} is not valid".format(level))

    # possibly sort
    if (self.sort or sort) and not ax.is_monotonic:
        # The following line is different from the base class for
        # possible extension.
        ax, indexer = self._make_sorter(ax)
        self.indexer = indexer
        obj = obj.take(indexer, axis=self.axis,
                       convert=False, is_copy=False)

    self.obj = obj
    self.grouper = ax
    return self.grouper
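# Illustrative usage (not from the source): the standard pandas Grouper that
# the customised _set_grouper above mirrors, resolving a column key into the
# grouping axis.
import pandas as pd

df = pd.DataFrame({'key': ['b', 'a', 'b'], 'val': [1, 2, 3]})
totals = df.groupby(pd.Grouper(key='key'))['val'].sum()  # a -> 2, b -> 4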
def test_isin(self):
    values = ['foo', 'bar']

    idx = Index(['qux', 'baz', 'foo', 'bar'])
    result = idx.isin(values)
    expected = np.array([False, False, True, True])
    self.assert_(np.array_equal(result, expected))

    # empty, return dtype bool
    idx = Index([])
    result = idx.isin(values)
    self.assert_(len(result) == 0)
    self.assert_(result.dtype == np.bool_)
def test_slice_locs(self):
    idx = Index([0, 1, 2, 5, 6, 7, 9, 10])
    n = len(idx)

    self.assertEquals(idx.slice_locs(start=2), (2, n))
    self.assertEquals(idx.slice_locs(start=3), (3, n))
    self.assertEquals(idx.slice_locs(3, 8), (3, 6))
    self.assertEquals(idx.slice_locs(5, 10), (3, n))
    self.assertEquals(idx.slice_locs(end=8), (0, 6))
    self.assertEquals(idx.slice_locs(end=9), (0, 7))

    idx2 = idx[::-1]
    self.assertRaises(KeyError, idx2.slice_locs, 8, 2)
    self.assertRaises(KeyError, idx2.slice_locs, 7, 3)
def __setstate__(self, aug_state):
    """Necessary for making this object picklable"""
    index_state = aug_state[:1]
    offset = aug_state[1]

    # for backwards compatibility
    if len(aug_state) > 2:
        tzinfo = aug_state[2]
    else:  # pragma: no cover
        tzinfo = None

    self.offset = offset
    self.tzinfo = tzinfo
    Index.__setstate__(self, *index_state)
def _init_dict(self, data, index, columns, dtype=None): # pre-filter out columns if we passed it if columns is not None: columns = ensure_index(columns) data = {k: v for k, v in compat.iteritems(data) if k in columns} else: keys = com.dict_keys_to_ordered_list(data) columns = Index(keys) if index is None: index = extract_index(list(data.values())) def sp_maker(x): return SparseArray(x, kind=self._default_kind, fill_value=self._default_fill_value, copy=True, dtype=dtype) sdict = {} for k, v in compat.iteritems(data): if isinstance(v, Series): # Force alignment, no copy necessary if not v.index.equals(index): v = v.reindex(index) if not isinstance(v, SparseSeries): v = sp_maker(v.values) elif isinstance(v, SparseArray): v = v.copy() else: if isinstance(v, dict): v = [v.get(i, np.nan) for i in index] v = sp_maker(v) if index is not None and len(v) != len(index): msg = "Length of passed values is {}, index implies {}" raise ValueError(msg.format(len(v), len(index))) sdict[k] = v if len(columns.difference(sdict)): # TODO: figure out how to handle this case, all nan's? # add in any other columns we want to have (completeness) nan_arr = np.empty(len(index), dtype='float64') nan_arr.fill(np.nan) nan_arr = SparseArray(nan_arr, kind=self._default_kind, fill_value=self._default_fill_value, copy=False) sdict.update((c, nan_arr) for c in columns if c not in sdict) return to_manager(sdict, columns, index)
def shift(self, n, offset=None):
    """
    Specialized shift which produces a DateRange

    Parameters
    ----------
    n : int
        Periods to shift by
    offset : DateOffset or timedelta-like, optional

    Returns
    -------
    shifted : DateRange
    """
    if offset is not None and offset != self.offset:
        return Index.shift(self, n, offset)

    if n == 0:
        # immutable so OK
        return self

    start = self[0] + n * self.offset
    end = self[-1] + n * self.offset
    return DateRange(start, end, offset=self.offset, name=self.name)
def get_value(self, series, key):
    """
    Fast lookup of value from 1-dimensional ndarray. Only use this if you
    know what you're doing
    """
    try:
        return Index.get_value(self, series, key)
    except KeyError:
        try:
            loc = self._get_string_slice(key)
            return series[loc]
        except (TypeError, ValueError, KeyError):
            pass

        if isinstance(key, time):
            locs = self._indices_at_time(key)
            return series.take(locs)

        stamp = Timestamp(key)
        try:
            return self._engine.get_value(series, stamp)
        except KeyError:
            raise KeyError(stamp)
def get_value(self, series, key):
    """
    Fast lookup of value from 1-dimensional ndarray. Only use this if you
    know what you're doing
    """
    if _is_convertible_to_td(key):
        key = Timedelta(key)
        return self.get_value_maybe_box(series, key)

    try:
        return _maybe_box(self, Index.get_value(self, series, key),
                          series, key)
    except KeyError:
        try:
            loc = self._get_string_slice(key)
            return series[loc]
        except (TypeError, ValueError, KeyError):
            pass

        try:
            return self.get_value_maybe_box(series, key)
        except (TypeError, ValueError, KeyError):
            raise KeyError(key)
def shift(self, n, freq=None):
    """
    Specialized shift which produces a DatetimeIndex

    Parameters
    ----------
    n : int
        Periods to shift by
    freq : DateOffset or timedelta-like, optional

    Returns
    -------
    shifted : DatetimeIndex
    """
    if freq is not None and freq != self.offset:
        if isinstance(freq, basestring):
            freq = to_offset(freq)
        result = Index.shift(self, n, freq)
        result.tz = self.tz

        return result

    if n == 0:
        # immutable so OK
        return self

    if self.offset is None:
        raise ValueError("Cannot shift with no offset")

    start = self[0] + n * self.offset
    end = self[-1] + n * self.offset
    return DatetimeIndex(start=start, end=end, freq=self.offset,
                         name=self.name, tz=self.tz)
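# Illustrative usage (not from the source): shifting a regular DatetimeIndex
# by whole periods of its own frequency, the cheap endpoint-only path above.
import pandas as pd

idx = pd.date_range('2011-01-01', periods=5, freq='D')
shifted = idx.shift(2)  # 2011-01-03 through 2011-01-07, frequency preserved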
def _wrap_result(self, result, **kwargs):
    # leave as it is to keep extract and get_dummies results
    # can be merged to _wrap_result_expand in v0.17
    from pandas.core.series import Series
    from pandas.core.frame import DataFrame
    from pandas.core.index import Index

    if not hasattr(result, 'ndim'):
        return result
    name = (kwargs.get('name') or getattr(result, 'name', None) or
            self.series.name)

    if result.ndim == 1:
        if isinstance(self.series, Index):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result
            return Index(result, name=name)
        return Series(result, index=self.series.index, name=name)
    else:
        assert result.ndim < 3
        return DataFrame(result, index=self.series.index)
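# Illustrative sketch (not from the source) of the GH 8875 behaviour handled
# above: boolean results of .str methods on an Index come back as a plain
# numpy array so they can be used directly as a mask.
import pandas as pd

idx = pd.Index(['apple', 'banana', 'cherry'])
mask = idx.str.startswith('b')  # numpy bool array, not a boolean Index
upper = idx.str.upper()         # non-boolean result is wrapped as an Index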
def test_join_level(self): def _check_how(other, how): join_index, lidx, ridx = other.join(self.index, how=how, level='second', return_indexers=True) exp_level = other.join(self.index.levels[1], how=how) self.assert_(join_index.levels[0].equals(self.index.levels[0])) self.assert_(join_index.levels[1].equals(exp_level)) # pare down levels mask = np.array([x[1] in exp_level for x in self.index], dtype=bool) exp_values = self.index.values[mask] self.assert_(np.array_equal(join_index.values, exp_values)) if how in ('outer', 'inner'): join_index2, ridx2, lidx2 = \ self.index.join(other, how=how, level='second', return_indexers=True) self.assert_(join_index.equals(join_index2)) self.assert_(np.array_equal(lidx, lidx2)) self.assert_(np.array_equal(ridx, ridx2)) self.assert_(np.array_equal(join_index2.values, exp_values)) def _check_all(other): _check_how(other, 'outer') _check_how(other, 'inner') _check_how(other, 'left') _check_how(other, 'right') _check_all(Index(['three', 'one', 'two'])) _check_all(Index(['one'])) _check_all(Index(['one', 'three'])) # some corner cases idx = Index(['three', 'one', 'two']) result = idx.join(self.index, level='second') self.assert_(isinstance(result, MultiIndex)) self.assertRaises(Exception, self.index.join, self.index, level=1)
def _default_index(n):
    from pandas.core.index import Index
    return Index(np.arange(n))
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None, timezones=None): """ Create empty DataFrame to assign into In the simplest case, will return a Pandas dataframe of the given size, with columns of the given names and types. The second return value `views` is a dictionary of numpy arrays into which you can assign values that show up in the dataframe. For categorical columns, you get two views to assign into: if the column name is "col", you get both "col" (the category codes) and "col-catdef" (the category labels). For a single categorical index, you should use the `.set_categories` method of the appropriate "-catdef" columns, passing an Index of values ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)`` Multi-indexes work a lot like categoricals, even if the types of each index are not themselves categories, and will also have "-catdef" entries in the views. However, these will be Dummy instances, providing only a ``.set_categories`` method, to be used as above. Parameters ---------- types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples applies to non-categorical columns. If there are only categorical columns, an empty string of None will do. size: int Number of rows to allocate cats: dict {col: labels} Location and labels for categorical columns, e.g., {1: ['mary', 'mo]} will create column index 1 (inserted amongst the numerical columns) with two possible values. If labels is an integers, `{'col': 5}`, will generate temporary labels using range. If None, or column name is missing, will assume 16-bit integers (a reasonable default). cols: list of labels assigned column names, including categorical ones. index_types: list of str For one of more index columns, make them have this type. See general description, above, for caveats about multi-indexing. If None, the index will be the default RangeIndex. index_names: list of str Names of the index column(s), if using timezones: dict {col: timezone_str} for timestamp type columns, apply this timezone to the pandas series; the numpy view will be UTC. Returns ------- - dataframe with correct shape and data-types - list of numpy views, in order, of the columns of the dataframe. Assign to this. 
""" views = {} timezones = timezones or {} if isinstance(types, STR_TYPE): types = types.split(',') cols = cols if cols is not None else range(len(types)) def cat(col): if cats is None or col not in cats: return RangeIndex(0, 2**14) elif isinstance(cats[col], int): return RangeIndex(0, cats[col]) else: # explicit labels list return cats[col] df = OrderedDict() for t, col in zip(types, cols): if str(t) == 'category': df[six.text_type(col)] = Categorical([], categories=cat(col), fastpath=True) else: d = np.empty(0, dtype=t) if d.dtype.kind == "M" and six.text_type(col) in timezones: d = Series(d).dt.tz_localize(timezones[six.text_type(col)]) df[six.text_type(col)] = d df = DataFrame(df) if not index_types: index = RangeIndex(size) elif len(index_types) == 1: t, col = index_types[0], index_names[0] if col is None: raise ValueError('If using an index, must give an index name') if str(t) == 'category': c = Categorical([], categories=cat(col), fastpath=True) vals = np.zeros(size, dtype=c.codes.dtype) index = CategoricalIndex(c) index._data._codes = vals views[col] = vals views[col+'-catdef'] = index._data else: d = np.empty(size, dtype=t) index = Index(d) views[col] = index.values else: index = MultiIndex([[]], [[]]) # index = MultiIndex.from_arrays(indexes) index._levels = list() index._labels = list() for i, col in enumerate(index_names): index._levels.append(Index([None])) def set_cats(values, i=i, col=col, **kwargs): values.name = col if index._levels[i][0] is None: index._levels[i] = values elif not index._levels[i].equals(values): raise RuntimeError("Different dictionaries encountered" " while building categorical") x = Dummy() x._set_categories = set_cats d = np.zeros(size, dtype=int) index._labels.append(d) views[col] = d views[col+'-catdef'] = x axes = [df._data.axes[0], index] # allocate and create blocks blocks = [] for block in df._data.blocks: if block.is_categorical: categories = block.values.categories code = np.zeros(shape=size, dtype=block.values.codes.dtype) values = Categorical(values=code, categories=categories, fastpath=True) new_block = block.make_block_same_class(values=values) elif getattr(block.dtype, 'tz', None): new_shape = (size, ) values = np.empty(shape=new_shape, dtype="M8[ns]") new_block = block.make_block_same_class( values=values, dtype=block.values.dtype) else: new_shape = (block.values.shape[0], size) values = np.empty(shape=new_shape, dtype=block.values.dtype) new_block = block.make_block_same_class(values=values) blocks.append(new_block) # create block manager df = DataFrame(BlockManager(blocks, axes)) # create views for block in df._data.blocks: dtype = block.dtype inds = block.mgr_locs.indexer if isinstance(inds, slice): inds = list(range(inds.start, inds.stop, inds.step)) for i, ind in enumerate(inds): col = df.columns[ind] if is_categorical_dtype(dtype): views[col] = block.values._codes views[col+'-catdef'] = block.values elif getattr(block.dtype, 'tz', None): views[col] = np.asarray(block.values, dtype='M8[ns]') else: views[col] = block.values[i] if index_names: df.index.names = [ None if re.match(r'__index_level_\d+__', n) else n for n in index_names ] return df, views
def asobject(self):
    from pandas.core.index import Index
    return Index(self._box_values(self.asi8), name=self.name, dtype=object)
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:
        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.categories)),
                                fill_value=0)
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
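# Illustrative usage (not from the source) of the public entry point that the
# helper above backs; assumes a plain object Series.
import pandas as pd

s = pd.Series(['a', 'b', 'a', None])
counts = s.value_counts()               # a: 2, b: 1 (missing value dropped)
with_na = s.value_counts(dropna=False)  # also counts the missing value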
class SparsePanel(Panel): """ Sparse version of Panel Parameters ---------- frames : dict of DataFrame objects items : array-like major_axis : array-like minor_axis : array-like default_kind : {'block', 'integer'}, default 'block' Default sparse kind for converting Series to SparseSeries. Will not override SparseSeries passed into constructor default_fill_value : float Default fill_value for converting Series to SparseSeries. Will not override SparseSeries passed in Notes ----- """ ndim = 3 _typ = 'panel' _subtyp = 'sparse_panel' def __init__(self, frames, items=None, major_axis=None, minor_axis=None, default_fill_value=np.nan, default_kind='block', copy=False): if isinstance(frames, np.ndarray): new_frames = {} for item, vals in zip(items, frames): new_frames[item] = \ SparseDataFrame(vals, index=major_axis, columns=minor_axis, default_fill_value=default_fill_value, default_kind=default_kind) frames = new_frames if not (isinstance(frames, dict)): raise AssertionError() self.default_fill_value = fill_value = default_fill_value self.default_kind = kind = default_kind # pre-filter, if necessary if items is None: items = Index(sorted(frames.keys())) items = _ensure_index(items) (clean_frames, major_axis, minor_axis) = _convert_frames(frames, major_axis, minor_axis, kind=kind, fill_value=fill_value) self._frames = clean_frames # do we want to fill missing ones? for item in items: if item not in clean_frames: raise Exception('column %s not found in data' % item) self._items = items self.major_axis = major_axis self.minor_axis = minor_axis def _consolidate_inplace(self): # pragma: no cover # do nothing when DataFrame calls this method pass def __array_wrap__(self, result): return SparsePanel(result, items=self.items, major_axis=self.major_axis, minor_axis=self.minor_axis, default_kind=self.default_kind, default_fill_value=self.default_fill_value) @classmethod def from_dict(cls, data): """ Analogous to Panel.from_dict """ return SparsePanel(data) def to_dense(self): """ Convert SparsePanel to (dense) Panel Returns ------- dense : Panel """ return Panel(self.values, self.items, self.major_axis, self.minor_axis) def as_matrix(self): return self.values @property def values(self): # return dense values return np.array([self._frames[item].values for item in self.items]) # need a special property for items to make the field assignable _items = None def _get_items(self): return self._items def _set_items(self, new_items): new_items = _ensure_index(new_items) if isinstance(new_items, MultiIndex): raise NotImplementedError # need to create new frames dict old_frame_dict = self._frames old_items = self._items self._frames = dict((new_k, old_frame_dict[old_k]) for new_k, old_k in zip(new_items, old_items)) self._items = new_items items = property(fget=_get_items, fset=_set_items) # DataFrame's index major_axis = SparsePanelAxis('_major_axis', 'index') # DataFrame's columns / "items" minor_axis = SparsePanelAxis('_minor_axis', 'columns') def _get_item_cache(self, key): return self._frames[key] def __setitem__(self, key, value): if isinstance(value, DataFrame): value = value.reindex(index=self.major_axis, columns=self.minor_axis) if not isinstance(value, SparseDataFrame): value = value.to_sparse(fill_value=self.default_fill_value, kind=self.default_kind) else: raise ValueError('only DataFrame objects can be set currently') self._frames[key] = value if key not in self.items: self._items = Index(list(self.items) + [key]) def set_value(self, item, major, minor, value): """ Quickly set single value at (item, 
major, minor) location Parameters ---------- item : item label (panel item) major : major axis label (panel item row) minor : minor axis label (panel item column) value : scalar Notes ----- This method *always* returns a new object. It is not particularly efficient but is provided for API compatibility with Panel Returns ------- panel : SparsePanel """ dense = self.to_dense().set_value(item, major, minor, value) return dense.to_sparse(kind=self.default_kind, fill_value=self.default_fill_value) def __delitem__(self, key): loc = self.items.get_loc(key) indices = lrange(loc) + lrange(loc + 1, len(self.items)) del self._frames[key] self._items = self._items.take(indices) def __getstate__(self): # pickling return (self._frames, com._pickle_array(self.items), com._pickle_array(self.major_axis), com._pickle_array(self.minor_axis), self.default_fill_value, self.default_kind) def __setstate__(self, state): frames, items, major, minor, fv, kind = state self.default_fill_value = fv self.default_kind = kind self._items = _ensure_index(com._unpickle_array(items)) self._major_axis = _ensure_index(com._unpickle_array(major)) self._minor_axis = _ensure_index(com._unpickle_array(minor)) self._frames = frames def copy(self): """ Make a (shallow) copy of the sparse panel Returns ------- copy : SparsePanel """ return SparsePanel(self._frames.copy(), items=self.items, major_axis=self.major_axis, minor_axis=self.minor_axis, default_fill_value=self.default_fill_value, default_kind=self.default_kind) def to_frame(self, filter_observations=True): """ Convert SparsePanel to (dense) DataFrame Returns ------- frame : DataFrame """ if not filter_observations: raise TypeError('filter_observations=False not supported for ' 'SparsePanel.to_long') I, N, K = self.shape counts = np.zeros(N * K, dtype=int) d_values = {} d_indexer = {} for item in self.items: frame = self[item] values, major, minor = _stack_sparse_info(frame) # values are stacked column-major indexer = minor * N + major counts.put(indexer, counts.take(indexer) + 1) # cuteness d_values[item] = values d_indexer[item] = indexer # have full set of observations for each item mask = counts == I # for each item, take mask values at index locations for those sparse # values, and use that to select values values = np.column_stack([d_values[item][mask.take(d_indexer[item])] for item in self.items]) inds, = mask.nonzero() # still column major major_labels = inds % N minor_labels = inds // N index = MultiIndex(levels=[self.major_axis, self.minor_axis], labels=[major_labels, minor_labels]) df = DataFrame(values, index=index, columns=self.items) return df.sortlevel(level=0) to_long = deprecate('to_long', to_frame) toLong = deprecate('toLong', to_frame) def reindex(self, major=None, items=None, minor=None, major_axis=None, minor_axis=None, copy=False): """ Conform / reshape panel axis labels to new input labels Parameters ---------- major : array-like, default None items : array-like, default None minor : array-like, default None copy : boolean, default False Copy underlying SparseDataFrame objects Returns ------- reindexed : SparsePanel """ major = com._mut_exclusive(major, major_axis) minor = com._mut_exclusive(minor, minor_axis) if com._all_none(items, major, minor): raise ValueError('Must specify at least one axis') major = self.major_axis if major is None else major minor = self.minor_axis if minor is None else minor if items is not None: new_frames = {} for item in items: if item in self._frames: new_frames[item] = self._frames[item] else: raise 
NotImplementedError('Reindexing with new items not yet ' 'supported') else: new_frames = self._frames if copy: new_frames = dict((k, v.copy()) for k, v in compat.iteritems(new_frames)) return SparsePanel(new_frames, items=items, major_axis=major, minor_axis=minor, default_fill_value=self.default_fill_value, default_kind=self.default_kind) def _combine(self, other, func, axis=0): if isinstance(other, DataFrame): return self._combineFrame(other, func, axis=axis) elif isinstance(other, Panel): return self._combinePanel(other, func) elif np.isscalar(other): new_frames = dict((k, func(v, other)) for k, v in compat.iteritems(self)) return self._new_like(new_frames) def _combineFrame(self, other, func, axis=0): index, columns = self._get_plane_axes(axis) axis = self._get_axis_number(axis) other = other.reindex(index=index, columns=columns) if axis == 0: new_values = func(self.values, other.values) elif axis == 1: new_values = func(self.values.swapaxes(0, 1), other.values.T) new_values = new_values.swapaxes(0, 1) elif axis == 2: new_values = func(self.values.swapaxes(0, 2), other.values) new_values = new_values.swapaxes(0, 2) # TODO: make faster! new_frames = {} for item, item_slice in zip(self.items, new_values): old_frame = self[item] ofv = old_frame.default_fill_value ok = old_frame.default_kind new_frames[item] = SparseDataFrame(item_slice, index=self.major_axis, columns=self.minor_axis, default_fill_value=ofv, default_kind=ok) return self._new_like(new_frames) def _new_like(self, new_frames): return SparsePanel(new_frames, self.items, self.major_axis, self.minor_axis, default_fill_value=self.default_fill_value, default_kind=self.default_kind) def _combinePanel(self, other, func): items = self.items + other.items major = self.major_axis + other.major_axis minor = self.minor_axis + other.minor_axis # could check that everything's the same size, but forget it this = self.reindex(items=items, major=major, minor=minor) other = other.reindex(items=items, major=major, minor=minor) new_frames = {} for item in items: new_frames[item] = func(this[item], other[item]) if not isinstance(other, SparsePanel): new_default_fill = self.default_fill_value else: # maybe unnecessary new_default_fill = func(self.default_fill_value, other.default_fill_value) return SparsePanel(new_frames, items, major, minor, default_fill_value=new_default_fill, default_kind=self.default_kind) def major_xs(self, key): """ Return slice of panel along major axis Parameters ---------- key : object Major axis label Returns ------- y : DataFrame index -> minor axis, columns -> items """ slices = dict((k, v.xs(key)) for k, v in compat.iteritems(self)) return DataFrame(slices, index=self.minor_axis, columns=self.items) def minor_xs(self, key): """ Return slice of panel along minor axis Parameters ---------- key : object Minor axis label Returns ------- y : SparseDataFrame index -> major axis, columns -> items """ slices = dict((k, v[key]) for k, v in compat.iteritems(self)) return SparseDataFrame(slices, index=self.major_axis, columns=self.items, default_fill_value=self.default_fill_value, default_kind=self.default_kind)
def aggregate(self, func_or_funcs, *args, **kwargs): """ Apply aggregation function or functions to groups, yielding most likely Series but in some cases DataFrame depending on the output of the aggregation function Parameters ---------- func_or_funcs : function or list / dict of functions List/dict of functions will produce DataFrame with column names determined by the function names themselves (list) or the keys in the dict Notes ----- agg is an alias for aggregate. Use it. Example ------- >>> series bar 1.0 baz 2.0 qot 3.0 qux 4.0 >>> mapper = lambda x: x[0] # first letter >>> grouped = series.groupby(mapper) >>> grouped.aggregate(np.sum) b 3.0 q 7.0 >>> grouped.aggregate([np.sum, np.mean, np.std]) mean std sum b 1.5 0.5 3 q 3.5 0.5 7 >>> grouped.agg({'result' : lambda x: x.mean() / x.std(), ... 'total' : np.sum}) result total b 2.121 3 q 4.95 7 See also -------- apply, transform Returns ------- Series or DataFrame """ if isinstance(func_or_funcs, basestring): return getattr(self, func_or_funcs)(*args, **kwargs) if hasattr(func_or_funcs,'__iter__'): ret = self._aggregate_multiple_funcs(func_or_funcs) else: if len(self.groupings) > 1: return self._python_agg_general(func_or_funcs, *args, **kwargs) try: return self._python_agg_general(func_or_funcs, *args, **kwargs) except Exception: result = self._aggregate_named(func_or_funcs, *args, **kwargs) index = Index(sorted(result), name=self.groupings[0].name) ret = Series(result, index=index) if not self.as_index: # pragma: no cover print 'Warning, ignoring as_index=True' return ret
def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes,
                                        xnull=False)

    if rlocs == []:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [v if i > v else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(levels=new_levels, codes=new_codes,
                             names=new_names, verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
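# Illustrative usage (not from the source): unstacking several index levels at
# once, which is the case _unstack_multiple handles via the placeholder level.
import pandas as pd

mi = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y'], [1, 2]],
                                names=['l1', 'l2', 'l3'])
s = pd.Series(range(8), index=mi)
wide = s.unstack(['l2', 'l3'])  # 'l2' and 'l3' both move to the columns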
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, compression=None, quoting=None, line_terminator='\n', chunksize=None, tupleize_cols=False, quotechar='"', date_format=None, doublequote=True, escapechar=None, decimal='.'): self.obj = obj if path_or_buf is None: path_or_buf = StringIO() self.path_or_buf = _expand_user(_stringify_path(path_or_buf)) self.sep = sep self.na_rep = na_rep self.float_format = float_format self.decimal = decimal self.header = header self.index = index self.index_label = index_label self.mode = mode self.encoding = encoding self.compression = compression if quoting is None: quoting = csvlib.QUOTE_MINIMAL self.quoting = quoting if quoting == csvlib.QUOTE_NONE: # prevents crash in _csv quotechar = None self.quotechar = quotechar self.doublequote = doublequote self.escapechar = escapechar self.line_terminator = line_terminator self.date_format = date_format self.tupleize_cols = tupleize_cols self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and not self.tupleize_cols) # validate mi options if self.has_mi_columns: if cols is not None: raise TypeError("cannot specify cols with a MultiIndex on the " "columns") if cols is not None: if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, date_format=date_format, quoting=self.quoting) else: cols = list(cols) self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, Index): cols = cols.to_native_types(na_rep=na_rep, float_format=float_format, date_format=date_format, quoting=self.quoting) else: cols = list(cols) # save it self.cols = cols # preallocate data 2d list self.blocks = self.obj._data.blocks ncols = sum(b.shape[0] for b in self.blocks) self.data = [None] * ncols if chunksize is None: chunksize = (100000 // (len(self.cols) or 1)) or 1 self.chunksize = int(chunksize) self.data_index = obj.index if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and date_format is not None): self.data_index = Index([ x.strftime(date_format) if notna(x) else '' for x in self.data_index ]) self.nlevels = getattr(self.data_index, 'nlevels', 1) if not index: self.nlevels = 0
def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. Parameters ---------- arg : list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input downcast : {'integer', 'signed', 'unsigned', 'float'} , default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. .. versionadded:: 0.19.0 Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray Examples -------- Take separate series and convert to numeric, coercing when told to >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 """ if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): raise ValueError('invalid downcasting method provided') is_series = False is_index = False is_scalar = False if isinstance(arg, pd.Series): is_series = True values = arg.values elif isinstance(arg, pd.Index): is_index = True values = arg.asi8 if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif np.isscalar(arg): if is_number(arg): return arg is_scalar = True values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') else: values = arg try: if is_numeric_dtype(values): pass elif is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: values = _ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except Exception: if errors == 'raise': raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] elif downcast == 'unsigned' and np.min(values) > 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = 
np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize < values.dtype.itemsize: values = _possibly_downcast_to_dtype(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy_with_infer return Index(values, name=arg.name) elif is_scalar: return values[0] else: return values
def _get_fresh_axis(self):
    return Index(np.arange(len(self._get_concat_axis())))
def _getitem_iterable(self, key, axis=0): labels = self.obj._get_axis(axis) def _reindex(keys, level=None): try: return self.obj.reindex_axis(keys, axis=axis, level=level) except AttributeError: # Series if axis != 0: raise AssertionError('axis must be 0') return self.obj.reindex(keys, level=level) if com._is_bool_indexer(key): key = _check_bool_indexer(labels, key) inds, = key.nonzero() return self.obj.take(inds, axis=axis, convert=False) else: if isinstance(key, Index): # want Index objects to pass through untouched keyarr = key else: # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) if _is_integer_dtype(keyarr): if labels.inferred_type != 'integer': keyarr = np.where(keyarr < 0, len(labels) + keyarr, keyarr) if labels.inferred_type == 'mixed-integer': indexer = labels.get_indexer(keyarr) if (indexer >= 0).all(): self.obj.take(indexer, axis=axis, convert=True) else: return self.obj.take(keyarr, axis=axis) elif not labels.inferred_type == 'integer': return self.obj.take(keyarr, axis=axis) # this is not the most robust, but... if (isinstance(labels, MultiIndex) and not isinstance(keyarr[0], tuple)): level = 0 else: level = None if labels.is_unique and Index(keyarr).is_unique: return _reindex(keyarr, level=level) else: indexer, missing = labels.get_indexer_non_unique(keyarr) check = indexer != -1 result = self.obj.take(indexer[check], axis=axis, convert=False) # need to merge the result labels and the missing labels if len(missing): l = np.arange(len(indexer)) missing = com._ensure_platform_int(missing) missing_labels = keyarr.take(missing) missing_indexer = com._ensure_int64(l[~check]) cur_labels = result._get_axis(axis).values cur_indexer = com._ensure_int64(l[check]) new_labels = np.empty(tuple([len(indexer)]),dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values new_indexer[missing_indexer] = -1 # reindex with the specified axis ndim = self.obj.ndim if axis+1 > ndim: raise AssertionError("invalid indexing error with non-unique index") args = [None] * (2*ndim) args[2*axis] = new_labels args[2*axis+1] = new_indexer result = result._reindex_with_indexers(*args, copy=False, fill_value=np.nan) return result
def simpleParser(nestedList, colNames=None, header=0, indexCol=0): """ Workhorse function for processing nested list into DataFrame Should be replaced by np.genfromtxt """ naValues = set([ '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '1.#INF', '-1.#INF', '1.#INF000000', 'NA', 'NULL', 'NaN', 'nan', '' ]) lines = nestedList data = {} if header is not None: columns = [] for i, c in enumerate(lines[header]): if c == '': columns.append('Unnamed: ' + string.ascii_uppercase[i]) else: columns.append(c) content = lines[header + 1:] colCounts = dict(((col, 0) for col in columns)) for i, col in enumerate(columns): if columns.count(col) > 1: columns[i] = col + str(colCounts[col]) colCounts[col] += 1 else: if not colNames: columns = string.ascii_uppercase[:len(lines[0])] else: columns = colNames content = lines for i, (c, col) in enumerate(izip(columns, izip(*content))): if i == indexCol: data[c] = col continue data[c] = [] for val in col: if val in naValues: val = np.nan else: try: tmp = val val = np.float64(val) if np.isinf(val): val = tmp except Exception: pass data[c].append(val) if header is not None: if 'date' in columns[0].lower() or 'Unnamed' in columns[0]: dates = [] for s in data[columns[0]]: try: dates.append(parser.parse(s)) except Exception: dates.append(s) data[columns[0]] = dates for c, values in data.iteritems(): try: data[c] = np.array(values, dtype=np.float64) except Exception: data[c] = np.array(values, dtype=np.object_) if indexCol is not None: index = Index(data[columns[indexCol]]) frameData = dict([(col, data[col]) for col in columns \ if col != columns[indexCol]]) return DataFrame(data=frameData, index=index) else: index = np.arange(len(data.values()[0])) frameData = dict([(col, data[col]) for col in columns]) return DataFrame(data=frameData, index=index)
def get_chunk(self, rows=None): if rows is not None and self.skip_footer: raise ValueError('skip_footer not supported for iteration') try: content = self._get_lines(rows) except StopIteration: if self._first_chunk: content = [] else: raise # done with first read, next time raise StopIteration self._first_chunk = False if len(content) == 0: # pragma: no cover if self.index_col is not None: if np.isscalar(self.index_col): index = Index([], name=self.index_name) else: index = MultiIndex.from_arrays([[]] * len(self.index_col), names=self.index_name) else: index = Index([]) return DataFrame(index=index, columns=self.columns) zipped_content = list(lib.to_object_array(content).T) # no index column specified, so infer that's what is wanted if self.index_col is not None: if np.isscalar(self.index_col): index = zipped_content.pop(self.index_col) else: # given a list of index index = [] for idx in self.index_col: index.append(zipped_content[idx]) # remove index items from content and columns, don't pop in loop for i in reversed(sorted(self.index_col)): zipped_content.pop(i) if np.isscalar(self.index_col): if self.parse_dates: index = lib.try_parse_dates(index, parser=self.date_parser) index, na_count = _convert_types(index, self.na_values) index = Index(index, name=self.index_name) if self.verbose and na_count: print 'Found %d NA values in the index' % na_count else: arrays = [] for arr in index: if self.parse_dates: arr = lib.try_parse_dates(arr, parser=self.date_parser) arr, _ = _convert_types(arr, self.na_values) arrays.append(arr) index = MultiIndex.from_arrays(arrays, names=self.index_name) else: index = Index(np.arange(len(content))) if not index._verify_integrity(): dups = index.get_duplicates() idx_str = 'Index' if not self.implicit_idx else 'Implicit index' err_msg = ('%s (columns %s) have duplicate values %s' % (idx_str, self.index_col, str(dups))) raise Exception(err_msg) if len(self.columns) != len(zipped_content): raise Exception('wrong number of columns') data = dict((k, v) for k, v in izip(self.columns, zipped_content)) # apply converters for col, f in self.converters.iteritems(): if isinstance(col, int) and col not in self.columns: col = self.columns[col] data[col] = lib.map_infer(data[col], f) data = _convert_to_ndarrays(data, self.na_values, self.verbose) return DataFrame(data=data, columns=self.columns, index=index)
def _insert_column(self, key):
    self.columns = Index(np.concatenate((self.columns, [key])))
def is_monotonic(self):
    # return if my group orderings are monotonic
    return Index(self.group_info[0]).is_monotonic
class SparseDataFrame(DataFrame): """ DataFrame containing sparse floating point data in the form of SparseSeries objects Parameters ---------- data : same types as can be passed to DataFrame index : array-like, optional column : array-like, optional default_kind : {'block', 'integer'}, default 'block' Default sparse kind for converting Series to SparseSeries. Will not override SparseSeries passed into constructor default_fill_value : float Default fill_value for converting Series to SparseSeries. Will not override SparseSeries passed in """ _verbose_info = False _columns = None _series = None _is_mixed_type = False ndim = 2 def __init__(self, data=None, index=None, columns=None, default_kind='block', default_fill_value=None): if default_fill_value is None: default_fill_value = np.nan self.default_kind = default_kind self.default_fill_value = default_fill_value if isinstance(data, dict): sdict, columns, index = self._init_dict(data, index, columns) elif isinstance(data, (np.ndarray, list)): sdict, columns, index = self._init_matrix(data, index, columns) elif isinstance(data, DataFrame): sdict, columns, index = self._init_dict(data, data.index, data.columns) elif data is None: sdict = {} if index is None: index = Index([]) if columns is None: columns = Index([]) else: for c in columns: sdict[c] = Series(np.nan, index=index) self._series = sdict self.columns = columns self.index = index def _from_axes(self, data, axes): columns, index = axes return self._constructor(data, index=index, columns=columns) @cache_readonly def _data(self): return _SparseMockBlockManager(self) def _consolidate_inplace(self): # do nothing when DataFrame calls this method pass def convert_objects(self): # XXX return self @property def _constructor(self): def wrapper(data, index=None, columns=None): return SparseDataFrame(data, index=index, columns=columns, default_fill_value=self.default_fill_value, default_kind=self.default_kind) return wrapper def _init_dict(self, data, index, columns, dtype=None): # pre-filter out columns if we passed it if columns is not None: columns = _ensure_index(columns) data = dict((k, v) for k, v in data.iteritems() if k in columns) else: columns = Index(_try_sort(data.keys())) if index is None: index = extract_index(data) sp_maker = lambda x: SparseSeries(x, index=index, kind=self.default_kind, fill_value=self.default_fill_value, copy=True) sdict = {} for k, v in data.iteritems(): if isinstance(v, Series): # Force alignment, no copy necessary if not v.index.equals(index): v = v.reindex(index) if not isinstance(v, SparseSeries): v = sp_maker(v) else: if isinstance(v, dict): v = [v.get(i, nan) for i in index] v = sp_maker(v) sdict[k] = v # TODO: figure out how to handle this case, all nan's? # add in any other columns we want to have (completeness) nan_vec = np.empty(len(index)) nan_vec.fill(nan) for c in columns: if c not in sdict: sdict[c] = sp_maker(nan_vec) return sdict, columns, index def _init_matrix(self, data, index, columns, dtype=None): data = _prep_ndarray(data, copy=False) N, K = data.shape if index is None: index = _default_index(N) if columns is None: columns = _default_index(K) if len(columns) != K: raise Exception('Column length mismatch: %d vs. %d' % (len(columns), K)) if len(index) != N: raise Exception('Index length mismatch: %d vs. 
%d' % (len(index), N)) data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)]) return self._init_dict(data, index, columns, dtype) def __array_wrap__(self, result): return SparseDataFrame(result, index=self.index, columns=self.columns, default_kind=self.default_kind, default_fill_value=self.default_fill_value) def __getstate__(self): series = dict( (k, (v.sp_index, v.sp_values)) for k, v in self.iteritems()) columns = self.columns index = self.index return (series, columns, index, self.default_fill_value, self.default_kind) def __setstate__(self, state): series, cols, idx, fv, kind = state if not isinstance(cols, Index): # pragma: no cover columns = _unpickle_array(cols) else: columns = cols if not isinstance(idx, Index): # pragma: no cover index = _unpickle_array(idx) else: index = idx series_dict = {} for col, (sp_index, sp_values) in series.iteritems(): series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, fill_value=fv) self._series = series_dict self.index = index self.columns = columns self.default_fill_value = fv self.default_kind = kind def to_dense(self): """ Convert to dense DataFrame Returns ------- df : DataFrame """ data = dict((k, v.to_dense()) for k, v in self.iteritems()) return DataFrame(data, index=self.index) def astype(self, dtype): raise NotImplementedError def copy(self, deep=True): """ Make a copy of this SparseDataFrame """ series = dict((k, v.copy()) for k, v in self.iteritems()) return SparseDataFrame(series, index=self.index, columns=self.columns, default_fill_value=self.default_fill_value, default_kind=self.default_kind) @property def density(self): """ Ratio of non-sparse points to total (dense) data points represented in the frame """ tot_nonsparse = sum( [ser.sp_index.npoints for _, ser in self.iteritems()]) tot = len(self.index) * len(self.columns) return tot_nonsparse / float(tot) #---------------------------------------------------------------------- # Support different internal rep'n of SparseDataFrame def _set_item(self, key, value): sp_maker = lambda x: SparseSeries(x, index=self.index, fill_value=self.default_fill_value, kind=self.default_kind) if hasattr(value, '__iter__'): if isinstance(value, Series): clean_series = value.reindex(self.index) if not isinstance(value, SparseSeries): clean_series = sp_maker(clean_series) else: clean_series = sp_maker(value) self._series[key] = clean_series # Scalar else: self._series[key] = sp_maker(value) if key not in self.columns: self._insert_column(key) def _insert_column(self, key): self.columns = Index(np.concatenate((self.columns, [key]))) def __delitem__(self, key): """ Delete column from DataFrame """ loc = self.columns.get_loc(key) del self._series[key] self._delete_column_index(loc) def _delete_column_index(self, loc): if loc == len(self.columns) - 1: new_columns = self.columns[:loc] else: new_columns = Index( np.concatenate((self.columns[:loc], self.columns[loc + 1:]))) self.columns = new_columns _index = None def _set_index(self, index): self._index = _ensure_index(index) for v in self._series.values(): v.index = self._index def _get_index(self): return self._index def _get_columns(self): return self._columns def _set_columns(self, cols): if len(cols) != len(self._series): raise Exception('Columns length %d did not match data %d!' 
% (len(cols), len(self._series))) self._columns = _ensure_index(cols) index = property(fget=_get_index, fset=_set_index) columns = property(fget=_get_columns, fset=_set_columns) def __getitem__(self, item): """ Retrieve column or slice from DataFrame """ try: # unsure about how kludgy this is s = self._series[item] s.name = item return s except (TypeError, KeyError): if isinstance(item, slice): date_rng = self.index[item] return self.reindex(date_rng) elif isinstance(item, np.ndarray): if len(item) != len(self.index): raise Exception('Item wrong length %d instead of %d!' % (len(item), len(self.index))) newIndex = self.index[item] return self.reindex(newIndex) else: # pragma: no cover raise @Appender(DataFrame.get_value.__doc__, indents=0) def get_value(self, index, col): s = self._series[col] return s.get_value(index) def set_value(self, index, col, value): """ Put single value at passed column and index Parameters ---------- index : row label col : column label value : scalar value Notes ----- This method *always* returns a new object. It is currently not particularly efficient (and potentially very expensive) but is provided for API compatibility with DataFrame Returns ------- frame : DataFrame """ dense = self.to_dense().set_value(index, col, value) return dense.to_sparse(kind=self.default_kind, fill_value=self.default_fill_value) def _slice(self, slobj, axis=0): if axis == 0: new_index = self.index[slobj] new_columns = self.columns else: new_index = self.index new_columns = self.columns[slobj] return self.reindex(index=new_index, columns=new_columns) def as_matrix(self, columns=None): """ Convert the frame to its Numpy-array matrix representation Columns are presented in sorted order unless a specific list of columns is provided. """ if columns is None: columns = self.columns if len(columns) == 0: return np.zeros((len(self.index), 0), dtype=float) return np.array([self[col].values for col in columns]).T values = property(as_matrix) def xs(self, key, axis=0, copy=False): """ Returns a row (cross-section) from the SparseDataFrame as a Series object. 
Parameters ---------- key : some index contained in the index Returns ------- xs : Series """ if axis == 1: data = self[key] return data i = self.index.get_loc(key) series = self._series values = [series[k][i] for k in self.columns] return Series(values, index=self.columns) #---------------------------------------------------------------------- # Arithmetic-related methods def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join='outer', level=level, copy=False) new_index, new_columns = this.index, this.columns if level is not None: raise NotImplementedError if self.empty and other.empty: return SparseDataFrame(index=new_index) new_data = {} if fill_value is not None: # TODO: be a bit more intelligent here for col in new_columns: if col in this and col in other: dleft = this[col].to_dense() dright = other[col].to_dense() result = dleft._binop(dright, func, fill_value=fill_value) result = result.to_sparse(fill_value=this[col].fill_value) new_data[col] = result else: for col in new_columns: if col in this and col in other: new_data[col] = func(this[col], other[col]) return self._constructor(data=new_data, index=new_index, columns=new_columns) def _combine_match_index(self, other, func, fill_value=None): new_data = {} if fill_value is not None: raise NotImplementedError new_index = self.index.union(other.index) this = self if self.index is not new_index: this = self.reindex(new_index) if other.index is not new_index: other = other.reindex(new_index) for col, series in this.iteritems(): new_data[col] = func(series.values, other.values) return self._constructor(new_data, index=new_index, columns=self.columns) def _combine_match_columns(self, other, func, fill_value): # patched version of DataFrame._combine_match_columns to account for # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series, # where 3.0 is numpy.float64 and series is a SparseSeries. 
Still # possible for this to happen, which is bothersome if fill_value is not None: raise NotImplementedError new_data = {} union = intersection = self.columns if not union.equals(other.index): union = other.index.union(self.columns) intersection = other.index.intersection(self.columns) for col in intersection: new_data[col] = func(self[col], float(other[col])) return self._constructor(new_data, index=self.index, columns=union) def _combine_const(self, other, func): new_data = {} for col, series in self.iteritems(): new_data[col] = func(series, other) return self._constructor(data=new_data, index=self.index, columns=self.columns) def _reindex_index(self, index, method, copy, level, fill_value=np.nan, limit=None): if level is not None: raise Exception('Reindex by level not supported for sparse') if self.index.equals(index): if copy: return self.copy() else: return self if len(self.index) == 0: return SparseDataFrame(index=index, columns=self.columns) indexer = self.index.get_indexer(index, method, limit=limit) indexer = com._ensure_platform_int(indexer) mask = indexer == -1 need_mask = mask.any() new_series = {} for col, series in self.iteritems(): values = series.values new = values.take(indexer) if need_mask: np.putmask(new, mask, fill_value) new_series[col] = new return SparseDataFrame(new_series, index=index, columns=self.columns, default_fill_value=self.default_fill_value) def _reindex_columns(self, columns, copy, level, fill_value, limit=None): if level is not None: raise Exception('Reindex by level not supported for sparse') if com.notnull(fill_value): raise NotImplementedError if limit: raise NotImplementedError # TODO: fill value handling sdict = dict((k, v) for k, v in self.iteritems() if k in columns) return SparseDataFrame(sdict, index=self.index, columns=columns, default_fill_value=self.default_fill_value) def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer, copy, fill_value): if columns is None: columns = self.columns new_arrays = {} for col in columns: if col not in self: continue if row_indexer is not None: new_arrays[col] = com.take_1d(self[col].values, row_indexer, fill_value=fill_value) else: new_arrays[col] = self[col] return self._constructor(new_arrays, index=index, columns=columns) def _rename_index_inplace(self, mapper): self.index = [mapper(x) for x in self.index] def _rename_columns_inplace(self, mapper): new_series = {} new_columns = [] for col in self.columns: new_col = mapper(col) if new_col in new_series: # pragma: no cover raise Exception('Non-unique mapping!') new_series[new_col] = self[col] new_columns.append(new_col) self.columns = new_columns self._series = new_series def take(self, indices, axis=0): """ Analogous to ndarray.take, return SparseDataFrame corresponding to requested indices along an axis Parameters ---------- indices : list / array of ints axis : {0, 1} Returns ------- taken : SparseDataFrame """ indices = com._ensure_platform_int(indices) new_values = self.values.take(indices, axis=axis) if axis == 0: new_columns = self.columns new_index = self.index.take(indices) else: new_columns = self.columns.take(indices) new_index = self.index return self._constructor(new_values, index=new_index, columns=new_columns) def add_prefix(self, prefix): f = (('%s' % prefix) + '%s').__mod__ return self.rename(columns=f) def add_suffix(self, suffix): f = ('%s' + ('%s' % suffix)).__mod__ return self.rename(columns=f) def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): if on is not None: raise 
NotImplementedError else: return self._join_index(other, how, lsuffix, rsuffix) def _join_index(self, other, how, lsuffix, rsuffix): if isinstance(other, Series): assert (other.name is not None) other = SparseDataFrame({other.name: other}, default_fill_value=self.default_fill_value) join_index = self.index.join(other.index, how=how) this = self.reindex(join_index) other = other.reindex(join_index) this, other = this._maybe_rename_join(other, lsuffix, rsuffix) result_series = this._series other_series = other._series result_series.update(other_series) return self._constructor(result_series, index=join_index) def _maybe_rename_join(self, other, lsuffix, rsuffix): intersection = self.columns.intersection(other.columns) if len(intersection) > 0: if not lsuffix and not rsuffix: raise Exception('columns overlap: %s' % intersection) def lrenamer(x): if x in intersection: return '%s%s' % (x, lsuffix) return x def rrenamer(x): if x in intersection: return '%s%s' % (x, rsuffix) return x this = self.rename(columns=lrenamer) other = other.rename(columns=rrenamer) else: this = self return this, other def transpose(self): """ Returns a DataFrame with the rows/columns switched. """ return SparseDataFrame(self.values.T, index=self.columns, columns=self.index, default_fill_value=self.default_fill_value, default_kind=self.default_kind) T = property(transpose) @Appender(DataFrame.count.__doc__) def count(self, axis=0, **kwds): return self.apply(lambda x: x.count(), axis=axis) def cumsum(self, axis=0): """ Return SparseDataFrame of cumulative sums over requested axis. Parameters ---------- axis : {0, 1} 0 for row-wise, 1 for column-wise Returns ------- y : SparseDataFrame """ return self.apply(lambda x: x.cumsum(), axis=axis) def shift(self, periods, freq=None, **kwds): """ Analogous to DataFrame.shift """ from pandas.core.series import _resolve_offset offset = _resolve_offset(freq, kwds) new_series = {} if offset is None: new_index = self.index for col, s in self.iteritems(): new_series[col] = s.shift(periods) else: new_index = self.index.shift(periods, offset) for col, s in self.iteritems(): new_series[col] = SparseSeries(s.sp_values, index=new_index, sparse_index=s.sp_index, fill_value=s.fill_value) return SparseDataFrame(new_series, index=new_index, columns=self.columns, default_fill_value=self.default_fill_value, default_kind=self.default_kind) def apply(self, func, axis=0, broadcast=False): """ Analogous to DataFrame.apply, for SparseDataFrame Parameters ---------- func : function Function to apply to each column axis : {0, 1} broadcast : bool, default False For aggregation functions, return object of same size with values propagated Returns ------- applied : Series or SparseDataFrame """ if not len(self.columns): return self if isinstance(func, np.ufunc): new_series = {} for k, v in self.iteritems(): applied = func(v) applied.fill_value = func(applied.fill_value) new_series[k] = applied return SparseDataFrame(new_series, index=self.index, columns=self.columns, default_fill_value=self.default_fill_value, default_kind=self.default_kind) else: if not broadcast: return self._apply_standard(func, axis) else: return self._apply_broadcast(func, axis) def applymap(self, func): """ Apply a function to a DataFrame that is intended to operate elementwise, i.e. 
like doing map(func, series) for each series in the DataFrame Parameters ---------- func : function Python function, returns a single value from a single value Returns ------- applied : DataFrame """ return self.apply(lambda x: map(func, x)) @Appender(DataFrame.fillna.__doc__) def fillna(self, value=None, method='pad', inplace=False, limit=None): new_series = {} for k, v in self.iterkv(): new_series[k] = v.fillna(value=value, method=method, limit=limit) if inplace: self._series = new_series return self else: return self._constructor(new_series, index=self.index, columns=self.columns)
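A small usage sketch of the class above, assuming a pandas version that still ships SparseDataFrame (it was removed in pandas 1.0); the column names and values are made up.

# Hedged example: values equal to the fill value (NaN here) are not stored.
import numpy as np
import pandas as pd

sdf = pd.SparseDataFrame({'a': [1.0, np.nan, np.nan, 2.0],
                          'b': [np.nan, 3.0, np.nan, np.nan]},
                         default_fill_value=np.nan)
print(sdf.density)     # 3 of 8 values are stored, so density is 0.375
print(sdf.to_dense())  # back to a regular DataFrame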
def __reduce__(self): """Necessary for making this object picklable""" a, b, state = Index.__reduce__(self) aug_state = state, self.offset, self.tzinfo return a, b, aug_state
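A hedged illustration of the pickling contract __reduce__ is supporting here: a frequency- and tz-aware index should round-trip through pickle with its offset and tz intact. Shown with a modern DatetimeIndex, since DateRange itself predates current pandas.

# Pickle round-trip of a frequency/tz-aware index.
import pickle
import pandas as pd

idx = pd.date_range('2011-01-01', periods=5, freq='D', tz='UTC')
restored = pickle.loads(pickle.dumps(idx))
assert restored.equals(idx)
assert restored.freq == idx.freq and str(restored.tz) == str(idx.tz)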
def __init__(self, data=None, index=None, columns=None, default_kind=None, default_fill_value=None, dtype=None, copy=False): # pick up the defaults from the Sparse structures if isinstance(data, SparseDataFrame): if index is None: index = data.index if columns is None: columns = data.columns if default_fill_value is None: default_fill_value = data.default_fill_value if default_kind is None: default_kind = data.default_kind elif isinstance(data, (SparseSeries, SparseArray)): if index is None: index = data.index if default_fill_value is None: default_fill_value = data.fill_value if columns is None and hasattr(data, 'name'): columns = [data.name] if columns is None: raise Exception("cannot pass a series w/o a name or columns") data = {columns[0]: data} if default_fill_value is None: default_fill_value = np.nan if default_kind is None: default_kind = 'block' self._default_kind = default_kind self._default_fill_value = default_fill_value if is_scipy_sparse(data): mgr = self._init_spmatrix(data, index, columns, dtype=dtype, fill_value=default_fill_value) elif isinstance(data, dict): mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, (np.ndarray, list)): mgr = self._init_matrix(data, index, columns, dtype=dtype) elif isinstance(data, SparseDataFrame): mgr = self._init_mgr(data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy) elif isinstance(data, DataFrame): mgr = self._init_dict(data, data.index, data.columns, dtype=dtype) elif isinstance(data, Series): mgr = self._init_dict(data.to_frame(), data.index, columns=None, dtype=dtype) elif isinstance(data, BlockManager): mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) elif data is None: data = DataFrame() if index is None: index = Index([]) else: index = ensure_index(index) if columns is None: columns = Index([]) else: for c in columns: data[c] = SparseArray(np.nan, index=index, kind=self._default_kind, fill_value=self._default_fill_value) mgr = to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) else: msg = ('SparseDataFrame called with unknown type "{data_type}" ' 'for data argument') raise TypeError(msg.format(data_type=type(data).__name__)) generic.NDFrame.__init__(self, mgr)
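A tentative sketch of the scipy.sparse branch above, assuming a pandas in the 0.20-0.25 range where SparseDataFrame still accepted an spmatrix; the matrix contents and column names are made up.

# Hedged sketch: build a SparseDataFrame from a scipy COO matrix.
import pandas as pd
from scipy import sparse

mat = sparse.coo_matrix(([1.0, 2.0], ([0, 2], [1, 0])), shape=(3, 3))
sdf = pd.SparseDataFrame(mat, columns=['x', 'y', 'z'])
# entries absent from the matrix come back as the fill value (NaN by default)
print(sdf.to_dense())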
def _wrap_union_result(self, other, result): # If we are here, _can_fast_union is false or other is not a # DateRange, so their union has to be an Index. name = self.name if self.name == other.name else None return Index(result, name=name)
def get_duplicates(self): values = Index.get_duplicates(self) return self._simple_new(values)
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : deprecated na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer Returns ------- labels : the indexer to the original array uniques : ndarray (1-d) or Index the unique values. Index is returned when passed values is Index or Series note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ if order is not None: msg = "order is deprecated. See https://github.com/pydata/pandas/issues/6926" warn(msg, FutureWarning, stacklevel=2) from pandas.core.index import Index from pandas.core.series import Series vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) is_timedelta = com.is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(com._ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort( np.array([e for i, e in enumerate(uniques) if f(e)], dtype=object)) for f in [ lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types) ] ]) sorter = com._ensure_platform_int( t.lookup(com._ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: uniques = uniques.astype('m8[ns]') if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): uniques = Index(uniques) return labels, uniques
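A usage sketch via the public wrapper, assuming pd.factorize keeps the labels/uniques contract described in the docstring; the input array is made up.

# Encode values as integer labels plus the array of unique values.
import numpy as np
import pandas as pd

values = np.array(['b', 'a', 'b', np.nan, 'c'], dtype=object)
labels, uniques = pd.factorize(values, sort=True)
print(labels)    # [ 1  0  1 -1  2]  -- -1 marks the missing value (na_sentinel)
print(uniques)   # ['a' 'b' 'c']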
def groupby_agg(self, by, axis, agg, groupby_args, **kwargs): # Currently we only expect 'by' to be a projection of the same frame. # If 'by' holds a list of columns/series, then we create such projection # to re-use code. if not isinstance(by, DFAlgQueryCompiler): if is_list_like(by): by_cols = [] by_frames = [] for obj in by: if isinstance(obj, str): by_cols.append(obj) elif hasattr(obj, "_query_compiler"): by_frames.append(obj._query_compiler._modin_frame) else: raise NotImplementedError("unsupported groupby args") by_cols = Index.__new__(Index, data=by_cols, dtype=self.columns.dtype) by_frame = self.mask(col_indices=by_cols) if by_frames: by_frame = by_frame._concat(axis=1, other_modin_frames=by_frames, ignore_index=True) else: raise NotImplementedError("unsupported groupby args") else: by_frame = by._modin_frame if axis != 0: raise NotImplementedError("groupby is supported for axis = 0 only") base = by_frame._find_common_projections_base(self) if base is None: raise NotImplementedError("unsupported groupby args") if groupby_args["level"] is not None: raise NotImplementedError("levels are not supported for groupby") groupby_cols = by_frame.columns.tolist() agg_cols = [col for col in self.columns if col not in by_frame.columns] # Create new base where all required columns are computed. We don't allow # complex expressions to be a group key or an aggeregate operand. assert isinstance(by_frame._op, TransformNode), "unexpected by_frame" exprs = OrderedDict(((col, by_frame.ref(col)) for col in groupby_cols)) exprs.update(((col, self.ref(col)) for col in agg_cols)) exprs = translate_exprs_to_base(exprs, base) base_cols = Index.__new__(Index, data=list(exprs.keys()), dtype=self.columns.dtype) base = self.__constructor__( columns=base_cols, dtypes=self._dtypes_for_exprs(exprs), op=TransformNode(base, exprs, fold=True), index_cols=None, force_execution_mode=self._force_execution_mode, ) new_columns = [] index_cols = None if groupby_args["as_index"]: index_cols = groupby_cols.copy() else: new_columns = groupby_cols.copy() new_dtypes = by_frame._dtypes[groupby_cols].tolist() agg_exprs = OrderedDict() if isinstance(agg, str): for col in agg_cols: agg_exprs[col] = AggregateExpr(agg, base.ref(col)) else: assert isinstance(agg, dict), "unsupported aggregate type" multiindex = any(isinstance(v, list) for v in agg.values()) for k, v in agg.items(): if isinstance(v, list): for item in v: agg_exprs[(k, item)] = AggregateExpr(item, base.ref(k)) else: col_name = (k, v) if multiindex else k agg_exprs[col_name] = AggregateExpr(v, base.ref(k)) new_columns.extend(agg_exprs.keys()) new_dtypes.extend((x._dtype for x in agg_exprs.values())) new_columns = Index.__new__(Index, data=new_columns, dtype=self.columns.dtype) new_op = GroupbyAggNode(base, groupby_cols, agg_exprs, groupby_args) new_frame = self.__constructor__( columns=new_columns, dtypes=new_dtypes, op=new_op, index_cols=index_cols, force_execution_mode=self._force_execution_mode, ) return new_frame
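As a point of reference, the user-level call below (plain pandas, hypothetical column names) is the kind of groupby/agg this method compiles into a GroupbyAggNode; a list-valued aggregate forces MultiIndex columns, mirroring the multiindex check above.

# Hedged illustration of the aggregation shapes handled above.
import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'], 'x': [1, 2, 3], 'y': [4.0, 5.0, 6.0]})
out = df.groupby('key', as_index=True).agg({'x': 'sum', 'y': ['min', 'max']})
print(out)   # columns become a MultiIndex because one aggregate maps to a list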
def __init__(self, data=None, index=None, columns=None, default_kind=None, default_fill_value=None, dtype=None, copy=False): # pick up the defaults from the Sparse structures if isinstance(data, SparseDataFrame): if index is None: index = data.index if columns is None: columns = data.columns if default_fill_value is None: default_fill_value = data.default_fill_value if default_kind is None: default_kind = data.default_kind elif isinstance(data, (SparseSeries, SparseArray)): if index is None: index = data.index if default_fill_value is None: default_fill_value = data.fill_value if columns is None and hasattr(data, 'name'): columns = [data.name] if columns is None: raise Exception("cannot pass a series w/o a name or columns") data = {columns[0]: data} if default_fill_value is None: default_fill_value = np.nan if default_kind is None: default_kind = 'block' self._default_kind = default_kind self._default_fill_value = default_fill_value if isinstance(data, dict): mgr = self._init_dict(data, index, columns) if dtype is not None: mgr = mgr.astype(dtype) elif isinstance(data, (np.ndarray, list)): mgr = self._init_matrix(data, index, columns) if dtype is not None: mgr = mgr.astype(dtype) elif isinstance(data, SparseDataFrame): mgr = self._init_mgr(data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy) elif isinstance(data, DataFrame): mgr = self._init_dict(data, data.index, data.columns) if dtype is not None: mgr = mgr.astype(dtype) elif isinstance(data, BlockManager): mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) elif data is None: data = {} if index is None: index = Index([]) else: index = _ensure_index(index) if columns is None: columns = Index([]) else: for c in columns: data[c] = SparseArray(np.nan, index=index, kind=self._default_kind, fill_value=self._default_fill_value) mgr = dict_to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) NDFrame.__init__(self, mgr)
def __new__(cls, data, index=None, sparse_index=None, kind='block', fill_value=None, name=None, copy=False): is_sparse_array = isinstance(data, SparseArray) if fill_value is None: if is_sparse_array: fill_value = data.fill_value else: fill_value = nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: index = data.index elif index is not None: assert (len(index) == len(data)) sparse_index = data.sp_index values = np.asarray(data) elif isinstance(data, (Series, dict)): if index is None: index = data.index data = Series(data) values, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) elif np.isscalar(data): # pragma: no cover if index is None: raise Exception('must pass index!') values = np.empty(len(index)) values.fill(data) # TODO: more efficient values, sparse_index = make_sparse(values, kind=kind, fill_value=fill_value) else: # array-like if sparse_index is None: values, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) else: values = data assert (len(values) == sparse_index.npoints) if index is None: index = Index(np.arange(sparse_index.length)) index = _ensure_index(index) # Create array, do *not* copy data by default if copy: subarr = np.array(values, dtype=np.float64, copy=True) else: subarr = np.asarray(values, dtype=np.float64) if index.is_all_dates: cls = SparseTimeSeries # Change the class of the array to be the subclass type. output = subarr.view(cls) output.sp_index = sparse_index output.fill_value = np.float64(fill_value) output.index = index output.name = name return output
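A small usage sketch of the constructor above, assuming a pandas version that still ships SparseSeries (removed in pandas 1.0); the data values are made up.

# Hedged example: with a NaN fill value, NaNs are left out of sp_values.
import numpy as np
import pandas as pd

ss = pd.SparseSeries([1.0, np.nan, 2.0, np.nan], kind='block', name='s')
print(ss.sp_values)         # [1. 2.] -- only the non-fill points are stored
print(ss.sp_index.npoints)  # 2
print(ss.to_dense())        # back to an ordinary Series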
def union(self, other): """ Specialized union for TimedeltaIndex objects. If combining overlapping or adjoining ranges with the same DateOffset, this will be much faster than Index.union Parameters ---------- other : TimedeltaIndex or array-like Returns ------- y : Index or TimedeltaIndex """ self._assert_can_do_setop(other) if not isinstance(other, TimedeltaIndex): try: other = TimedeltaIndex(other) except (TypeError, ValueError): pass this, other = self, other if this._can_fast_union(other): return this._fast_union(other) else: result = Index.union(this, other) if isinstance(result, TimedeltaIndex): if result.freq is None: result.freq = to_offset(result.inferred_freq) return result
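A quick check of the fast-union path described above, assuming current pandas semantics for TimedeltaIndex.union: two adjoining regular ranges with the same freq combine into one regular index.

# Fast union of adjoining TimedeltaIndexes preserves the frequency.
import pandas as pd

left = pd.timedelta_range('1 day', periods=3, freq='D')     # 1, 2, 3 days
right = pd.timedelta_range('4 days', periods=3, freq='D')   # 4, 5, 6 days
combined = left.union(right)
print(combined.freq)   # <Day> -- frequency is preserved by the fast path
print(len(combined))   # 6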