def test_panel_concat_other_axes(self):
    panel = tm.makePanel()

    p1 = panel.ix[:, :5, :]
    p2 = panel.ix[:, 5:, :]

    result = concat([p1, p2], axis=1)
    tm.assert_panel_equal(result, panel)

    p1 = panel.ix[:, :, :2]
    p2 = panel.ix[:, :, 2:]

    result = concat([p1, p2], axis=2)
    tm.assert_panel_equal(result, panel)

    # if things are a bit misbehaved
    p1 = panel.ix[:2, :, :2]
    p2 = panel.ix[:, :, 2:]
    p1['ItemC'] = 'baz'

    result = concat([p1, p2], axis=2)

    expected = panel.copy()
    expected['ItemC'] = expected['ItemC'].astype('O')
    expected.ix['ItemC', :, :2] = 'baz'
    tm.assert_panel_equal(result, expected)
def test_concat_keys_and_levels(self):
    df = DataFrame(np.random.randn(1, 3))
    df2 = DataFrame(np.random.randn(1, 4))

    levels = [['foo', 'baz'], ['one', 'two']]
    names = ['first', 'second']
    result = concat([df, df2, df, df2],
                    keys=[('foo', 'one'), ('foo', 'two'),
                          ('baz', 'one'), ('baz', 'two')],
                    levels=levels,
                    names=names)
    expected = concat([df, df2, df, df2])
    exp_index = MultiIndex(levels=levels + [[0]],
                           labels=[[0, 0, 1, 1], [0, 1, 0, 1],
                                   [0, 0, 0, 0]],
                           names=names + [None])
    expected.index = exp_index

    assert_frame_equal(result, expected)

    # no names
    result = concat([df, df2, df, df2],
                    keys=[('foo', 'one'), ('foo', 'two'),
                          ('baz', 'one'), ('baz', 'two')],
                    levels=levels)
    self.assertEqual(result.index.names, [None] * 3)

    # no levels
    result = concat([df, df2, df, df2],
                    keys=[('foo', 'one'), ('foo', 'two'),
                          ('baz', 'one'), ('baz', 'two')],
                    names=['first', 'second'])
    self.assertEqual(result.index.names, ['first', 'second'] + [None])
    self.assert_(np.array_equal(result.index.levels[0], ['baz', 'foo']))
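For orientation, a minimal sketch (the frames and values below are illustrative, not from the test suite) of how `keys`, `levels`, and `names` combine into the resulting MultiIndex:

import numpy as np
import pandas as pd

a = pd.DataFrame(np.random.randn(1, 3))
b = pd.DataFrame(np.random.randn(1, 3))

out = pd.concat(
    [a, b],
    keys=[('foo', 'one'), ('foo', 'two')],     # one tuple per piece
    levels=[['foo', 'baz'], ['one', 'two']],   # pins level contents and order
    names=['first', 'second'],                 # names for the key levels
)
# The row index is a 3-level MultiIndex: 'first', 'second', then each
# piece's original (unnamed) index.
print(out.index)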
def test_concat_with_group_keys(self):
    df = DataFrame(np.random.randn(4, 3))
    df2 = DataFrame(np.random.randn(4, 4))

    # axis=0
    df = DataFrame(np.random.randn(3, 4))
    df2 = DataFrame(np.random.randn(4, 4))

    result = concat([df, df2], keys=[0, 1])
    exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1],
                                        [0, 1, 2, 0, 1, 2, 3]])
    expected = DataFrame(np.r_[df.values, df2.values],
                         index=exp_index)
    tm.assert_frame_equal(result, expected)

    result = concat([df, df], keys=[0, 1])
    exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
                                         [0, 1, 2, 0, 1, 2]])
    expected = DataFrame(np.r_[df.values, df.values],
                         index=exp_index2)
    tm.assert_frame_equal(result, expected)

    # axis=1
    df = DataFrame(np.random.randn(4, 3))
    df2 = DataFrame(np.random.randn(4, 4))

    result = concat([df, df2], keys=[0, 1], axis=1)
    expected = DataFrame(np.c_[df.values, df2.values],
                         columns=exp_index)
    tm.assert_frame_equal(result, expected)

    result = concat([df, df], keys=[0, 1], axis=1)
    expected = DataFrame(np.c_[df.values, df.values],
                         columns=exp_index2)
    tm.assert_frame_equal(result, expected)
def test_concat_keys_and_levels(self):
    df = DataFrame(np.random.randn(1, 3))
    df2 = DataFrame(np.random.randn(1, 4))

    levels = [["foo", "baz"], ["one", "two"]]
    names = ["first", "second"]
    result = concat(
        [df, df2, df, df2],
        keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
        levels=levels,
        names=names,
    )
    expected = concat([df, df2, df, df2])
    exp_index = MultiIndex(
        levels=levels + [[0]],
        labels=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]],
        names=names + [None],
    )
    expected.index = exp_index

    assert_frame_equal(result, expected)

    # no names
    result = concat(
        [df, df2, df, df2],
        keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
        levels=levels,
    )
    self.assertEqual(result.index.names, [None] * 3)

    # no levels
    result = concat(
        [df, df2, df, df2],
        keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")],
        names=["first", "second"],
    )
    self.assertEqual(result.index.names, ["first", "second"] + [None])
    self.assert_(np.array_equal(result.index.levels[0], ["baz", "foo"]))
def test_join_dups(self):
    # joining dups
    df = concat([
        DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
        DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                  columns=['A', 'C'])
    ], axis=1)

    expected = concat([df, df], axis=1)
    result = df.join(df, rsuffix='_2')
    result.columns = expected.columns
    assert_frame_equal(result, expected)

    # GH 4975, invalid join on dups
    w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

    dta = x.merge(y, left_index=True, right_index=True).merge(
        z, left_index=True, right_index=True, how="outer")
    dta = dta.merge(w, left_index=True, right_index=True)
    expected = concat([x, y, z, w], axis=1)
    expected.columns = ['x_x', 'y_x', 'x_y', 'y_y',
                        'x_x', 'y_x', 'x_y', 'y_y']
    assert_frame_equal(dta, expected)
def test_join_dups(self):
    # joining dups
    df = concat([DataFrame(np.random.randn(10, 4),
                           columns=['A', 'A', 'B', 'B']),
                 DataFrame(np.random.randint(0, 10, size=20)
                           .reshape(10, 2),
                           columns=['A', 'C'])],
                axis=1)

    expected = concat([df, df], axis=1)
    result = df.join(df, rsuffix='_2')
    result.columns = expected.columns
    assert_frame_equal(result, expected)

    # GH 4975, invalid join on dups
    w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
    z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

    dta = x.merge(y, left_index=True, right_index=True).merge(
        z, left_index=True, right_index=True, how="outer")
    dta = dta.merge(w, left_index=True, right_index=True)
    expected = concat([x, y, z, w], axis=1)
    expected.columns = ['x_x', 'y_x', 'x_y', 'y_y',
                        'x_x', 'y_x', 'x_y', 'y_y']
    assert_frame_equal(dta, expected)
def test_concat_series_axis1(self):
    ts = tm.makeTimeSeries()

    pieces = [ts[:-2], ts[2:], ts[2:-2]]

    result = concat(pieces, axis=1)
    expected = DataFrame(pieces).T
    assert_frame_equal(result, expected)

    result = concat(pieces, keys=['A', 'B', 'C'], axis=1)
    expected = DataFrame(pieces, index=['A', 'B', 'C']).T
    assert_frame_equal(result, expected)
def extend_contour_features(contour_data, extra_features):
    '''
    Extend the SALOMON features with extra features. The extra features
    are inserted after the SALOMON features and before the other contour
    data that starts at 'first_time' (e.g. contour bin data).

    Parameters
    ----------
    extra_features - DataFrame with the features to insert
    '''
    if extra_features is not None:
        dfFeatures = concat([contour_data.ix[:, 0:12], extra_features],
                            axis=1)
        startIdx = contour_data.columns.get_loc('first_time')
        contour_data = concat([dfFeatures, contour_data.ix[:, startIdx:]],
                              axis=1)
    return contour_data
def _wrap_frames(self, keys, values, not_indexed_same=False):
    from pandas.tools.merge import concat

    if not_indexed_same:
        group_keys = keys
        group_levels = [ping.group_index for ping in self.groupings]
        group_names = [ping.name for ping in self.groupings]
        result = concat(values, axis=self.axis, keys=group_keys,
                        levels=group_levels, names=group_names)
    else:
        result = concat(values, axis=self.axis)
        ax = self.obj._get_axis(self.axis)
        result = result.reindex_axis(ax, axis=self.axis)

    return result
def _wrap_frames(self, keys, values, not_indexed_same=False):
    from pandas.tools.merge import concat, _concat_frames_hierarchical

    if not_indexed_same:
        group_keys = keys
        group_levels = [ping.group_index for ping in self.groupings]
        group_names = [ping.name for ping in self.groupings]
        result = concat(values, axis=self.axis, keys=group_keys,
                        levels=group_levels, names=group_names)
    else:
        result = concat(values, axis=self.axis)
        ax = self.obj._get_axis(self.axis)
        result = result.reindex_axis(ax, axis=self.axis)

    return result
def get_pandas_df(self, bql, parameters=None, dialect='legacy'):
    """
    Returns a Pandas DataFrame for the results produced by a BigQuery
    query. The DbApiHook method must be overridden because Pandas
    doesn't support PEP 249 connections, except for SQLite. See:

    https://github.com/pydata/pandas/blob/master/pandas/io/sql.py#L447
    https://github.com/pydata/pandas/issues/6900

    :param bql: The BigQuery SQL to execute.
    :type bql: string
    :param parameters: The parameters to render the SQL query with (not
        used, leave to override superclass method)
    :type parameters: mapping or iterable
    :param dialect: Dialect of BigQuery SQL – legacy SQL or standard SQL
    :type dialect: string in {'legacy', 'standard'}, default 'legacy'
    """
    service = self.get_service()
    project = self._get_field('project')
    connector = BigQueryPandasConnector(project, service, dialect=dialect)
    schema, pages = connector.run_query(bql)
    dataframe_list = []

    while len(pages) > 0:
        page = pages.pop()
        dataframe_list.append(gbq_parse_data(schema, page))

    if len(dataframe_list) > 0:
        return concat(dataframe_list, ignore_index=True)
    else:
        return gbq_parse_data(schema, [])
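A hedged usage sketch for this hook method; the import path, connection id, and query below are assumptions that depend on the Airflow deployment, not part of the snippet above:

# Hypothetical usage -- import path and connection id vary by Airflow version.
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

hook = BigQueryHook(bigquery_conn_id='bigquery_default')
df = hook.get_pandas_df(
    'SELECT year, COUNT(1) AS cnt '
    'FROM [publicdata:samples.natality] GROUP BY year',
    dialect='legacy',
)
print(df.head())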
def get_pandas_df(self, bql, parameters=None):
    """
    Returns a Pandas DataFrame for the results produced by a BigQuery
    query. The DbApiHook method must be overridden because Pandas
    doesn't support PEP 249 connections, except for SQLite. See:

    https://github.com/pydata/pandas/blob/master/pandas/io/sql.py#L447
    https://github.com/pydata/pandas/issues/6900

    :param bql: The BigQuery SQL to execute.
    :type bql: string
    """
    service = self.get_service()
    connection_extras = self._extras_dejson()
    project = connection_extras['project']
    connector = BigQueryPandasConnector(project, service)
    schema, pages = connector.run_query(bql, verbose=False)
    dataframe_list = []

    while len(pages) > 0:
        page = pages.pop()
        dataframe_list.append(gbq_parse_data(schema, page))

    if len(dataframe_list) > 0:
        return concat(dataframe_list, ignore_index=True)
    else:
        return gbq_parse_data(schema, [])
def test_concat_exclude_none(self):
    df = DataFrame(np.random.randn(10, 4))

    pieces = [df[:5], None, None, df[5:]]
    result = concat(pieces)
    tm.assert_frame_equal(result, df)
    self.assertRaises(Exception, concat, [None, None])
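A quick illustration of the contract this test pins down (a sketch with a made-up frame): `None` entries are silently dropped, while a list of only `None` raises.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(4, 2))

# None pieces are skipped, so the original frame round-trips.
assert pd.concat([df[:2], None, df[2:]]).equals(df)

# pd.concat([None, None]) raises (ValueError in current pandas).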
def get_pandas_df(self, bql, parameters=None):
    """
    Returns a Pandas DataFrame for the results produced by a BigQuery
    query. The DbApiHook method must be overridden because Pandas
    doesn't support PEP 249 connections, except for SQLite. See:

    https://github.com/pydata/pandas/blob/master/pandas/io/sql.py#L447
    https://github.com/pydata/pandas/issues/6900

    :param bql: The BigQuery SQL to execute.
    :type bql: string
    """
    service = self.get_service()
    project = self._get_field('project')
    connector = BigQueryPandasConnector(project, service)
    schema, pages = connector.run_query(bql)
    dataframe_list = []

    while len(pages) > 0:
        page = pages.pop()
        dataframe_list.append(gbq_parse_data(schema, page))

    if len(dataframe_list) > 0:
        return concat(dataframe_list, ignore_index=True)
    else:
        return gbq_parse_data(schema, [])
def get_series(self, data, column=None, column_label=None):
    '''
    Get Quandl series
    ------
    column: list
        the list of columns to get from Quandl
    ------
    column_label: list
        the corresponding labels for each column retrieved from Quandl
    ------
    '''
    if len(column) != 0 and len(column) == len(column_label):
        all_data = []
        for i, item in enumerate(column):
            URL = ("%sdatasets/%s.json?column=%d&auth_token=otf6VxzVxjm5ZGLztqbG"
                   % (self.Root, data, item))
            try:
                response = urlopen(URL)
                results = json.loads(response.read())
                points = {}
                for point in results['data']:
                    date = to_datetime(point[0], format='%Y-%m-%d')
                    if hasattr(date, 'to_datetime'):
                        date = date.to_datetime()
                    points[date] = point[1]
                data_label = column_label[i]
                points = DataFrame({data_label: Series(points)})
                all_data.append(points)
            except HTTPError as exc:
                # read() consumes the response stream, so only call it once
                body = exc.read()
                print body
                message = json.loads(body)
                raise ValueError("For %s, %s" % (data, message['error']))
        points = concat(all_data, axis=1, join='outer')
        return points
def concat(self, frame, axis=0):
    if self._pandas:
        from pandas.tools.merge import concat
        return concat((self, frame), axis=axis)
    else:
        if axis == 0:
            if self._columns != frame._columns:
                raise ValueError(
                    'Cannot concat two frames of different columns')
            return ResultFrame(self._values + frame._values,
                               columns=self._columns,
                               index=self._index + frame._index,
                               pandas=self._pandas)
        else:
            if self._index != frame._index:
                raise ValueError(
                    'Cannot concat two frames of different indexes')
            values = [val + other for val, other
                      in zip(self._values, frame._values)]
            return ResultFrame(values, self._columns + frame._columns,
                               index=self._index, pandas=self._pandas)
def test_concat_dataframe_keys_bug(self):
    t1 = DataFrame({"value": Series([1, 2, 3],
                                    index=Index(["a", "b", "c"], name="id"))})
    t2 = DataFrame({"value": Series([7, 8],
                                    index=Index(["a", "b"], name="id"))})

    # it works
    result = concat([t1, t2], axis=1, keys=["t1", "t2"])
    self.assertEqual(list(result.columns), [("t1", "value"), ("t2", "value")])
def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
                               grand_margin, margins_name='All'):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ('',) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(rows).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0, axis=cat_axis):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                try:
                    piece[all_key] = margin[key]
                except TypeError:
                    # we cannot reshape, so coerce the axis
                    piece.set_axis(cat_axis, piece._get_axis(
                        cat_axis)._to_safe_for_reshape())
                    piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            margin = grand_margin
            cat_axis = 0

            for key, piece in table.groupby(level=0, axis=cat_axis):
                all_key = _all_key(key)
                table_pieces.append(piece)
                table_pieces.append(Series(margin[key], index=[all_key]))
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(cols).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + lrange(len(cols))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
def test_pivot_multi_functions(self):
    f = lambda func: pivot_table(
        self.data, values=["D", "E"], index=["A", "B"], columns="C",
        aggfunc=func
    )
    result = f([np.mean, np.std])
    means = f(np.mean)
    stds = f(np.std)
    expected = concat([means, stds], keys=["mean", "std"], axis=1)
    tm.assert_frame_equal(result, expected)

    # margins not supported??
    f = lambda func: pivot_table(
        self.data, values=["D", "E"], index=["A", "B"], columns="C",
        aggfunc=func, margins=True
    )
    result = f([np.mean, np.std])
    means = f(np.mean)
    stds = f(np.std)
    expected = concat([means, stds], keys=["mean", "std"], axis=1)
    tm.assert_frame_equal(result, expected)
def test_concat_keys_specific_levels(self):
    df = DataFrame(np.random.randn(10, 4))
    pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]]
    level = ["three", "two", "one", "zero"]
    result = concat(pieces, axis=1, keys=["one", "two", "three"],
                    levels=[level], names=["group_key"])

    self.assert_(np.array_equal(result.columns.levels[0], level))
    self.assertEqual(result.columns.names[0], "group_key")
def _wrap_frames(self, keys, values, not_indexed_same=False):
    from pandas.tools.merge import concat

    if not_indexed_same:
        group_keys = keys
        group_levels = self.grouper.levels
        group_names = self.grouper.names
        result = concat(values, axis=self.axis, keys=group_keys,
                        levels=group_levels, names=group_names)
    else:
        result = concat(values, axis=self.axis)
        ax = self.obj._get_axis(self.axis)
        result = result.reindex_axis(ax, axis=self.axis)

    return result
def test_pivot_multi_functions(self):
    f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                 rows=['A', 'B'], cols='C', aggfunc=func)
    result = f([np.mean, np.std])
    means = f(np.mean)
    stds = f(np.std)
    expected = concat([means, stds], keys=['mean', 'std'], axis=1)
    tm.assert_frame_equal(result, expected)

    # margins not supported??
    f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                 rows=['A', 'B'], cols='C', aggfunc=func,
                                 margins=True)
    result = f([np.mean, np.std])
    means = f(np.mean)
    stds = f(np.std)
    expected = concat([means, stds], keys=['mean', 'std'], axis=1)
    tm.assert_frame_equal(result, expected)
def test_concat_series(self):
    ts = tm.makeTimeSeries()
    ts.name = 'foo'

    pieces = [ts[:5], ts[5:15], ts[15:]]

    result = concat(pieces)
    tm.assert_series_equal(result, ts)
    self.assertEqual(result.name, ts.name)

    result = concat(pieces, keys=[0, 1, 2])
    expected = ts.copy()

    exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]),
                  np.arange(len(ts))]
    exp_index = MultiIndex(levels=[[0, 1, 2], ts.index],
                           labels=exp_labels)
    expected.index = exp_index
    tm.assert_series_equal(result, expected)
def test_concat_keys_specific_levels(self):
    df = DataFrame(np.random.randn(10, 4))
    pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]]
    level = ['three', 'two', 'one', 'zero']
    result = concat(pieces, axis=1, keys=['one', 'two', 'three'],
                    levels=[level], names=['group_key'])

    self.assert_(np.array_equal(result.columns.levels[0], level))
    self.assertEqual(result.columns.names[0], 'group_key')
def test_concat_dataframe_keys_bug(self):
    t1 = DataFrame({'value': Series([1, 2, 3],
                                    index=Index(['a', 'b', 'c'], name='id'))})
    t2 = DataFrame({'value': Series([7, 8],
                                    index=Index(['a', 'b'], name='id'))})

    # it works
    result = concat([t1, t2], axis=1, keys=['t1', 't2'])
    self.assertEqual(list(result.columns), [('t1', 'value'), ('t2', 'value')])
def test_concat_series(self):
    ts = tm.makeTimeSeries()
    ts.name = "foo"

    pieces = [ts[:5], ts[5:15], ts[15:]]

    result = concat(pieces)
    tm.assert_series_equal(result, ts)
    self.assertEqual(result.name, ts.name)

    result = concat(pieces, keys=[0, 1, 2])
    expected = ts.copy()
    ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[us]"))

    exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]),
                  np.arange(len(ts))]
    exp_index = MultiIndex(levels=[[0, 1, 2], ts.index],
                           labels=exp_labels)
    expected.index = exp_index
    tm.assert_series_equal(result, expected)
def test_pivot_multi_functions(self):
    f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                 index=['A', 'B'], columns='C',
                                 aggfunc=func)
    result = f([np.mean, np.std])
    means = f(np.mean)
    stds = f(np.std)
    expected = concat([means, stds], keys=['mean', 'std'], axis=1)
    tm.assert_frame_equal(result, expected)

    # margins not supported??
    f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                 index=['A', 'B'], columns='C',
                                 aggfunc=func, margins=True)
    result = f([np.mean, np.std])
    means = f(np.mean)
    stds = f(np.std)
    expected = concat([means, stds], keys=['mean', 'std'], axis=1)
    tm.assert_frame_equal(result, expected)
def _wrap_frames(self, keys, values, not_indexed_same=False):
    from pandas.tools.merge import concat, _concat_frames_hierarchical

    if not_indexed_same:
        result = _concat_frames_hierarchical(values, keys,
                                             self.groupings,
                                             axis=self.axis)
    else:
        result = concat(values, axis=0).reindex(self.obj.index)

    return result
def test_concat_series(self):
    ts = tm.makeTimeSeries()
    ts.name = 'foo'

    pieces = [ts[:5], ts[5:15], ts[15:]]

    result = concat(pieces)
    tm.assert_series_equal(result, ts)
    self.assertEqual(result.name, ts.name)

    result = concat(pieces, keys=[0, 1, 2])
    expected = ts.copy()

    exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]),
                  np.arange(len(ts))]
    exp_index = MultiIndex(levels=[[0, 1, 2], ts.index],
                           labels=exp_labels)
    expected.index = exp_index
    tm.assert_series_equal(result, expected)

    self.assertRaises(Exception, concat, pieces, axis=1)
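For context, a minimal sketch (a toy Series, not the time series above) of the index that `keys` produces when concatenating Series:

import pandas as pd

s = pd.Series(range(6), index=list('abcdef'))
pieces = [s[:2], s[2:4], s[4:]]

stacked = pd.concat(pieces, keys=[0, 1, 2])
# Outer level carries the keys; inner level keeps each piece's own labels:
# (0, 'a'), (0, 'b'), (1, 'c'), (1, 'd'), (2, 'e'), (2, 'f')
print(stacked.index)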
def transform(self, func, *args, **kwargs):
    """
    Call function producing a like-indexed DataFrame on each group and
    return a DataFrame having the same indexes as the original object
    filled with the transformed values

    Parameters
    ----------
    func : function
        Function to apply to each subframe

    Note
    ----
    Each subframe is endowed with the attribute 'name' in case you need
    to know which group you are working on.

    Example
    --------
    >>> grouped = df.groupby(lambda x: mapping[x])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    """
    from pandas.tools.merge import concat

    applied = []
    obj = self._obj_with_exclusions
    for name, group in self:
        group.name = name

        try:
            wrapper = lambda x: func(x, *args, **kwargs)
            res = group.apply(wrapper, axis=self.axis)
        except Exception:  # pragma: no cover
            res = func(group, *args, **kwargs)

        # broadcasting
        if isinstance(res, Series):
            if res.index is obj.index:
                group.T.values[:] = res
            else:
                group.values[:] = res

            applied.append(group)
        else:
            applied.append(res)

    concat_index = obj.columns if self.axis == 0 else obj.index
    concatenated = concat(applied, join_axes=[concat_index],
                          axis=self.axis, verify_integrity=False)
    return concatenated.reindex_like(obj)
def join(self, other, how='left', lsuffix='', rsuffix=''):
    """
    Join items with other Panel either on major and minor axes column

    Parameters
    ----------
    other : Panel or list of Panels
        Index should be similar to one of the columns in this one
    how : {'left', 'right', 'outer', 'inner'}
        How to handle indexes of the two objects. Default: 'left'
        for joining on index, None otherwise
        * left: use calling frame's index
        * right: use input frame's index
        * outer: form union of indexes
        * inner: use intersection of indexes
    lsuffix : string
        Suffix to use from left frame's overlapping columns
    rsuffix : string
        Suffix to use from right frame's overlapping columns

    Returns
    -------
    joined : Panel
    """
    from pandas.tools.merge import concat

    if isinstance(other, Panel):
        join_major, join_minor = self._get_join_index(other, how)
        this = self.reindex(major=join_major, minor=join_minor)
        other = other.reindex(major=join_major, minor=join_minor)
        merged_data = this._data.merge(other._data, lsuffix, rsuffix)
        return self._constructor(merged_data)
    else:
        if lsuffix or rsuffix:
            raise ValueError('Suffixes not supported when passing '
                             'multiple panels')

        if how == 'left':
            how = 'outer'
            join_axes = [self.major_axis, self.minor_axis]
        elif how == 'right':
            raise ValueError('Right join not supported with multiple '
                             'panels')
        else:
            join_axes = None

        return concat([self] + list(other), axis=0, join=how,
                      join_axes=join_axes, verify_integrity=True)
def forecast(self, forecast_start_str, forecast_period_in_days,
             periods_of_data_to_use):
    '''Perform the forecast and return forecast as pandas Series object'''

    # create forecast index
    forecast_index = date_range(forecast_start_str,
                                periods=forecast_period_in_days)

    # Extract only that data which is necessary to make the first
    # moving average calculation
    data_series = self.training_ts.tail(periods_of_data_to_use)

    forecast = Series()

    for time in forecast_index:
        # the forecasted value is the last value in the rolling_mean
        # result -- all others are NaN because of the forecast window
        # length (note: positional .iloc[-1], since a label lookup with
        # .loc[-1] would fail on the datetime index)
        if self.forecast_method == 'ma':
            # Forecast using the simple moving average
            forecast_value = rolling_mean(data_series,
                                          periods_of_data_to_use).iloc[-1]
        elif self.forecast_method == 'ewma':
            # forecast using the exponentially weighted moving average
            forecast_value = ewma(data_series,
                                  span=periods_of_data_to_use).iloc[-1]

        # remove the 1st value from the data because it's not needed for
        # the next forecasted value
        data_series = data_series[1:]

        # Append the forecasted value to the data because the forecast is
        # input data for the next iteration's moving average
        data_series = concat([data_series,
                              Series(forecast_value, index=[time])])
        forecast = concat([forecast, Series(forecast_value, index=[time])])

    return forecast
def test_concat_multiindex_with_keys(self):
    index = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    frame = DataFrame(np.random.randn(10, 3), index=index,
                      columns=Index(["A", "B", "C"], name="exp"))
    result = concat([frame, frame], keys=[0, 1], names=["iteration"])

    self.assertEqual(result.index.names, ["iteration"] + index.names)
    tm.assert_frame_equal(result.ix[0], frame)
    tm.assert_frame_equal(result.ix[1], frame)
    self.assertEqual(result.index.nlevels, 3)
def test_concat_multiindex_with_keys(self):
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    frame = DataFrame(np.random.randn(10, 3), index=index,
                      columns=Index(['A', 'B', 'C'], name='exp'))
    result = concat([frame, frame], keys=[0, 1], names=['iteration'])

    self.assertEqual(result.index.names, ['iteration'] + index.names)
    tm.assert_frame_equal(result.ix[0], frame)
    tm.assert_frame_equal(result.ix[1], frame)
    self.assertEqual(result.index.nlevels, 3)
def _add_margins(table, data, values, rows=None, cols=None, aggfunc=np.mean):
    if len(cols) > 0:
        col_margin = data[rows + values].groupby(rows).agg(aggfunc)

        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []
        for key, piece in table.groupby(level=0, axis=1):
            all_key = (key, 'All') + ('',) * (len(cols) - 1)
            piece[all_key] = col_margin[key]
            table_pieces.append(piece)
            margin_keys.append(all_key)
        result = concat(table_pieces, axis=1)
    else:
        result = table
        margin_keys = table.columns

    grand_margin = {}
    for k, v in data[values].iteritems():
        try:
            grand_margin[k] = aggfunc(v)
        except TypeError:
            pass

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(cols).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + range(len(cols))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    key = ('All',) + ('',) * (len(rows) - 1)

    row_margin = row_margin.reindex(result.columns)
    # populate grand margin
    for k in margin_keys:
        if len(cols) > 0:
            row_margin[k] = grand_margin[k[0]]
        else:
            row_margin[k] = grand_margin[k]

    margin_dummy = DataFrame(row_margin, columns=[key]).T
    result = result.append(margin_dummy)
    return result
def _add_margins(table, data, values, rows=None, cols=None, aggfunc=np.mean):
    if len(cols) > 0:
        col_margin = data[rows + values].groupby(rows).agg(aggfunc)

        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []
        for key, piece in table.groupby(level=0, axis=1):
            all_key = (key, 'All') + ('',) * (len(cols) - 1)
            piece[all_key] = col_margin[key]
            table_pieces.append(piece)
            margin_keys.append(all_key)
        result = concat(table_pieces, axis=1)
    else:
        result = table
        margin_keys = table.columns

    grand_margin = {}
    for k, v in data[values].iteritems():
        try:
            grand_margin[k] = aggfunc(v)
        except TypeError:
            pass

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(cols).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + range(len(cols))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    key = ('All',) + ('',) * (len(rows) - 1)

    row_margin = row_margin.reindex(result.columns)
    # populate grand margin
    for k in margin_keys:
        if len(cols) > 0:
            row_margin[k] = grand_margin[k[0]]
        else:
            row_margin[k] = grand_margin[k]

    margin_dummy = DataFrame(row_margin, columns=[key]).T
    result = result.append(margin_dummy)
    return result
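For reference, a small sketch of the public behavior this helper backs, `pivot_table` with `margins=True`; the column names and values below are made up:

import pandas as pd

data = pd.DataFrame({'A': ['x', 'x', 'y', 'y'],
                     'C': ['p', 'q', 'p', 'q'],
                     'D': [1.0, 2.0, 3.0, 4.0]})

# margins=True appends an 'All' row and column computed with the same
# aggfunc over the unpivoted data.
table = pd.pivot_table(data, values='D', index='A', columns='C',
                       aggfunc='mean', margins=True)
print(table)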
def test_handle_empty_objects(self):
    df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))

    baz = df[:5]
    baz['foo'] = 'bar'
    empty = df[5:5]

    frames = [baz, empty, empty, df[5:]]
    concatted = concat(frames, axis=0)

    expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']]
    expected['foo'] = expected['foo'].astype('O')
    expected['foo'][:5] = 'bar'

    tm.assert_frame_equal(concatted, expected)
def test_concat_ignore_index(self):
    frame1 = DataFrame({"test1": ["a", "b", "c"],
                        "test2": [1, 2, 3],
                        "test3": [4.5, 3.2, 1.2]})
    frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
    frame1.index = Index(["x", "y", "z"])
    frame2.index = Index(["x", "y", "q"])

    v1 = concat([frame1, frame2], axis=1, ignore_index=True)

    nan = np.nan
    expected = DataFrame(
        [[nan, nan, nan, 4.3],
         ["a", 1, 4.5, 5.2],
         ["b", 2, 3.2, 2.2],
         ["c", 3, 1.2, nan]],
        index=Index(["q", "x", "y", "z"]),
    )

    tm.assert_frame_equal(v1, expected)
def test_handle_empty_objects(self):
    df = DataFrame(np.random.randn(10, 4), columns=list("abcd"))

    baz = df[:5]
    baz["foo"] = "bar"
    empty = df[5:5]

    frames = [baz, empty, empty, df[5:]]
    concatted = concat(frames, axis=0)

    expected = df.ix[:, ["a", "b", "c", "d", "foo"]]
    expected["foo"] = expected["foo"].astype("O")
    expected["foo"][:5] = "bar"

    tm.assert_frame_equal(concatted, expected)
def join(self, other, how='left', lsuffix='', rsuffix=''):
    """
    Join items with other Panel either on major and minor axes column

    Parameters
    ----------
    other : Panel or list of Panels
        Index should be similar to one of the columns in this one
    how : {'left', 'right', 'outer', 'inner'}
        How to handle indexes of the two objects. Default: 'left'
        for joining on index, None otherwise
        * left: use calling frame's index
        * right: use input frame's index
        * outer: form union of indexes
        * inner: use intersection of indexes
    lsuffix : string
        Suffix to use from left frame's overlapping columns
    rsuffix : string
        Suffix to use from right frame's overlapping columns

    Returns
    -------
    joined : Panel
    """
    from pandas.tools.merge import concat

    if isinstance(other, Panel):
        join_major, join_minor = self._get_join_index(other, how)
        this = self.reindex(major=join_major, minor=join_minor)
        other = other.reindex(major=join_major, minor=join_minor)
        merged_data = this._data.merge(other._data, lsuffix, rsuffix)
        return self._constructor(merged_data)
    else:
        if lsuffix or rsuffix:
            raise ValueError('Suffixes not supported when passing '
                             'multiple panels')

        if how == 'left':
            how = 'outer'
            join_axes = [self.major_axis, self.minor_axis]
        elif how == 'right':
            raise ValueError('Right join not supported with multiple '
                             'panels')
        else:
            join_axes = None

        return concat([self] + list(other), axis=0, join=how,
                      join_axes=join_axes, verify_integrity=True)
def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
                               grand_margin):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, 'All') + ('',) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(rows).agg(aggfunc)
            cat_axis = 1
            for key, piece in table.groupby(level=0, axis=cat_axis):
                all_key = _all_key(key)
                piece[all_key] = margin[key]
                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            margin = grand_margin
            cat_axis = 0
            for key, piece in table.groupby(level=0, axis=cat_axis):
                all_key = _all_key(key)
                table_pieces.append(piece)
                table_pieces.append(Series(margin[key], index=[all_key]))
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(cols).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + lrange(len(cols))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
def _aggregate_multiple_funcs(self, arg, _level):
    from pandas.tools.merge import concat

    if self.axis != 0:
        raise NotImplementedError("axis other than 0 is not supported")

    if self._selected_obj.ndim == 1:
        obj = self._selected_obj
    else:
        obj = self._obj_with_exclusions

    results = []
    keys = []

    # degenerate case
    if obj.ndim == 1:
        for a in arg:
            try:
                colg = self._gotitem(obj.name, ndim=1, subset=obj)
                results.append(colg.aggregate(a))

                # make sure we find a good name
                name = com._get_callable_name(a) or a
                keys.append(name)
            except (TypeError, DataError):
                pass
            except SpecificationError:
                raise

    # multiples
    else:
        for col in obj:
            try:
                colg = self._gotitem(col, ndim=1, subset=obj[col])
                results.append(colg.aggregate(arg))
                keys.append(col)
            except (TypeError, DataError):
                pass
            except SpecificationError:
                raise

    if _level:
        keys = None
    result = concat(results, keys=keys, axis=1)

    return result
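A hedged sketch of the public entry point that reaches this multiple-functions path, `DataFrame.agg` with a list of functions (the frame contents are illustrative):

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})

# Each column is aggregated once per function and the per-column results
# are concatenated along axis=1, keyed by column name.
print(df.agg(['sum', 'mean']))
#          a     b
# sum    6.0  15.0
# mean   2.0   5.0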
def test_concat_ignore_index(self):
    frame1 = DataFrame({"test1": ["a", "b", "c"],
                        "test2": [1, 2, 3],
                        "test3": [4.5, 3.2, 1.2]})
    frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
    frame1.index = Index(["x", "y", "z"])
    frame2.index = Index(["x", "y", "q"])

    v1 = concat([frame1, frame2], axis=1, ignore_index=True)

    nan = np.nan
    expected = DataFrame([[nan, nan, nan, 4.3],
                          ['a', 1, 4.5, 5.2],
                          ['b', 2, 3.2, 2.2],
                          ['c', 3, 1.2, nan]],
                         index=Index(["q", "x", "y", "z"]))

    tm.assert_frame_equal(v1, expected)
def get_pandas_df(self, bql, parameters=None):
    """
    Returns a Pandas DataFrame for the results produced by a BigQuery
    query.
    """
    service = self.get_conn()
    connection_info = self.get_connection(self.bigquery_conn_id)
    connection_extras = connection_info.extra_dejson
    project = connection_extras['project']
    connector = BigQueryPandasConnector(project, service)
    schema, pages = connector.run_query(bql, verbose=False)
    dataframe_list = []

    while len(pages) > 0:
        page = pages.pop()
        dataframe_list.append(gbq_parse_data(schema, page))

    if len(dataframe_list) > 0:
        return concat(dataframe_list, ignore_index=True)
    else:
        return gbq_parse_data(schema, [])
def describe(self):
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by level.
    """
    # Hack?
    from pandas.core.frame import DataFrame
    counts = DataFrame({
        'codes': self._codes,
        'values': self._codes
    }).groupby('codes').count()

    freqs = counts / float(counts.sum())

    from pandas.tools.merge import concat
    result = concat([counts, freqs], axis=1)
    result.columns = ['counts', 'freqs']

    # fill in the real levels
    check = result.index == -1
    if check.any():
        # Sort -1 (=NaN) to the last position
        index = np.arange(0, len(self.levels) + 1, dtype='int64')
        index[-1] = -1
        result = result.reindex(index)
        # build new index
        levels = np.arange(0, len(self.levels) + 1, dtype=object)
        levels[:-1] = self.levels
        levels[-1] = np.nan
        result.index = levels.take(com._ensure_platform_int(result.index))
    else:
        result.index = self.levels.take(
            com._ensure_platform_int(result.index))
        result = result.reindex(self.levels)
    result.index.name = 'levels'

    return result
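The same counts-and-freqs-by-level table can be sketched with public API calls; a minimal equivalent using `value_counts` rather than the internal codes-based path above:

import pandas as pd

cat = pd.Categorical(['a', 'b', 'a', None, 'a'], categories=['a', 'b', 'c'])

counts = pd.Series(cat).value_counts(dropna=False)
freqs = counts / counts.sum()

# concat along axis=1 mirrors the internal construction above.
print(pd.concat([counts, freqs], axis=1, keys=['counts', 'freqs']))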
def main():
    try:
        print "Attempting to read pickle-d DataFrames."
        df = {}
        df['survival'] = pd.DataFrame()
        df['mortality'] = pd.DataFrame()
        df['survival'] = pd.read_pickle('data/survival.pkl')
        df['mortality'] = pd.read_pickle('data/mortality.pkl')
        print "DataFrames imported!"
    except:
        print "Reading failed! Creating DataFrames."
        df = create_df()
    finally:
        # print "Creating mortality scatter plot matrix."
        # plt.figure()
        # pd.tools.plotting.scatter_matrix(df['mortality'])
        # F = plt.gcf()
        # F.set_size_inches((50, 50))
        # F.savefig('graphs/scatter_mortality.png',
        #           bbox_inches='tight',
        #           dpi=150)
        #
        # print "Creating survival scatter plot matrix."
        # plt.figure()
        # pd.tools.plotting.scatter_matrix(df['survival'])
        # F = plt.gcf()
        # F.set_size_inches((50, 50))
        # F.savefig('graphs/scatter_survival.png',
        #           bbox_inches='tight',
        #           dpi=150)

        print "Creating Andrew plot."
        plt.figure()
        pd.tools.plotting.andrews_curves(
            concat([df['mortality'][0::10], df['survival'][0::10]]), 'death')
        F = plt.gcf()
        F.set_size_inches((10, 10))
        F.savefig('graphs/andrews_curves.png',
                  bbox_inches='tight',
                  dpi=150)
def _aggregate_multiple_funcs(self, arg):
    from pandas.tools.merge import concat

    if self.axis != 0:
        raise NotImplementedError

    obj = self._obj_with_exclusions

    results = []
    keys = []
    for col in obj:
        try:
            colg = SeriesGroupBy(obj[col], column=col,
                                 groupings=self.groupings)
            results.append(colg.agg(arg))
            keys.append(col)
        except TypeError:
            pass

    result = concat(results, keys=keys, axis=1)

    return result