def test_meta(self):
    # Skipped: meta-attribute propagation is not implemented.  The body
    # below documents the intended contract and is currently dead code.
    raise nose.SkipTest('no meta')
    meta = { 'foo' : [ 'I love pandas ' ] }
    # a Series with a .meta attribute should round-trip it through the store
    s = tm.makeTimeSeries()
    s.meta = meta
    self.store['a'] = s
    self.assert_(self.store['a'].meta == meta)
    # same for a DataFrame
    df = tm.makeDataFrame()
    df.meta = meta
    self.store['b'] = df
    self.assert_(self.store['b'].meta == meta)
    # appending slices should also carry meta, but slicing does not
    # propagate .meta, so the equality assertion stays commented out
    self.store.remove('df1')
    self.store.append('df1', df[:10])
    self.store.append('df1', df[10:])
    results = self.store['df1']
    #self.assert_(getattr(results,'meta',None) == meta)
    # an object stored without meta must come back without a .meta attribute
    df = tm.makeDataFrame()
    self.store['b'] = df
    self.assert_(hasattr(self.store['b'],'meta') == False)
def test_describe_percentiles(self):
    """Deprecated ``percentile_width`` still yields the matching rows,
    with a FutureWarning."""
    for width, upper, lower in [(50, '75%', '25%'), (95, '97.5%', '2.5%')]:
        with tm.assert_produces_warning(FutureWarning):
            described = tm.makeDataFrame().describe(percentile_width=width)
        assert upper in described.index
        assert lower in described.index
def test_contains(self):
    """``in`` resolves both bare keys and '/'-prefixed absolute paths."""
    self.store['a'] = tm.makeTimeSeries()
    self.store['b'] = tm.makeDataFrame()
    self.store['foo/bar'] = tm.makeDataFrame()
    for present in ('a', 'b', 'foo/bar', '/foo/bar'):
        self.assert_(present in self.store)
    # prefixes and leaf names alone must not match
    for absent in ('c', '/foo/b', 'bar'):
        self.assert_(absent not in self.store)
def test_remove(self):
    """remove()/del drop keys, including hierarchical ('b/foo') ones."""
    series = tm.makeTimeSeries()
    frame = tm.makeDataFrame()

    # flat keys
    self.store['a'] = series
    self.store['b'] = frame
    self.store.remove('a')
    self.assertEquals(len(self.store), 1)
    tm.assert_frame_equal(frame, self.store['b'])
    self.store.remove('b')
    self.assertEquals(len(self.store), 0)

    # pathed keys: removing a bare leaf name then the full path
    # leaves only 'a' behind
    self.store['a'] = series
    self.store['b/foo'] = frame
    self.store.remove('foo')
    self.store.remove('b/foo')
    self.assertEquals(len(self.store), 1)

    # removing the parent node removes everything beneath it
    self.store['a'] = series
    self.store['b/foo'] = frame
    self.store.remove('b')
    self.assertEquals(len(self.store), 1)

    # __delitem__ mirrors remove()
    self.store['a'] = series
    self.store['b'] = frame
    del self.store['a']
    del self.store['b']
    self.assertEquals(len(self.store), 0)
def test_cache_updating(self):
    # GH 4939: setting a new row via .ix must update the item cache
    # created by the earlier column access, so every column sees the row
    df = tm.makeDataFrame()
    df['A'] # cache series
    df.ix["Hello Friend"] = df.ix[0]
    assert "Hello Friend" in df['A'].index
    assert "Hello Friend" in df['B'].index
    # GH 10264: writes straight into the underlying values block must be
    # reflected in the frame even after cached intermediates existed
    df = DataFrame(np.zeros((5, 5), dtype='int64'), columns=[ 'a', 'b', 'c', 'd', 'e'], index=range(5))
    df['f'] = 0
    df.f.values[3] = 1
    # TODO(wesm): unused?
    # y = df.iloc[np.arange(2, len(df))]
    df.f.values[3] = 2
    expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[ 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5))
    # only the second write (value 2) should be visible
    expected.at[3, 'f'] = 2
    tm.assert_frame_equal(df, expected)
    expected = Series([0, 0, 0, 2, 0], name='f')
    tm.assert_series_equal(df.f, expected)
def test_reset_index(self):
    """Series.reset_index: column naming, inplace mode, and selective
    MultiIndex level removal."""
    stacked = tm.makeDataFrame()[:5].stack()
    stacked.index.names = ['hash', 'category']
    stacked.name = 'value'
    assert 'value' in stacked.reset_index()
    assert 'value2' in stacked.reset_index(name='value2')

    # inplace=True must produce the same result as the drop=True copy
    expected = stacked.reset_index(drop=True)
    mutated = stacked
    mutated.reset_index(drop=True, inplace=True)
    tm.assert_series_equal(expected, mutated)

    # resetting individual levels of a MultiIndex
    mi = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                    codes=[[0, 0, 0, 0, 0, 0],
                           [0, 1, 2, 0, 1, 2],
                           [0, 1, 0, 1, 0, 1]])
    ser = Series(np.random.randn(6), index=mi)
    assert len(ser.reset_index(level=1).columns) == 2
    remainder = ser.reset_index(level=[0, 2], drop=True)
    tm.assert_index_equal(remainder.index, Index(mi.get_level_values(1)))
    assert isinstance(remainder, Series)
def test_default_boxer_passthrough(self):
    """
    A frame attribute named in ``_boxer_passthrough`` is forwarded as an
    init kwarg to the autoboxing Series class, rather than being stashed
    in the per-column metadata.
    """
    class PassthroughSeries(UserSeries):
        def __init__(self, *args, **kwargs):
            self.bob = kwargs.pop('bob')

    class PassthroughFrame(UserFrame):
        _default_boxer = PassthroughSeries
        _boxer_passthrough = ['bob']

    boxed = PassthroughFrame(tm.makeDataFrame())
    boxed.bob = 'hello'
    column = boxed["A"]
    # the value must not travel via col_meta ...
    assert "bob" not in boxed._col_meta['A']
    # ... but must reach the autoboxed series through its __init__
    assert column.bob == 'hello'
def test_set_index_names(self):
    """set_index with Index/MultiIndex inputs preserves names and the
    index class."""
    frame = tm.makeDataFrame()
    frame.index.name = 'name'
    assert frame.set_index(frame.index).index.names == ['name']

    mi = MultiIndex.from_arrays(frame[['A', 'B']].T.values,
                                names=['A', 'B'])
    mi2 = MultiIndex.from_arrays(frame[['A', 'B', 'A', 'B']].T.values,
                                 names=['A', 'B', 'C', 'D'])
    frame = frame.set_index(['A', 'B'])
    assert frame.set_index(frame.index).index.names == ['A', 'B']
    # must stay a MultiIndex, not collapse to a flat Index of tuples
    assert isinstance(frame.set_index(frame.index).index, MultiIndex)
    tm.assert_index_equal(frame.set_index(frame.index).index, mi)

    renamed = frame.index.rename(['C', 'D'])
    # [MultiIndex, MultiIndex] combines into a single wider MultiIndex
    combined = frame.set_index([frame.index, renamed]).index
    assert isinstance(combined, MultiIndex)
    tm.assert_index_equal(combined, mi2)
def test_supermeta(self):
    """
    Test that supermeta metaclass acts like a super parent to both
    UserSeries and UserFrame
    """
    class CommonBase(composition.PandasSuperMeta):
        """ Test common base """
        # class-level sentinel shared by every subclass instance
        _bob = object()

        @property
        def bob(self):
            return self._bob

    class CommonSeries(with_metaclass(CommonBase, UserSeries)):
        pass

    class CommonFrame(with_metaclass(CommonBase, UserFrame)):
        pass

    bob = CommonBase._bob

    # series: data behaves normally, property resolves via metaclass base
    s = CommonSeries(range(10))
    assert s.ix[3] == 3
    tm.assert_almost_equal(s, range(10))
    assert s.bob is bob
    # instance attribute shadows the class-level sentinel
    s._bob = 123
    assert s.bob == 123

    # frame: same property resolution, and it survives derived objects
    # produced by operations such as .tail()
    df = tm.makeDataFrame()
    fr = CommonFrame(df)
    tm.assert_frame_equal(fr, df)
    assert fr.bob is bob
    assert fr.tail().bob is bob
def test___init__(self):
    """User subclasses may require extra __init__ kwargs of their own
    before delegating to the parent constructor."""
    class KwargSeries(UserSeries):
        def __init__(self, *args, **kwargs):
            # required keyword
            bob = kwargs.pop('bob')
            self.bob = bob
            super(KwargSeries, self).__init__(*args, **kwargs)

    class KwargFrame(UserFrame):
        def __init__(self, *args, **kwargs):
            # required keyword
            bob = kwargs.pop('bob')
            self.bob = bob
            super(KwargFrame, self).__init__(*args, **kwargs)

    series = KwargSeries(range(10), name='hello', bob=123)
    assert series.bob == 123
    frame = KwargFrame(tm.makeDataFrame(), bob='woot')
    assert frame.bob == 'woot'
def test_frame(self):
    """Round-trip DataFrames through the store, NaNs included."""
    frame = tm.makeDataFrame()
    # sprinkle in a couple of missing values
    frame.values[0, 0] = np.nan
    frame.values[5, 3] = np.nan
    self._check_roundtrip_table(frame, tm.assert_frame_equal)
    self._check_roundtrip(frame, tm.assert_frame_equal)
    self._check_roundtrip_table(frame, tm.assert_frame_equal,
                                compression=True)
    self._check_roundtrip(frame, tm.assert_frame_equal,
                          compression=True)

    timeframe = tm.makeTimeDataFrame()
    self._check_roundtrip(timeframe, tm.assert_frame_equal)
    self._check_roundtrip(timeframe, tm.assert_frame_equal,
                          compression=True)

    # an unconsolidated frame comes back consolidated
    frame['foo'] = np.random.randn(len(frame))
    self.store['df'] = frame
    recons = self.store['df']
    self.assert_(recons._data.is_consolidated())

    # storing an empty frame is rejected
    self.assertRaises(ValueError, self._check_roundtrip, frame[:0],
                      tm.assert_frame_equal)
def test_reset_index(self):
    """reset_index on a stacked Series: naming, inplace mode, and
    selective level removal."""
    stacked = tm.makeDataFrame()[:5].stack()
    stacked.index.names = ["hash", "category"]
    stacked.name = "value"
    self.assertIn("value", stacked.reset_index())
    self.assertIn("value2", stacked.reset_index(name="value2"))

    # inplace reset must match the returned drop=True copy
    copied = stacked.reset_index(drop=True)
    same = stacked
    same.reset_index(drop=True, inplace=True)
    assert_series_equal(copied, same)

    mi = MultiIndex(
        levels=[["bar"], ["one", "two", "three"], [0, 1]],
        labels=[[0, 0, 0, 0, 0, 0],
                [0, 1, 2, 0, 1, 2],
                [0, 1, 0, 1, 0, 1]],
    )
    ser = Series(np.random.randn(6), index=mi)
    self.assertEqual(len(ser.reset_index(level=1).columns), 2)
    reduced = ser.reset_index(level=[0, 2], drop=True)
    self.assertTrue(reduced.index.equals(Index(mi.get_level_values(1))))
    tm.assertIsInstance(reduced, Series)
def test_len_keys(self):
    """len() and keys() agree on the set of stored objects."""
    for key, maker in [('a', tm.makeTimeSeries),
                       ('b', tm.makeStringSeries),
                       ('c', tm.makeDataFrame),
                       ('d', tm.makePanel)]:
        self.store[key] = maker()
    self.assertEquals(len(self.store), 4)
    self.assert_(set(self.store.keys()) == set(['a', 'b', 'c', 'd']))
def test_repr(self):
    """repr succeeds on an empty and on a populated store."""
    repr(self.store)
    for key, maker in [('a', tm.makeTimeSeries),
                       ('b', tm.makeStringSeries),
                       ('c', tm.makeDataFrame),
                       ('d', tm.makePanel)]:
        self.store[key] = maker()
    repr(self.store)
def test_write_infer(self, ext, get_random_path):
    """to_pickle infers the codec from the extension: decompressing the
    output by hand must yield a readable plain pickle."""
    compressed_path = get_random_path + ext
    raw_path = get_random_path + ".raw"

    # map the extension back to its compression name
    inferred = None
    for name, extension in self._compression_to_extension.items():
        if extension == ext:
            inferred = name
            break

    with tm.ensure_clean(compressed_path) as p1, \
            tm.ensure_clean(raw_path) as p2:
        frame = tm.makeDataFrame()

        # compression is chosen from the file extension
        frame.to_pickle(p1)

        # decompress into the raw file
        with tm.decompress_file(p1, compression=inferred) as src:
            with open(p2, "wb") as dst:
                dst.write(src.read())

        roundtripped = pd.read_pickle(p2, compression=None)
        tm.assert_frame_equal(frame, roundtripped)
def test_read_infer(self, ext, get_random_path):
    """read_pickle infers the codec from the file extension alone."""
    if ext == '.xz':
        tm._skip_if_no_lzma()
    raw_path = get_random_path + ".raw"
    compressed_path = get_random_path + ext

    # map the extension back to its compression name
    inferred = None
    for name, extension in self._compression_to_extension.items():
        if extension == ext:
            inferred = name
            break

    with tm.ensure_clean(raw_path) as p1, \
            tm.ensure_clean(compressed_path) as p2:
        frame = tm.makeDataFrame()

        # plain, uncompressed pickle
        frame.to_pickle(p1, compression=None)
        # compress it into the extension-bearing file
        self.compress_file(p1, p2, compression=inferred)

        # no compression argument: inference from the extension
        tm.assert_frame_equal(frame, pd.read_pickle(p2))
def test_reset_index(self):
    """reset_index on a stacked Series: column naming, inplace mode,
    and per-level removal from a MultiIndex."""
    stacked = tm.makeDataFrame()[:5].stack()
    stacked.index.names = ['hash', 'category']
    stacked.name = 'value'
    self.assertIn('value', stacked.reset_index())
    self.assertIn('value2', stacked.reset_index(name='value2'))

    # inplace=True must leave the series equal to the drop=True copy
    expected = stacked.reset_index(drop=True)
    mutated = stacked
    mutated.reset_index(drop=True, inplace=True)
    assert_series_equal(expected, mutated)

    mi = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                    labels=[[0, 0, 0, 0, 0, 0],
                            [0, 1, 2, 0, 1, 2],
                            [0, 1, 0, 1, 0, 1]])
    ser = Series(np.random.randn(6), index=mi)
    self.assertEqual(len(ser.reset_index(level=1).columns), 2)
    remainder = ser.reset_index(level=[0, 2], drop=True)
    self.assertTrue(remainder.index.equals(Index(mi.get_level_values(1))))
    tm.assertIsInstance(remainder, Series)
def test_path_local_path(self, engine, ext):
    """Excel round-trip accepts py.path local paths."""
    frame = tm.makeDataFrame()
    write = partial(frame.to_excel, engine=engine)
    read = partial(pd.read_excel, index_col=0)
    roundtripped = tm.round_trip_pathlib(
        write, read, path="foo.{ext}".format(ext=ext))
    tm.assert_frame_equal(roundtripped, frame)
def test_read_bad_versions(self, protocol, get_random_path):
    """Requesting a pickle protocol above the interpreter's maximum
    raises ValueError (Python 2 tops out at protocol 2)."""
    expected_msg = ("pickle protocol {protocol} asked for; the highest "
                    "available protocol is 2").format(protocol=protocol)
    with tm.assert_raises_regex(ValueError, expected_msg):
        with tm.ensure_clean(get_random_path) as path:
            tm.makeDataFrame().to_pickle(path, protocol=protocol)
def test_keys(self):
    """keys() reports '/'-prefixed absolute paths for every entry."""
    for key, maker in [('a', tm.makeTimeSeries),
                       ('b', tm.makeStringSeries),
                       ('c', tm.makeDataFrame),
                       ('d', tm.makePanel),
                       ('foo/bar', tm.makePanel)]:
        self.store[key] = maker()
    self.assertEquals(len(self.store), 5)
    self.assert_(set(self.store.keys()) ==
                 set(['/a', '/b', '/c', '/d', '/foo/bar']))
def test_comparison_protected_from_errstate(self):
    """DataFrame comparisons must suppress invalid-value FP warnings
    internally, so they succeed even under errstate(invalid='raise').

    Fix: the NaN was previously injected via chained assignment
    (``missing_df.iloc[0]['A'] = np.nan``), which writes into an
    intermediate object and is not guaranteed to mutate the frame
    (SettingWithCopy; a silent no-op under copy-on-write).  A single
    ``.loc`` assignment modifies the frame directly.
    """
    missing_df = tm.makeDataFrame()
    # single-step assignment instead of chained iloc[0]['A'] = ...
    missing_df.loc[missing_df.index[0], 'A'] = np.nan
    with np.errstate(invalid='ignore'):
        expected = missing_df.values < 0
    with np.errstate(invalid='raise'):
        result = (missing_df < 0).values
    tm.assert_numpy_array_equal(result, expected)
def _make_one():
    """Build a consolidated frame mixing float, object, bool and int
    columns."""
    frame = tm.makeDataFrame()
    frame['obj1'] = 'foo'
    frame['obj2'] = 'bar'
    frame['bool1'] = frame['A'] > 0
    frame['bool2'] = frame['B'] > 0
    frame['int1'] = 1
    frame['int2'] = 2
    return frame.consolidate()
def test_noop(self):
    """missing='none' passes endog/exog through untouched, NaNs and
    all."""
    frame = ptesting.makeDataFrame()
    frame.values[[2, 5, 10], [2, 3, 1]] = np.nan
    endog = frame[frame.columns[0]]
    exog = frame[frame.columns[1:]]
    data, _ = sm_data.handle_missing(endog, exog, missing='none')
    # nothing dropped, nothing converted
    ptesting.assert_frame_equal(data['exog'], frame[frame.columns[1:]])
    ptesting.assert_series_equal(data['endog'], frame[frame.columns[0]])
def test_repr(self):
    """repr/str succeed before and after the store is populated."""
    repr(self.store)
    for key, maker in [('a', tm.makeTimeSeries),
                       ('b', tm.makeStringSeries),
                       ('c', tm.makeDataFrame),
                       ('d', tm.makePanel),
                       ('foo/bar', tm.makePanel)]:
        self.store[key] = maker()
    self.store.append('e', tm.makePanel())
    repr(self.store)
    str(self.store)
def test_from_dict_mixed_orient(self):
    """orient='minor' keeps object and float items as distinct dtypes."""
    frame = tm.makeDataFrame()
    frame["foo"] = "bar"
    panel = Panel.from_dict({"k1": frame, "k2": frame}, orient="minor")
    self.assert_(panel["foo"].values.dtype == np.object_)
    self.assert_(panel["A"].values.dtype == np.float64)
def test_pandas_array(self):
    """missing='drop' with (Series endog, ndarray exog) drops NaN
    rows from both."""
    frame = ptesting.makeDataFrame()
    frame.values[[2, 5, 10], [2, 3, 1]] = np.nan
    endog = frame[frame.columns[0]]
    exog = frame[frame.columns[1:]].values
    data, _ = sm_data.handle_missing(endog, exog, missing="drop")
    clean = frame.dropna()
    np.testing.assert_array_equal(data["exog"],
                                  clean[clean.columns[1:]].values)
    ptesting.assert_series_equal(data["endog"], clean[clean.columns[0]])
def test_array_pandas(self):
    """missing='drop' with (ndarray endog, DataFrame exog) drops NaN
    rows from both."""
    frame = ptesting.makeDataFrame()
    frame.values[[2, 5, 10], [2, 3, 1]] = np.nan
    endog = frame[frame.columns[0]].values
    exog = frame[frame.columns[1:]]
    data, _ = sm_data.handle_missing(endog, exog, missing='drop')
    clean = frame.dropna()
    ptesting.assert_frame_equal(data['exog'], clean[clean.columns[1:]])
    np.testing.assert_array_equal(data['endog'],
                                  clean[clean.columns[0]].values)
def test_remove(self):
    """remove() deletes a key and shrinks the store accordingly."""
    series = tm.makeTimeSeries()
    frame = tm.makeDataFrame()
    self.store['a'] = series
    self.store['b'] = frame
    self.store.remove('a')
    self.assertEquals(len(self.store), 1)
    # the remaining entry is intact
    tm.assert_frame_equal(frame, self.store['b'])
    self.store.remove('b')
    self.assertEquals(len(self.store), 0)
def test_store_index_name(self):
    """The index name survives an HDFStore round-trip.

    Fix: the store was constructed inside the ``try`` block, so a
    failing constructor made the ``finally`` clause raise NameError on
    ``store.close()``, masking the original error; the scratch file was
    also removed unconditionally, raising OSError when it never existed.
    """
    df = tm.makeDataFrame()
    df.index.name = 'foo'
    # construct before try so close() in finally is always valid
    store = HDFStore(self.scratchpath)
    try:
        store['frame'] = df
        recons = store['frame']
        assert(recons.index.name == 'foo')
    finally:
        store.close()
        # the scratch file may not exist if the write itself failed
        if os.path.exists(self.scratchpath):
            os.remove(self.scratchpath)
def test_from_dict_mixed_orient(self):
    """orient='minor' preserves per-item dtypes (object vs float)."""
    frame = tm.makeDataFrame()
    frame['foo'] = 'bar'
    mapping = {'k1': frame, 'k2': frame}
    panel = Panel.from_dict(mapping, orient='minor')
    self.assert_(panel['foo'].values.dtype == np.object_)
    self.assert_(panel['A'].values.dtype == np.float64)
def test_describe_percentiles_equivalence(self):
    """The default percentiles are exactly [.25, .75]."""
    frame = tm.makeDataFrame()
    implicit = frame.describe()
    explicit = frame.describe(percentiles=[.25, .75])
    assert_frame_equal(implicit, explicit)
def test_unknown_engine(self):
    """read_csv rejects an unrecognized engine name."""
    with tm.ensure_clean() as path:
        tm.makeDataFrame().to_csv(path)
        with tm.assert_raises_regex(ValueError, 'Unknown engine'):
            read_csv(path, engine='pyt')
df = pd.DataFrame( {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) df["z"] = scalar dgf["z"] = scalar got = dgf.compute().to_pandas() np.testing.assert_array_equal(got["z"], df["z"]) @pytest.mark.parametrize( "func", [ lambda: tm.makeDataFrame().reset_index(), tm.makeDataFrame, tm.makeMixedDataFrame, tm.makeObjectSeries, tm.makeTimeSeries, ], ) def test_repr(func): pdf = func() try: gdf = cudf.from_pandas(pdf) except Exception: raise pytest.xfail() # gddf = dd.from_pandas(gdf, npartitions=3, sort=False) # TODO gddf = dd.from_pandas(gdf, npartitions=3, sort=False)
}), pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=pd.Index([4, 5, 6], name="bar")), pd.Series([1.0, 2.0, 3.0]), pd.Series([1.0, 2.0, 3.0], name="foo"), pd.Series([1.0, 2.0, 3.0], name="foo", index=[4, 5, 6]), pd.Series([1.0, 2.0, 3.0], name="foo", index=pd.Index([4, 5, 6], name="bar")), pd.DataFrame({"x": ["a", "b", "c"]}), pd.DataFrame({"x": [b"a", b"b", b"c"]}), pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=True)}), pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=False)}), tm.makeCategoricalIndex(), tm.makeCustomDataframe(5, 3), tm.makeDataFrame(), tm.makeDateIndex(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeObjectSeries(), tm.makePeriodFrame(), tm.makeRangeIndex(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeUnicodeIndex(), ] @pytest.mark.parametrize("df", dfs) def test_dumps_serialize_numpy(df): header, frames = serialize(df)
def test_len(self):
    """len(store) counts every stored object."""
    for key, maker in [('a', tm.makeTimeSeries),
                       ('b', tm.makeStringSeries),
                       ('c', tm.makeDataFrame),
                       ('d', tm.makePanel)]:
        self.store[key] = maker()
    self.assertEquals(len(self.store), 4)
def test_secondary_legend(self):
    # Legend handling with secondary_y axes: secondary columns are
    # labelled "<name> (right)" unless mark_right=False, the right axis
    # never carries its own legend, and each line keeps its own color.
    import matplotlib.pyplot as plt
    fig = plt.gcf()
    plt.clf()
    ax = fig.add_subplot(211)

    # time-series frame: A and B on the secondary axis
    df = tm.makeTimeDataFrame()
    ax = df.plot(secondary_y=['A', 'B'])
    leg = ax.get_legend()
    self.assert_(len(leg.get_lines()) == 4)
    # secondary columns get the "(right)" suffix by default
    self.assert_(leg.get_texts()[0].get_text() == 'A (right)')
    self.assert_(leg.get_texts()[1].get_text() == 'B (right)')
    self.assert_(leg.get_texts()[2].get_text() == 'C')
    self.assert_(leg.get_texts()[3].get_text() == 'D')
    # the secondary axis must not grow its own legend
    self.assert_(ax.right_ax.get_legend() is None)
    colors = set()
    for line in leg.get_lines():
        colors.add(line.get_color())
    # TODO: color cycle problems
    self.assert_(len(colors) == 4)

    # mark_right=False suppresses the "(right)" suffix
    plt.clf()
    ax = fig.add_subplot(211)
    ax = df.plot(secondary_y=['A', 'C'], mark_right=False)
    leg = ax.get_legend()
    self.assert_(len(leg.get_lines()) == 4)
    self.assert_(leg.get_texts()[0].get_text() == 'A')
    self.assert_(leg.get_texts()[1].get_text() == 'B')
    self.assert_(leg.get_texts()[2].get_text() == 'C')
    self.assert_(leg.get_texts()[3].get_text() == 'D')

    # different secondary columns, fresh frame
    plt.clf()
    ax = fig.add_subplot(211)
    df = tm.makeTimeDataFrame()
    ax = df.plot(secondary_y=['C', 'D'])
    leg = ax.get_legend()
    self.assert_(len(leg.get_lines()) == 4)
    self.assert_(ax.right_ax.get_legend() is None)
    colors = set()
    for line in leg.get_lines():
        colors.add(line.get_color())
    # TODO: color cycle problems
    self.assert_(len(colors) == 4)

    # non-time-series frame: same expectations
    df = tm.makeDataFrame()
    plt.clf()
    ax = fig.add_subplot(211)
    ax = df.plot(secondary_y=['A', 'B'])
    leg = ax.get_legend()
    self.assert_(len(leg.get_lines()) == 4)
    self.assert_(ax.right_ax.get_legend() is None)
    colors = set()
    for line in leg.get_lines():
        colors.add(line.get_color())
    # TODO: color cycle problems
    self.assert_(len(colors) == 4)

    plt.clf()
    ax = fig.add_subplot(211)
    ax = df.plot(secondary_y=['C', 'D'])
    leg = ax.get_legend()
    self.assert_(len(leg.get_lines()) == 4)
    self.assert_(ax.right_ax.get_legend() is None)
    colors = set()
    for line in leg.get_lines():
        colors.add(line.get_color())
    # TODO: color cycle problems
    self.assert_(len(colors) == 4)
def test_read(self, protocol, get_random_path):
    """A frame pickled at any supported protocol reads back equal."""
    with tm.ensure_clean(get_random_path) as path:
        original = tm.makeDataFrame()
        original.to_pickle(path, protocol=protocol)
        reloaded = pd.read_pickle(path)
        tm.assert_frame_equal(original, reloaded)
def make_dask_data_frame(npartitions):
    """Wrap a random pandas test frame in a dask DataFrame with the
    requested number of partitions."""
    source = pandas_tm.makeDataFrame()
    return dd.from_pandas(source, npartitions=npartitions)
def test_path_pathlib(self):
    """Feather round-trip accepts pathlib paths."""
    original = tm.makeDataFrame().reset_index()
    reloaded = tm.round_trip_pathlib(original.to_feather, pd.read_feather)
    tm.assert_frame_equal(original, reloaded)
def test_load_data(client):
    """load_data creates a queryable table with a readable schema."""
    frame = tm.makeDataFrame()
    client.load_data('testing', frame)
    assert client.exists_table('testing')
    assert client.get_schema('testing')
def test_write_explicit_bad(self, compression, get_random_path):
    """to_pickle raises on a compression name it does not know."""
    with tm.assert_raises_regex(ValueError,
                                "Unrecognized compression type"):
        with tm.ensure_clean(get_random_path) as path:
            frame = tm.makeDataFrame()
            frame.to_pickle(path, compression=compression)
def test_describe_percentiles_percent_or_raw(self):
    """Percentiles outside the [0, 1] interval are rejected."""
    frame = tm.makeDataFrame()
    with tm.assertRaises(ValueError):
        frame.describe(percentiles=[10, 50, 100])
def test_describe_quantiles_both(self):
    """percentile_width and percentiles are mutually exclusive."""
    frame = tm.makeDataFrame()
    with tm.assertRaises(ValueError):
        frame.describe(percentile_width=50, percentiles=[25, 75])
def test_unknown_engine(self):
    """pd.read_csv raises for a bogus engine string."""
    with tm.ensure_clean() as path:
        frame = tm.makeDataFrame()
        frame.to_csv(path)
        with pytest.raises(ValueError, match='Unknown engine'):
            pd.read_csv(path, engine='pyt')
def test_pickle_path_localpath():
    """Pickle round-trip accepts py.path.local paths."""
    original = tm.makeDataFrame()
    reloaded = tm.round_trip_localpath(original.to_pickle, pd.read_pickle)
    tm.assert_frame_equal(original, reloaded)
def test_contains(self):
    """__contains__ reflects exactly the stored keys."""
    self.store['a'] = tm.makeTimeSeries()
    self.store['b'] = tm.makeDataFrame()
    for key in ('a', 'b'):
        self.assert_(key in self.store)
    self.assert_('c' not in self.store)
def test_path_localpath(self):
    """msgpack round-trip accepts py.path.local paths."""
    original = tm.makeDataFrame()
    reloaded = tm.round_trip_localpath(original.to_msgpack, read_msgpack)
    tm.assert_frame_equal(original, reloaded)
def test_describe(self):
    """describe() runs on plain, mixed-dtype and time-indexed frames."""
    for maker in (tm.makeDataFrame, tm.makeMixedDataFrame,
                  tm.makeTimeDataFrame):
        maker().describe()
def test_create_table(client):
    """create_table works from a DataFrame and from an explicit schema."""
    frame = tm.makeDataFrame()
    client.create_table('testing', obj=frame)
    assert client.exists_table('testing')
    schema = client.get_schema('testing')
    client.create_table('testingschema', schema=schema)
    assert client.exists_table('testingschema')