def test_groupby(self):
    """Checks that StreamingDataFrame.groupby matches pandas' groupby."""
    df20 = dummy_streaming_dataframe(20).to_dataframe()
    df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
    sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
    gr = sdf20.groupby("key", lambda gr: gr.sum())
    gr2 = df20.groupby("key").sum()
    self.assertEqualDataFrame(gr, gr2)
    # the out-of-memory strategy is not implemented yet
    self.assertRaise(lambda: sdf20.groupby(
        "key", in_memory=False), NotImplementedError)

    # Do not replace lambda c:sum(c) by sum or...
    # pandas.core.base.SpecificationError: Function names must be unique,
    # found multiple named sum
    gr2 = df20.groupby("key").agg([numpy.sum, lambda c:sum(c)])
    gr = sdf20.groupby("key", lambda gr: gr.agg(
        [numpy.sum, lambda c:sum(c)]))
    self.assertEqualDataFrame(gr, gr2)

    gr = sdf20.groupby("key", lambda gr: gr.count())
    gr2 = df20.groupby("key").count()
    self.assertEqualDataFrame(gr, gr2)

    df = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7]))
    sdf = StreamingDataFrame.read_df(df)
    # no aggregation function given: the default is sum
    gr = sdf.groupby("A")
    gr2 = df.groupby("A").sum()
    self.assertEqualDataFrame(gr, gr2)
def test_read_csv(self):
    """Round-trips a dataframe through CSV files with and without an index."""
    temp = get_temp_folder(__file__, "temp_read_csv")
    df = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"]))
    name = os.path.join(temp, "df.csv")
    name2 = os.path.join(temp, "df2.csv")
    name3 = os.path.join(temp, "df3.csv")
    df.to_csv(name, index=False)
    df.to_csv(name2, index=True)
    sdf = StreamingDataFrame.read_csv(name)
    text = sdf.to_csv(index=False)
    # chunksize=None or iterator=False would defeat streaming: rejected
    self.assertRaise(
        lambda: StreamingDataFrame.read_csv(
            name2, index_col=0, chunksize=None), ValueError)
    self.assertRaise(
        lambda: StreamingDataFrame.read_csv(
            name2, index_col=0, iterator=False), ValueError)
    sdf2 = StreamingDataFrame.read_csv(name2, index_col=0)
    text2 = sdf2.to_csv(index=True)
    sdf2.to_csv(name3, index=True)
    with open(name, "r", encoding='utf-8') as f:
        exp = f.read()
    with open(name2, "r", encoding='utf-8') as f:
        exp2 = f.read()
    with open(name3, "r", encoding='utf-8') as f:
        text3 = f.read()
    self.assertEqual(text.replace('\r', ''), exp)
    sdf2 = StreamingDataFrame.read_df(df)
    self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe())
    self.assertEqual(text2.replace('\r', ''), exp2)
    # chunked writing may introduce doubled newlines on some platforms
    self.assertEqual(text3.replace('\r', '').replace('\n\n', '\n'),
                     exp2.replace('\r', ''))
def test_read_json_raw(self):
    """read_json on a raw list of dicts requires flatten=True for nested names."""
    data = [{'id': 1,
             'name': {'first': 'Coleen', 'last': 'Volk'}},
            {'name': {'given': 'Mose', 'family': 'Regner'}},
            {'id': 2, 'name': 'FayeRaker'}]
    # expected flattened records; spaces and newlines are stripped so
    # the literal's layout does not matter
    exp = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen",
    "name.given":null,"name.last":"Volk"},
    {"id":null,"name":null,"name.family":"Regner","name.first":null,
    "name.given":"Mose","name.last":null},
    {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null,
    "name.given":null,"name.last":null}]""".replace(" ", "").replace(
        "\n", "")
    # without flatten, nested dicts are not supported
    self.assertRaise(lambda: StreamingDataFrame.read_json(data),
                     NotImplementedError)
    it = StreamingDataFrame.read_json(data, flatten=True)
    dfs = list(it)
    self.assertEqual(len(dfs), 1)
    js = dfs[0].to_json(orient='records')
    js_read = loads(js)
    js_exp = loads(exp)
    self.assertEqual(js_exp, js_read)
def json_to_dataframe_streaming(js, chunksize=100000, flatten=False, **kwargs):
    """
    Converts a big json dump (from @see fn convert_trace_to_json)
    to a dataframe. The function processes the data by streaming
    to avoid loading huge data in memory.
    Returns an iterator on dataframes. The function relies on
    :epkg:`pandas_streaming`.

    :param js: a filename (an existing path) or a stream containing json;
        a raw json string is rejected with :class:`RuntimeError` --
        use ``json_to_dataframe`` for that case
    :param chunksize: see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param flatten: see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param kwargs: see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :return: a :class:`StreamingDataFrame` with an additional column
        ``ts_sec`` holding ``ts`` converted from nanoseconds to seconds
    """
    from pandas_streaming.df import StreamingDataFrame  # pylint: disable=C0415
    if isinstance(js, str):
        # heuristic: only short strings can be filenames
        if len(js) < 5000 and os.path.exists(js):
            # fixed: chunksize, flatten and kwargs were accepted but
            # silently ignored; forward them to read_json
            sdf = StreamingDataFrame.read_json(
                js, chunksize=chunksize, flatten=flatten, **kwargs)
        else:
            raise RuntimeError(
                "Use a stream or function json_to_dataframe instead of "
                "the streaming version.")
    else:
        sdf = StreamingDataFrame.read_json(
            js, chunksize=chunksize, flatten=flatten, **kwargs)
    # 'ts' is assumed to be a timestamp in nanoseconds -- TODO confirm
    # against the producer of the trace
    sdf['ts_sec'] = sdf['ts'].apply(lambda t: t / 1e9)
    return sdf
def test_set_item_function(self):
    """Assigns a new column computed by ``apply`` on an existing column."""
    df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
    # the constructor expects an iterator factory, not a dataframe
    self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
    sdf = StreamingDataFrame.read_df(df)
    sdf['bb'] = sdf['b'].apply(lambda x: x + 11)
    df = sdf.to_df()
    # fixed: removed the duplicated assignment ``ddf = ddf = ...``
    ddf = pandas.DataFrame(
        data=dict(a=[4.5], b=[6], c=[7], bb=[17]))
    self.assertEqualDataFrame(df, ddf)
def test_train_test_split(self):
    """Splits a streaming dataframe and checks both parts rebuild the whole."""
    # NOTE(review): another method with this exact name appears later in
    # this view -- if both live in the same class, one shadows the other;
    # they presumably come from different test files. Verify.
    sdf = dummy_streaming_dataframe(100)
    tr, te = sdf.train_test_split(index=False, streaming=False)
    trsdf = StreamingDataFrame.read_str(tr)
    tesdf = StreamingDataFrame.read_str(te)
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    df_exp = sdf.to_dataframe()
    df_val = pandas.concat([trdf, tedf])
    self.assertEqual(df_exp.shape, df_val.shape)
    # rows are shuffled by the split: compare after sorting
    df_val = df_val.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(df_val, df_exp)
def test_schema_consistant(self):
    """A chunk with different inferred dtypes raises unless check_schema=False."""
    df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"),
                           dict(cf=1, cint=1, cstr="1"),
                           dict(cf=2, cint="s2", cstr="2"),
                           dict(cf=3, cint=3, cstr="3")])
    temp = get_temp_folder(__file__, "temp_schema_consistant")
    name = os.path.join(temp, "df.csv")
    stio = StringIO()
    df.to_csv(stio, index=False)
    self.assertNotEmpty(stio.getvalue())
    df.to_csv(name, index=False)
    self.assertEqual(df.shape, (4, 3))
    # with chunksize=2, the first chunk infers cint as int, the second
    # as str, which is an inconsistent schema
    sdf = StreamingDataFrame.read_csv(name, chunksize=2)
    self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError)
    sdf = StreamingDataFrame.read_csv(
        name, chunksize=2, check_schema=False)
    pieces = list(sdf)
    self.assertEqual(len(pieces), 2)
def test_train_test_split_file_pattern(self):
    """train_test_split with a filename pattern; '{}' placeholder is required."""
    temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
    sdf = dummy_streaming_dataframe(100)
    # a positional placeholder '{0}' is rejected, only '{}' is accepted
    names = os.path.join(temp, "spl_{0}.txt")
    self.assertRaise(lambda: sdf.train_test_split(
        names, index=False, streaming=False), ValueError)
    names = os.path.join(temp, "spl_{}.txt")
    tr, te = sdf.train_test_split(names, index=False, streaming=False)
    trsdf = StreamingDataFrame.read_csv(tr)
    tesdf = StreamingDataFrame.read_csv(te)
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    df_exp = sdf.to_dataframe()
    df_val = pandas.concat([trdf, tedf])
    self.assertEqual(df_exp.shape, df_val.shape)
    df_val = df_val.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(df_val, df_exp)
def test_read_json_ijson(self):
    """Streaming json parsing from an in-memory binary buffer."""
    it = StreamingDataFrame.read_json(
        BytesIO(TestDataFrameIOHelpers.text_json))
    dfs = list(it)
    self.assertEqual(len(dfs), 1)
    js = dfs[0].to_json(orient='records', lines=True)
    # turn line-delimited records back into a json array
    jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']')
    self.assertEqual(jsjson, TestDataFrameIOHelpers.text_json_exp)
def test_read_json_rows(self):
    """read_json with lines=True on newline-delimited records."""
    data = '''{"a": 1, "b": 2}
{"a": 3, "b": 4}'''
    it = StreamingDataFrame.read_json(StringIO(data), lines=True)
    dfs = list(it)
    self.assertEqual(len(dfs), 1)
    js = dfs[0].to_json(orient='records')
    self.assertEqual(js, '[{"a":1,"b":2},{"a":3,"b":4}]')
def test_read_csv_names(self):
    """read_csv with explicit column names and no header row."""
    this = os.path.abspath(os.path.dirname(__file__))
    data = os.path.join(this, "data", "buggy_hash2.csv")
    df = pandas.read_csv(data, sep="\t",
                         names=["A", "B", "C"], header=None)
    sdf = StreamingDataFrame.read_csv(
        data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None)
    head = sdf.head(n=1)
    self.assertEqualDataFrame(df.head(n=1), head)
def test_read_json_rows2(self):
    """read_json with lines='stream' on newline-delimited binary data."""
    data = b'''{"a": 1, "b": 2}
{"a": 3, "b": 4}'''
    # sanity check with plain pandas first
    dfs = pandas.read_json(BytesIO(data), lines=True)
    self.assertEqual(dfs.shape, (2, 2))
    it = StreamingDataFrame.read_json(BytesIO(data), lines="stream")
    dfs = list(it)
    self.assertEqual(len(dfs), 1)
    js = dfs[0].to_json(orient='records')
    self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', js)
def test_train_test_split_file(self):
    """train_test_split writing directly into two explicit files."""
    temp = get_temp_folder(__file__, "temp_train_test_split_file")
    names = [os.path.join(temp, "train.txt"),
             os.path.join(temp, "test.txt")]
    sdf = dummy_streaming_dataframe(100)
    sdf.train_test_split(names, index=False, streaming=False)
    trsdf = StreamingDataFrame.read_csv(names[0])
    tesdf = StreamingDataFrame.read_csv(names[1])
    # both parts should get a substantial share of the 100 rows
    self.assertGreater(trsdf.shape[0], 20)
    self.assertGreater(tesdf.shape[0], 20)
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    self.assertGreater(trdf.shape[0], 20)
    self.assertGreater(tedf.shape[0], 20)
    df_exp = sdf.to_dataframe()
    df_val = pandas.concat([trdf, tedf])
    self.assertEqual(df_exp.shape, df_val.shape)
    df_val = df_val.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(df_val, df_exp)
def test_read_json_rows_file_lines_head(self):
    """head() must be restartable: two calls return the same rows."""
    data = self.abs_path_join(__file__, 'data', 'example.json')
    dfs = pandas.read_json(data, orient='records', lines=True)
    self.assertEqual(dfs.shape, (2, 2))
    it = StreamingDataFrame.read_json(data, lines="stream")
    h1 = it.head()
    h2 = it.head()
    self.assertNotEmpty(h1)
    self.assertNotEmpty(h2)
    self.assertEqualDataFrame(h1, h2)
def test_groupby_streaming(self):
    """groupby_streaming with strategy='streaming' yields partial groups."""
    df20 = dummy_streaming_dataframe(20).to_dataframe()
    df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
    sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
    sgr = sdf20.groupby_streaming(
        "key", lambda gr: gr.sum(), strategy='streaming', as_index=False)
    gr2 = df20.groupby("key", as_index=False).sum()
    grs = list(sgr)
    # partial per-chunk aggregates must be re-aggregated to compare
    gr = pandas.concat(grs).groupby("key", as_index=False).sum()
    self.assertEqualDataFrame(gr, gr2)
def test_read_json_raw_head(self):
    """head() on a flattened raw-list source is restartable."""
    data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
            {'name': {'given': 'Mose', 'family': 'Regner'}},
            {'id': 2, 'name': 'FayeRaker'}]
    it = StreamingDataFrame.read_json(data, flatten=True, chunksize=1)
    h1 = it.head()
    h2 = it.head()
    self.assertEqualDataFrame(h1, h2)
    self.assertGreater(h1.shape[0], 1)
    self.assertGreater(h2.shape[0], 1)
def test_add_column(self):
    """add_column accepts a callable (per row) or a constant value."""
    df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
    sdf = StreamingDataFrame.read_df(df)
    # callable form: one value per row
    sdf2 = sdf.add_column("d", lambda row: int(1))
    df2 = sdf2.to_dataframe()
    df["d"] = 1
    self.assertEqualDataFrame(df, df2)
    # constant form
    sdf3 = StreamingDataFrame.read_df(df)
    sdf4 = sdf3.add_column("dd", 2)
    df4 = sdf4.to_dataframe()
    df["dd"] = 2
    self.assertEqualDataFrame(df, df4)
    # callable reading a previously added column
    sdfA = StreamingDataFrame.read_df(df)
    sdfB = sdfA.add_column("dd12", lambda row: row["dd"] + 10)
    dfB = sdfB.to_dataframe()
    df["dd12"] = 12
    self.assertEqualDataFrame(df, dfB)
def test_read_json_rows2_head(self):
    """head() on a lines='stream' binary source is restartable."""
    data = b'''{"a": 1, "b": 2}
{"a": 3, "b": 4}'''
    dfs = pandas.read_json(BytesIO(data), lines=True)
    self.assertEqual(dfs.shape, (2, 2))
    it = StreamingDataFrame.read_json(BytesIO(data), lines="stream")
    h1 = it.head()
    h2 = it.head()
    self.assertNotEmpty(h1)
    self.assertNotEmpty(h2)
    self.assertEqualDataFrame(h1, h2)
def test_merge_2(self):
    """concat of a streaming dataframe with itself, then an outer merge."""
    df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
    df2 = pandas.concat([df, df])
    sdf = StreamingDataFrame.read_df(df)
    sdf2 = sdf.concat(sdf, axis=0)
    self.assertEqualDataFrame(df2, sdf2.to_dataframe())
    # a second call must give the same result (restartable iterator)
    self.assertEqualDataFrame(df2, sdf2.to_dataframe())
    m = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20]))
    jm = df2.merge(m, left_on="Y", right_on="Y", how="outer")
    sjm = sdf2.merge(m, left_on="Y", right_on="Y", how="outer")
    # row order differs between the two merges: compare after sorting
    self.assertEqualDataFrame(
        jm.sort_values(["X", "Y"]).reset_index(drop=True),
        sjm.to_dataframe().sort_values(
            ["X", "Y"]).reset_index(drop=True))
def test_groupby_cum_asindex(self):
    """strategy='cum' yields cumulated aggregates; the last equals pandas'."""
    df20 = dummy_streaming_dataframe(20).to_dataframe()
    df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
    sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
    sgr = sdf20.groupby_streaming(
        "key", lambda gr: gr.sum(), strategy='cum', as_index=True)
    gr2 = df20.groupby("key", as_index=True).sum()
    lastgr = None
    for gr in sgr:
        self.assertEqual(list(gr.columns), list(gr2.columns))
        lastgr = gr
    # the final cumulated result covers all chunks
    self.assertEqualDataFrame(lastgr, gr2)
def test_read_json_classic_file(self):
    """read_json from an open text file handle with orient='records'."""
    data = self.abs_path_join(__file__, 'data', 'classic.json')
    dfs = pandas.read_json(data, orient='records')
    self.assertEqual(dfs.shape[1], 8)
    self.assertGreater(dfs.shape[0], 2)
    with open(data, "r", encoding="utf-8") as f:
        it = StreamingDataFrame.read_json(f, orient='records')
        h1 = it.to_df()
        h2 = it.to_df()
    self.assertNotEmpty(h1)
    self.assertNotEmpty(h2)
    self.assertEqualDataFrame(h1, h2)
    self.assertEqual(h1.shape[1], 8)
def test_set_item(self):
    """__setitem__ on a StreamingDataFrame: scalars and derived columns."""
    df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
    # the constructor expects an iterator factory, not a dataframe
    self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
    sdf = StreamingDataFrame.read_df(df)

    def f():
        # assigning to a column list is rejected
        sdf[['a']] = 10
    self.assertRaise(f, ValueError)

    def g():
        # assigning a list of values is not implemented
        sdf['a'] = [10]
    self.assertRaise(g, NotImplementedError)

    # scalar assignment creates a constant column
    sdf['aa'] = 10
    df = sdf.to_df()
    ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10]))
    self.assertEqualDataFrame(df, ddf)
    # column arithmetic creates a derived column
    sdf['bb'] = sdf['b'] + 10
    df = sdf.to_df()
    # fixed: removed the duplicated assignment ``ddf = ddf = ...``
    ddf = pandas.DataFrame(
        data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16]))
    self.assertEqualDataFrame(df, ddf)
def test_sort_values_reverse(self):
    """sort_values with ascending=False using a temporary spill file."""
    temp = get_temp_folder(__file__, "temp_sort_values_reverse")
    name = os.path.join(temp, "_data_")
    # rows with missing 'c' or 'ind' exercise NaN handling in the sort
    df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
                           dict(a=5, b="f", c=5.7, ind="a2", ai=2),
                           dict(a=4, b="g", ind="a3", ai=3),
                           dict(a=8, b="h", c=5.9, ai=4),
                           dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
    sdf = StreamingDataFrame.read_df(df, chunksize=2)
    sorted_df = df.sort_values(by="a", ascending=False)
    res = sdf.sort_values(by="a", temp_file=name, ascending=False)
    res_df = res.to_df()
    self.assertEqualDataFrame(sorted_df, res_df)
def test_read_json_classic(self):
    """read_json plus a derived column assigned on the streaming dataframe."""
    data = self.abs_path_join(__file__, 'data', 'classic.json')
    dfs = pandas.read_json(data, orient='records')
    # 'ts' presumably holds nanosecond timestamps -- mirrored below
    dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9)
    self.assertEqual(dfs.shape[1], 9)
    self.assertGreater(dfs.shape[0], 2)
    it = StreamingDataFrame.read_json(data)
    it['ts2'] = it['ts'].apply(lambda t: t / 1e9)
    h1 = it.to_df()
    h2 = it.to_df()
    self.assertNotEmpty(h1)
    self.assertNotEmpty(h2)
    self.assertEqualDataFrame(h1, h2)
    self.assertEqual(h1.shape[1], 9)
def test_train_test_split_streaming_tiny(self):
    """Streaming split on a tiny dataframe: head() must be reproducible."""
    df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))

    sdf2 = StreamingDataFrame.read_df(pandas.concat([df, df]))
    sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
    df1 = sdfte.head()
    df2 = sdfte.head()
    # NOTE(review): 'or' means a (None, DataFrame) pair is still compared
    # and would fail -- presumably intended, but 'and' may have been meant
    if df1 is not None or df2 is not None:
        self.assertEqualDataFrame(df1, df2)
    df1 = sdftr.head()
    df2 = sdftr.head()
    if df1 is not None or df2 is not None:
        self.assertEqualDataFrame(df1, df2)

    # same check when the doubled dataframe is built via concat
    sdf = StreamingDataFrame.read_df(df)
    sdf2 = sdf.concat(sdf, axis=0)
    sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
    df1 = sdfte.head()
    df2 = sdfte.head()
    if df1 is not None or df2 is not None:
        self.assertEqualDataFrame(df1, df2)
    df1 = sdftr.head()
    df2 = sdftr.head()
    if df1 is not None or df2 is not None:
        self.assertEqualDataFrame(df1, df2)
def test_fillna(self):
    """fillna with a per-column value dict; unlisted columns keep NaN."""
    df = pandas.DataFrame(
        data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan]))
    sdf = StreamingDataFrame.read_df(df)
    df2 = pandas.DataFrame(
        data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"]))
    na = sdf.fillna(value=dict(X=10.0, Y="NAN"))
    ndf = na.to_df()
    self.assertEqual(ndf, df2)
    # Y is not in the value dict: its NaN must remain untouched
    df3 = pandas.DataFrame(
        data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan]))
    na = sdf.fillna(value=dict(X=10.0))
    ndf = na.to_df()
    self.assertEqual(ndf, df3)
def test_describe(self):
    """describe() over a large dataframe: numeric columns only, quantiles ok."""
    x = numpy.arange(100001).astype(numpy.float64) / 100000 - 0.5
    y = numpy.arange(100001).astype(numpy.int64)
    z = numpy.array([chr(65 + j % 45) for j in y])
    df = pandas.DataFrame(data=dict(X=x, Y=y, Z=z))
    sdf = StreamingDataFrame.read_df(df)
    desc = sdf.describe()
    # the string column Z is excluded from the summary
    self.assertEqual(['X', 'Y'], list(desc.columns))
    self.assertEqual(desc.loc['min', :].tolist(), [-0.5, 0])
    self.assertEqual(desc.loc['max', :].tolist(), [0.5, 100000])
    self.assertEqualArray(desc.loc['mean', :], numpy.array([0, 50000]))
    self.assertEqualArray(desc.loc['25%', :], numpy.array([-0.25, 25000]))
    self.assertEqualArray(desc.loc['50%', :], numpy.array([0.0, 50000]))
    self.assertEqualArray(desc.loc['75%', :], numpy.array([0.25, 75000]))
    # std is approximate for the streamed computation
    self.assertEqualArray(desc.loc['std', :], numpy.array(
        [2.886795e-01, 28867.946472]), decimal=4)
def test_read_json_file2(self):
    """Flattening of nested dicts and lists during streamed json reading."""
    data = b'''{"a": {"c": 1}, "b": [2, 3]}
{"a": {"a": 3}, "b": [4, 5, "r"]}'''
    obj1 = list(
        enumerate_json_items(BytesIO(data), flatten=False, lines=True))
    obj2 = list(
        enumerate_json_items(BytesIO(data), flatten=True, lines=True))
    self.assertNotEqual(obj1, obj2)
    # flatten joins nested keys with '_' and indexes list elements
    self.assertEqual(obj2, [{
        'a_c': 1, 'b_0': 2, 'b_1': 3
    }, {
        'a_a': 3, 'b_0': 4, 'b_1': 5, 'b_2': 'r'
    }])
    it = StreamingDataFrame.read_json(BytesIO(data), lines="stream",
                                      flatten=True)
    dfs = list(it)
    self.assertEqual(
        ['a_a', 'a_c', 'b_0', 'b_1', 'b_2'],
        list(sorted(dfs[0].columns)),
    )
    self.assertEqual(len(dfs), 1)
    js = dfs[0].to_json(orient='records', lines=True)
    jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']')
    # missing keys become nulls (NaN -> None in json)
    exp = [{
        'a_a': None, 'a_c': 1.0, 'b_0': 2, 'b_1': 3, 'b_2': None
    }, {
        'a_a': 3.0, 'a_c': None, 'b_0': 4, 'b_1': 5, 'b_2': 'r'
    }]
    self.assertEqual(exp, jsjson)
def test_train_test_split(self):
    """Splits a streaming dataframe, checks read_str validation and rebuild."""
    # NOTE(review): a method with this exact name also appears earlier in
    # this view -- if both live in the same class, one shadows the other;
    # they presumably come from different test files. Verify.
    sdf = dummy_streaming_dataframe(100)
    tr, te = sdf.train_test_split(index=False, streaming=False)
    # chunksize=None or iterator=False would defeat streaming: rejected
    self.assertRaise(
        lambda: StreamingDataFrame.read_str(tr, chunksize=None),
        ValueError)
    self.assertRaise(
        lambda: StreamingDataFrame.read_str(tr, iterator=False),
        ValueError)
    # bytes input is accepted as well
    StreamingDataFrame.read_str(tr.encode('utf-8'))
    trsdf = StreamingDataFrame.read_str(tr)
    tesdf = StreamingDataFrame.read_str(te)
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    df_exp = sdf.to_dataframe()
    df_val = pandas.concat([trdf, tedf])
    self.assertEqual(df_exp.shape, df_val.shape)
    df_val = df_val.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(df_val, df_exp)
def test_init(self):
    """A StreamingDataFrame built from another one yields the same data."""
    sdf = dummy_streaming_dataframe(100)
    df1 = sdf.to_df()
    sdf2 = StreamingDataFrame(sdf)
    df2 = sdf2.to_df()
    self.assertEqualDataFrame(df1, df2)