def test_groupby_streaming(self):
     """Streaming groupby pieces, once re-aggregated, match a plain groupby.

     The frame built from ``dummy_streaming_dataframe(20)`` gets a
     boolean ``key`` column (``cint`` divisible by 3) and is read back
     in chunks of 5.  The pieces yielded by ``groupby_streaming`` with
     ``strategy='streaming'`` are concatenated and summed per key once
     more, then compared with the sums pandas computes on the whole
     frame.
     """
     frame = dummy_streaming_dataframe(20).to_dataframe()
     frame["key"] = frame["cint"].apply(lambda value: value % 3 == 0)
     stream = StreamingDataFrame.read_df(frame, chunksize=5)
     streamed_groups = stream.groupby_streaming(
         "key", lambda grp: grp.sum(), strategy='streaming', as_index=False)
     expected = frame.groupby("key", as_index=False).sum()
     pieces = list(streamed_groups)
     stacked = pandas.concat(pieces)
     combined = stacked.groupby("key", as_index=False).sum()
     self.assertEqualDataFrame(combined, expected)
 def test_train_test_split_roundtrip(self):
     """Split into two in-memory texts and check that no row is lost.

     This method used to be named ``test_train_test_split``, the same
     name as a later method in this file; the later definition replaced
     this one, so these assertions never ran.  A distinct name lets
     both tests be collected.

     ``train_test_split(index=False, streaming=False)`` returns two
     values that ``StreamingDataFrame.read_str`` can parse.  Their
     rows, concatenated and sorted by ``cint``, must equal the
     original data.
     """
     sdf = dummy_streaming_dataframe(100)
     train_text, test_text = sdf.train_test_split(
         index=False, streaming=False)
     train_df = StreamingDataFrame.read_str(train_text).to_dataframe()
     test_df = StreamingDataFrame.read_str(test_text).to_dataframe()
     expected = sdf.to_dataframe()
     rebuilt = pandas.concat([train_df, test_df])
     self.assertEqual(expected.shape, rebuilt.shape)
     # Rows come back grouped by partition; sort them to restore the
     # original order before comparing.
     rebuilt = rebuilt.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(rebuilt, expected)
 def test_groupby_cum_asindex(self):
     """Cumulative streaming groupby ends on the full-frame result.

     With ``strategy='cum'`` and ``as_index=True`` every yielded frame
     must expose the same columns as the plain pandas groupby, and the
     last yielded frame must equal it.
     """
     frame = dummy_streaming_dataframe(20).to_dataframe()
     frame["key"] = frame["cint"].apply(lambda value: value % 3 == 0)
     stream = StreamingDataFrame.read_df(frame, chunksize=5)
     cumulated = stream.groupby_streaming(
         "key", lambda grp: grp.sum(), strategy='cum', as_index=True)
     expected = frame.groupby("key", as_index=True).sum()
     expected_columns = list(expected.columns)
     final = None
     for final in cumulated:
         self.assertEqual(list(final.columns), expected_columns)
     self.assertEqualDataFrame(final, expected)
 def test_sample_reservoir_cache(self):
     """Cached reservoir sampling is repeatable; invalid options raise.

     Two materialisations of the same cached sample must be equal and
     hold 10 rows.  ``ValueError`` is expected when the reservoir is
     requested with ``cache=False`` or with ``frac`` instead of ``n``.
     """
     sdf = dummy_streaming_dataframe(100)
     sampled = sdf.sample(n=10, cache=True, reservoir=True)
     first = sampled.to_df()
     second = sampled.to_df()
     self.assertEqualDataFrame(first, second)
     expected_shape = (10, sampled.shape[1])
     self.assertEqual(first.shape, expected_shape)

     def without_cache():
         return sdf.sample(n=10, cache=False, reservoir=True)

     def with_fraction():
         return sdf.sample(frac=0.1, cache=True, reservoir=True)

     self.assertRaise(without_cache, ValueError)
     self.assertRaise(with_fraction, ValueError)
    def test_concatv(self):
        """Row-wise ``concat`` accepts several input kinds and validates them.

        The streaming concatenation along ``axis=0`` is compared with
        ``pandas.concat`` for a StreamingDataFrame, a DataFrame and an
        iterator of DataFrames (checked twice).  A ``ValueError`` is
        expected when the other frame has different column names or
        different column types.
        """
        sdf20 = dummy_streaming_dataframe(20)
        sdf30 = dummy_streaming_dataframe(30)
        df20 = sdf20.to_dataframe()
        df30 = sdf30.to_dataframe()
        expected = pandas.concat([df20, df30], axis=0)

        # Factories, so that each iterator input is created fresh right
        # before it is consumed.
        make_others = (
            lambda: sdf30,
            lambda: df30,
            lambda: map(lambda x: x, [df30]),
            lambda: map(lambda x: x, [df30]),
        )
        for make_other in make_others:
            merged = sdf20.concat(make_other(), axis=0)
            self.assertEqualDataFrame(merged.to_dataframe(), expected)

        # An extra column on the other frame must be rejected.
        df30["g"] = 4

        def concat_extra_column():
            return sdf20.concat(df30).to_dataframe()

        self.assertRaise(
            concat_extra_column, ValueError,
            "Frame others[0] do not have the same column names")

        # Same column names but a different dtype must be rejected too.
        df20["cint"] = df20["cint"].astype(float)

        def concat_other_dtype():
            return sdf20.concat(df20).to_dataframe()

        self.assertRaise(
            concat_other_dtype, ValueError,
            "Frame others[0] do not have the same column types")
 def test_where(self):
     """``where`` blanks out non-matching rows in the CSV output.

     Also checks the column names and the number of dtypes.  The
     filter and CSV export are run twice on the same streaming frame
     and must give the same leading text both times.
     """
     sdf = dummy_streaming_dataframe(100)
     self.assertEqual(list(sdf.columns), ['cint', 'cstr'])
     self.assertEqual(len(sdf.dtypes), 2)
     expected_prefix = ",cint,cstr\n0,,\n1,1.0,s1"
     for _ in range(2):
         filtered = sdf.where(lambda row: row["cint"] == 1)
         text = filtered.to_csv()
         # Strip carriage returns so the check does not depend on the
         # line terminator.
         self.assertStartsWith(expected_prefix, text.replace('\r', ''))
    def test_merge(self):
        """Streaming ``merge`` on ``cint`` matches the pandas merge.

        Three pairs of streaming frames are compared (a frame with
        itself, 20 rows against 30, 30 rows against 20), each one twice
        in a row per entry of ``hows``.  A final call merges a
        streaming frame with a plain DataFrame without inspecting the
        result.
        """
        def check_merge(left, right, how):
            # NOTE(review): ``how`` is accepted but never forwarded to
            # either merge call below, so every call exercises the
            # default join type only.  Confirm whether that is intended
            # before passing it on.
            streamed = left.merge(right, on="cint", indicator=True)
            got = streamed.to_dataframe()
            left_df = left.to_dataframe()
            right_df = right.to_dataframe()
            expected = left_df.merge(right_df, on="cint", indicator=True)
            self.assertEqualDataFrame(got.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        sdf20 = dummy_streaming_dataframe(20)
        sdf30 = dummy_streaming_dataframe(30)
        hows = "inner left right outer".split()
        # First pair merges a frame with itself.
        pairs = [(sdf20, sdf20), (sdf20, sdf30), (sdf30, sdf20)]
        for left, right in pairs:
            for how in hows:
                check_merge(left, right, how)
                check_merge(left, right, how)
        sdf20.merge(sdf20.to_dataframe(), on="cint", indicator=True)
 def test_train_test_split_file_pattern(self):
     """Split into files named from a pattern and read them back.

     The pattern ``spl_{0}.txt`` is expected to raise ``ValueError``
     while ``spl_{}.txt`` is accepted.  The two returned values are
     read with ``read_csv``; their rows, concatenated and sorted by
     ``cint``, must equal the original data.
     """
     temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
     sdf = dummy_streaming_dataframe(100)
     rejected_pattern = os.path.join(temp, "spl_{0}.txt")

     def split_with_rejected_pattern():
         return sdf.train_test_split(
             rejected_pattern, index=False, streaming=False)

     self.assertRaise(split_with_rejected_pattern, ValueError)

     pattern = os.path.join(temp, "spl_{}.txt")
     train_name, test_name = sdf.train_test_split(
         pattern, index=False, streaming=False)
     train_df = StreamingDataFrame.read_csv(train_name).to_dataframe()
     test_df = StreamingDataFrame.read_csv(test_name).to_dataframe()
     expected = sdf.to_dataframe()
     rebuilt = pandas.concat([train_df, test_df])
     self.assertEqual(expected.shape, rebuilt.shape)
     rebuilt = rebuilt.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(rebuilt, expected)
 def test_train_test_split_file(self):
     """Split into two explicitly named files and read them back.

     Both files must hold more than 20 rows, whether counted on the
     streaming frame or on the materialised DataFrame, and together
     they must rebuild the original data once sorted by ``cint``.
     """
     temp = get_temp_folder(__file__, "temp_train_test_split_file")
     train_path = os.path.join(temp, "train.txt")
     test_path = os.path.join(temp, "test.txt")
     names = [train_path, test_path]
     sdf = dummy_streaming_dataframe(100)
     sdf.train_test_split(names, index=False, streaming=False)
     train_stream = StreamingDataFrame.read_csv(names[0])
     test_stream = StreamingDataFrame.read_csv(names[1])
     self.assertGreater(train_stream.shape[0], 20)
     self.assertGreater(test_stream.shape[0], 20)
     train_df = train_stream.to_dataframe()
     test_df = test_stream.to_dataframe()
     self.assertGreater(train_df.shape[0], 20)
     self.assertGreater(test_df.shape[0], 20)
     expected = sdf.to_dataframe()
     rebuilt = pandas.concat([train_df, test_df])
     self.assertEqual(expected.shape, rebuilt.shape)
     rebuilt = rebuilt.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(rebuilt, expected)
 def test_train_test_split(self):
     """Non-streaming split returns texts that ``read_str`` can parse.

     ``read_str`` is expected to raise ``ValueError`` for
     ``chunksize=None`` and for ``iterator=False``, and is also called
     on UTF-8 encoded bytes.  The two partitions, concatenated and
     sorted by ``cint``, must equal the original data.
     """
     sdf = dummy_streaming_dataframe(100)
     train_text, test_text = sdf.train_test_split(
         index=False, streaming=False)

     def read_without_chunksize():
         return StreamingDataFrame.read_str(train_text, chunksize=None)

     def read_without_iterator():
         return StreamingDataFrame.read_str(train_text, iterator=False)

     self.assertRaise(read_without_chunksize, ValueError)
     self.assertRaise(read_without_iterator, ValueError)
     # Result is discarded: only the call itself is exercised.
     StreamingDataFrame.read_str(train_text.encode('utf-8'))

     train_df = StreamingDataFrame.read_str(train_text).to_dataframe()
     test_df = StreamingDataFrame.read_str(test_text).to_dataframe()
     expected = sdf.to_dataframe()
     rebuilt = pandas.concat([train_df, test_df])
     self.assertEqual(expected.shape, rebuilt.shape)
     rebuilt = rebuilt.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(rebuilt, expected)
 def test_train_test_split_streaming(self):
     """Streaming split with ``partitions=[0.7, 0.3]`` is stable.

     The two partitions are materialised twice.  Each time they must
     rebuild the original data once sorted by ``cfloat``; both passes
     must give the same shapes, and the first partition must be the
     larger one.
     """
     sdf = dummy_streaming_dataframe(100, asfloat=True)
     train_stream, test_stream = sdf.train_test_split(
         streaming=True, unique_rows=True, partitions=[0.7, 0.3])

     def assert_rebuilds(train_part, test_part, expected):
         rebuilt = pandas.concat([train_part, test_part])
         self.assertEqual(expected.shape, rebuilt.shape)
         rebuilt = rebuilt.sort_values("cfloat").reset_index(drop=True)
         self.assertEqualDataFrame(rebuilt, expected)

     train_first = train_stream.to_dataframe()
     test_first = test_stream.to_dataframe()
     df_exp = sdf.to_dataframe()
     assert_rebuilds(train_first, test_first, df_exp)

     # Second pass over the same streaming partitions.
     train_second = train_stream.to_dataframe()
     test_second = test_stream.to_dataframe()
     assert_rebuilds(train_second, test_second, df_exp)

     self.assertEqual(train_first.shape, train_second.shape)
     self.assertEqual(test_first.shape, test_second.shape)
     self.assertGreater(train_first.shape[0], test_first.shape[0])
     self.assertGreater(train_second.shape[0], test_second.shape[0])
 def test_train_test_split_streaming_strat(self):
     """Streaming split stratified on ``tify`` keeps both labels in each part.

     The partitions are materialised twice and must rebuild the
     original data each time once sorted by ``cfloat``.  Every
     (partition, label) group must then count more than 4 rows.
     """
     labels = ["t1" if i % 3 else "t0" for i in range(100)]
     sdf = dummy_streaming_dataframe(100, asfloat=True, tify=labels)
     train_stream, test_stream = sdf.train_test_split(
         streaming=True, unique_rows=True, stratify="tify")

     def assert_rebuilds(train_part, test_part, expected):
         rebuilt = pandas.concat([train_part, test_part])
         self.assertEqual(expected.shape, rebuilt.shape)
         rebuilt = rebuilt.sort_values("cfloat").reset_index(drop=True)
         self.assertEqualDataFrame(rebuilt, expected)

     trdf = train_stream.to_dataframe()
     tedf = test_stream.to_dataframe()
     df_exp = sdf.to_dataframe()
     assert_rebuilds(trdf, tedf, df_exp)

     # Second pass; the group counts below use these frames.
     trdf = train_stream.to_dataframe()
     tedf = test_stream.to_dataframe()
     assert_rebuilds(trdf, tedf, df_exp)

     train_counts = trdf.groupby("tify").count()
     train_counts["part"] = 0
     test_counts = tedf.groupby("tify").count()
     test_counts["part"] = 1
     counts = pandas.concat([train_counts, test_counts])
     self.assertGreater(counts['cfloat'].min(), 4)
 def test_init(self):
     """A StreamingDataFrame built from another one yields the same data."""
     original = dummy_streaming_dataframe(100)
     expected = original.to_df()
     wrapped = StreamingDataFrame(original)
     self.assertEqualDataFrame(expected, wrapped.to_df())
 def test_tail(self):
     """``tail`` returns 5 rows by default and 10 when asked for 20."""
     sdf = dummy_streaming_dataframe(100)
     default_tail = sdf.tail()
     self.assertEqual(default_tail.shape, (5, 2))
     # NOTE(review): only 10 rows come back for n=20; presumably tail
     # is limited to the last chunk -- confirm against the
     # StreamingDataFrame implementation.
     longer_tail = sdf.tail(n=20)
     self.assertEqual(longer_tail.shape, (10, 2))
 def test_iterrows(self):
     """``iterrows`` yields one item per row, on two successive passes."""
     sdf = dummy_streaming_dataframe(100)
     for _ in range(2):
         rows = list(sdf.iterrows())
         self.assertEqual(sdf.shape[0], len(rows))
 def test_to_csv(self):
     """``to_csv`` returns the same leading text on two successive calls."""
     sdf = dummy_streaming_dataframe(100)
     expected_prefix = ",cint,cstr\n0,0,s0"
     for _ in range(2):
         text = sdf.to_csv()
         # Strip carriage returns so the check does not depend on the
         # line terminator.
         self.assertStartsWith(expected_prefix, text.replace('\r', ''))
 def test_dataframe(self):
     """``to_dataframe`` materialises the 100 dummy rows over 2 columns."""
     frame = dummy_streaming_dataframe(100).to_dataframe()
     self.assertEqual(frame.shape, (100, 2))