def test_read_csv(self):
    # Round-trips a two-row frame through CSV files (with and without the
    # index column) and checks that StreamingDataFrame reads and writes
    # the same text as pandas does.
    folder = get_temp_folder(__file__, "temp_read_csv")
    frame = pandas.DataFrame(data={"a": [5, 6], "b": ["er", "r"]})
    plain_csv, indexed_csv, rewritten_csv = (
        os.path.join(folder, fname)
        for fname in ("df.csv", "df2.csv", "df3.csv"))
    frame.to_csv(plain_csv, index=False)
    frame.to_csv(indexed_csv, index=True)

    def slurp(path):
        # Return the whole content of a UTF-8 text file.
        with open(path, "r", encoding='utf-8') as handle:
            return handle.read()

    stream = StreamingDataFrame.read_csv(plain_csv)
    dumped = stream.to_csv(index=False)

    # Both options below are expected to be refused with a ValueError.
    for rejected in ({"chunksize": None}, {"iterator": False}):
        self.assertRaise(
            lambda opts=rejected: StreamingDataFrame.read_csv(
                indexed_csv, index_col=0, **opts),
            ValueError)

    indexed_stream = StreamingDataFrame.read_csv(indexed_csv, index_col=0)
    dumped_indexed = indexed_stream.to_csv(index=True)
    indexed_stream.to_csv(rewritten_csv, index=True)

    expected_plain = slurp(plain_csv)
    expected_indexed = slurp(indexed_csv)
    rewritten = slurp(rewritten_csv)

    self.assertEqual(dumped.replace('\r', ''), expected_plain)

    # A stream built directly from the frame must match the one read
    # from the CSV file.
    from_frame = StreamingDataFrame.read_df(frame)
    self.assertEqualDataFrame(
        stream.to_dataframe(), from_frame.to_dataframe())

    self.assertEqual(dumped_indexed.replace('\r', ''), expected_indexed)
    # Carriage returns and doubled line breaks are normalised before
    # comparing the file written by the streaming frame.
    normalised = rewritten.replace('\r', '').replace('\n\n', '\n')
    self.assertEqual(normalised, expected_indexed.replace('\r', ''))
def test_schema_consistant(self):
    # The third row stores a string in the otherwise integer column
    # "cint"; reading the file in chunks of two rows is expected to raise
    # StreamingDataFrameSchemaError unless check_schema=False is given.
    rows = [
        {"cf": 0, "cint": 0, "cstr": "0"},
        {"cf": 1, "cint": 1, "cstr": "1"},
        {"cf": 2, "cint": "s2", "cstr": "2"},
        {"cf": 3, "cint": 3, "cstr": "3"},
    ]
    frame = pandas.DataFrame(rows)
    folder = get_temp_folder(__file__, "temp_schema_consistant")
    csv_path = os.path.join(folder, "df.csv")

    # Sanity check: serialising to an in-memory buffer produces text.
    buffer = StringIO()
    frame.to_csv(buffer, index=False)
    self.assertNotEmpty(buffer.getvalue())

    frame.to_csv(csv_path, index=False)
    self.assertEqual(frame.shape, (4, 3))

    checked = StreamingDataFrame.read_csv(csv_path, chunksize=2)
    self.assertRaise(lambda: list(checked), StreamingDataFrameSchemaError)

    unchecked = StreamingDataFrame.read_csv(
        csv_path, chunksize=2, check_schema=False)
    chunks = list(unchecked)
    self.assertEqual(len(chunks), 2)
def test_train_test_split_file_pattern(self):
    # Splits a 100-row dummy stream into two files named from a pattern
    # and checks that the two parts together rebuild the original data.
    folder = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
    stream = dummy_streaming_dataframe(100)

    # A pattern with an explicit positional index is expected to be
    # refused with a ValueError.
    indexed_pattern = os.path.join(folder, "spl_{0}.txt")
    self.assertRaise(
        lambda: stream.train_test_split(
            indexed_pattern, index=False, streaming=False),
        ValueError)

    pattern = os.path.join(folder, "spl_{}.txt")
    train_file, test_file = stream.train_test_split(
        pattern, index=False, streaming=False)

    train_frame = StreamingDataFrame.read_csv(train_file).to_dataframe()
    test_frame = StreamingDataFrame.read_csv(test_file).to_dataframe()

    expected = stream.to_dataframe()
    merged = pandas.concat([train_frame, test_frame])
    self.assertEqual(expected.shape, merged.shape)

    # Reorder the merged rows by "cint" before comparing them with the
    # original frame.
    merged = merged.sort_values("cint")
    merged = merged.reset_index(drop=True)
    self.assertEqualDataFrame(merged, expected)
def test_read_csv_names(self):
    # Reads a tab-separated file without header using explicit column
    # names, and checks that the first row of the streaming frame matches
    # the first row read by pandas.
    here = os.path.abspath(os.path.dirname(__file__))
    csv_path = os.path.join(here, "data", "buggy_hash2.csv")

    # Parser options shared by the pandas and the streaming readers.
    parser_options = {"sep": "\t", "names": ["A", "B", "C"], "header": None}

    reference = pandas.read_csv(csv_path, **parser_options)
    stream = StreamingDataFrame.read_csv(
        csv_path, chunksize=2, **parser_options)

    first_row = stream.head(n=1)
    self.assertEqualDataFrame(reference.head(n=1), first_row)
def test_train_test_split_file(self):
    # Splits a 100-row dummy stream into two explicitly named files and
    # checks that each part holds more than 20 rows and that both parts
    # together rebuild the original data.
    folder = get_temp_folder(__file__, "temp_train_test_split_file")
    targets = [os.path.join(folder, fname)
               for fname in ("train.txt", "test.txt")]
    stream = dummy_streaming_dataframe(100)
    stream.train_test_split(targets, index=False, streaming=False)

    # Train part first, test part second, as in ``targets``.
    parts = [StreamingDataFrame.read_csv(path) for path in targets]
    for part in parts:
        self.assertGreater(part.shape[0], 20)

    frames = [part.to_dataframe() for part in parts]
    for frame in frames:
        self.assertGreater(frame.shape[0], 20)

    expected = stream.to_dataframe()
    merged = pandas.concat(frames)
    self.assertEqual(expected.shape, merged.shape)

    # Reorder the merged rows by "cint" before comparing them with the
    # original frame.
    merged = merged.sort_values("cint")
    merged = merged.reset_index(drop=True)
    self.assertEqualDataFrame(merged, expected)