def test_int_jsonl(in_type, pd_old_type, pd_new_type):
    """Testing jsonl mapping from pyarrow to Pandas data types.

    The ``my_int`` column of the jsonl fixture is read twice with the same
    schema: once with ``pd_integer=False`` (checked against ``pd_old_type``)
    and once with ``pd_integer=True`` (checked against ``pd_new_type``).

    Args:
        in_type (str): pyarrow data type mapping, as expected to be read in
            from the jsonl. Must be the name of a type factory on ``pa``.
        pd_old_type (str): old pandas data type mapping.
        pd_new_type (str): new pandas data type mapping.
    """
    # The schema field has to name the column the assertions inspect
    # ("my_int"); otherwise ``in_type`` is never applied to that column and
    # the test would not exercise the mapping it is parametrised over.
    schema = pa.schema([("my_int", getattr(pa, in_type)())])
    test_file = "tests/data/int_type.jsonl"

    # NOTE(review): the positional ``False`` is presumably
    # ``expect_full_schema`` -- confirm against the reader's signature.
    df_old = pa_read_json_to_pandas(test_file, schema, False, pd_integer=False)
    assert str(df_old.my_int.dtype) == pd_old_type

    df_new = pa_read_json_to_pandas(test_file, schema, False, pd_integer=True)
    assert str(df_new.my_int.dtype) == pd_new_type
def test_bool_csv_and_json():
    """Check the CSV and JSONL readers give equal frames for the bool fixtures.

    Both fixtures are read with the same three-column schema and
    ``pd_boolean=True``; the resulting frames are compared with
    ``DataFrame.equals``.
    """
    bool_fields = [
        ("i", pa.int8()),
        ("my_bool", pa.bool_()),
        ("my_nullable_bool", pa.bool_()),
    ]
    bool_schema = pa.schema(bool_fields)

    from_csv = pa_read_csv_to_pandas(
        "tests/data/bool_type.csv", bool_schema, pd_boolean=True
    )
    from_jsonl = pa_read_json_to_pandas(
        "tests/data/bool_type.jsonl", bool_schema, pd_boolean=True
    )

    assert from_csv.equals(from_jsonl)
def test_file_reader_works_with_schema():
    """Check both readers agree with and without an explicit schema.

    For each of the CSV and JSONL example files, the frame read without a
    schema must equal the frame read with the schema declared here. Both
    readers are then expected to raise ``ValueError`` when handed a schema
    holding only the ``b``, ``c`` and ``d`` fields.
    """
    csv_path = "tests/data/example_data.csv"
    jsonl_path = "tests/data/example_data.jsonl"

    # CSV: schema-less read vs. explicit schema.
    explicit_csv_schema = pa.schema(
        [("test", pa.string()), ("a_column", pa.string())]
    )
    csv_inferred = pa_read_csv_to_pandas(csv_path)
    csv_typed = pa_read_csv_to_pandas(csv_path, explicit_csv_schema)
    assert_frame_equal(csv_inferred, csv_typed)

    # JSONL: schema-less read vs. explicit schema.
    explicit_json_schema = pa.schema(
        [
            ("a", pa.int64()),
            ("b", pa.float64()),
            ("c", pa.string()),
            ("d", pa.bool_()),
        ]
    )
    json_inferred = pa_read_json_to_pandas(jsonl_path)
    json_typed = pa_read_json_to_pandas(jsonl_path, explicit_json_schema)
    assert_frame_equal(json_inferred, json_typed)

    # Check raises error on both readers
    incomplete_schema = pa.schema(
        [("b", pa.float64()), ("c", pa.string()), ("d", pa.bool_())]
    )
    failing_reads = (
        (pa_read_json_to_pandas, jsonl_path),
        (pa_read_csv_to_pandas, csv_path),
    )
    for reader, path in failing_reads:
        with pytest.raises(ValueError):
            reader(path, incomplete_schema)
def test_timestamps_as_strs():
    """Check datetime columns can be read back as plain strings.

    The expected values come from ``pd.read_csv`` with ``dtype="string"``.
    Both the CSV fixture and its ``.jsonl`` sibling are read with a schema
    declaring ``my_datetime`` as ``pa.string()`` and
    ``expect_full_schema=False``, and must yield the same list of values.
    """
    csv_path = "tests/data/datetime_type.csv"
    jsonl_path = csv_path.replace(".csv", ".jsonl")

    expected_values = pd.read_csv(csv_path, dtype="string")["my_datetime"].to_list()
    string_schema = pa.schema([("my_datetime", pa.string())])

    csv_frame = pa_read_csv_to_pandas(
        csv_path, string_schema, expect_full_schema=False
    )
    assert csv_frame["my_datetime"].to_list() == expected_values

    jsonl_frame = pa_read_json_to_pandas(
        jsonl_path, string_schema, expect_full_schema=False
    )
    assert jsonl_frame["my_datetime"].to_list() == expected_values
def test_decimal_float(arrow_type, pd_type):
    """Check float/decimal arrow types map to the expected pandas dtype.

    Args:
        arrow_type (str): key selecting the arrow type for ``my_decimal``;
            one of ``"float32"``, ``"float64"`` or ``"decimal"``.
        pd_type (str): expected ``str()`` of the resulting pandas dtype.
    """
    arrow_types_by_name = {
        "float32": pa.float32(),
        "float64": pa.float64(),
        "decimal": pa.decimal128(5, 3),
    }
    decimal_field = ("my_decimal", arrow_types_by_name[arrow_type])
    schema = pa.schema([("i", pa.int8()), decimal_field])

    csv_frame = pa_read_csv_to_pandas("tests/data/decimal_type.csv", schema)
    json_frame = pa_read_json_to_pandas("tests/data/decimal_type.jsonl", schema)

    # Each reader must produce the expected dtype on its own ...
    for frame in (csv_frame, json_frame):
        assert str(frame.my_decimal.dtype) == pd_type

    # ... and the two readers must agree on the whole frame.
    assert_frame_equal(csv_frame, json_frame)
def test_pd_to_json(boolean_args, date_args, schema):
    """Round-trip a frame through ``pd_to_json`` and compare with the original.

    Args:
        boolean_args: value passed as ``pd_boolean``, ``pd_integer`` and
            ``pd_string`` to both readers.
        date_args: value passed as ``pd_date_type`` and ``pd_timestamp_type``
            to both readers.
        schema: schema argument handed to both readers.
    """
    # The same conversion options are used for the initial read and the reload.
    reader_options = {
        "pd_boolean": boolean_args,
        "pd_integer": boolean_args,
        "pd_string": boolean_args,
        "pd_date_type": date_args,
        "pd_timestamp_type": date_args,
    }

    original = pa_read_csv_to_pandas(
        "tests/data/all_types.csv", schema, **reader_options
    )

    # Write to StringIO then convert to BytesIO so Arrow can read it
    text_buffer = io.StringIO()
    pd_to_json(original, text_buffer)
    byte_buffer = io.BytesIO(text_buffer.getvalue().encode("utf-8"))

    reloaded = pa_read_json_to_pandas(byte_buffer, schema, **reader_options)

    assert_frame_equal(original, reloaded)
def test_file_reader_returns_df():
    """Check both readers return a ``pd.DataFrame`` for the example files."""
    reads = (
        (pa_read_csv_to_pandas, "tests/data/example_data.csv"),
        (pa_read_json_to_pandas, "tests/data/example_data.jsonl"),
    )
    for reader, path in reads:
        result = reader(path)
        assert isinstance(result, pd.DataFrame)