def test_int_jsonl(in_type, pd_old_type, pd_new_type):
    """Check the pandas dtype of ``my_int`` when reading the int jsonl file.

    The file is read twice with the same schema, first with
    ``pd_integer=False`` and then with ``pd_integer=True``. After each read
    the dtype of the ``my_int`` column, rendered as a string, is compared
    with the expected value for that mode.

    Args:
        in_type (str): name of a pyarrow type factory; it is looked up on
            ``pa`` with ``getattr`` and called with no arguments.
        pd_old_type (str): expected dtype string when ``pd_integer=False``.
        pd_new_type (str): expected dtype string when ``pd_integer=True``.
    """
    # NOTE(review): the schema field is named "int_col" while the assertions
    # read "my_int" -- confirm this mismatch is intentional.
    arrow_type = getattr(pa, in_type)()
    schema = pa.schema([("int_col", arrow_type)])
    jsonl_path = "tests/data/int_type.jsonl"

    expectations = ((False, pd_old_type), (True, pd_new_type))
    for use_pd_integer, expected_dtype in expectations:
        # The third positional argument is False, as in the original call
        # (presumably expect_full_schema -- TODO confirm against the reader).
        frame = pa_read_json_to_pandas(
            jsonl_path, schema, False, pd_integer=use_pd_integer
        )
        assert str(frame.my_int.dtype) == expected_dtype
# Code example #2
# 0
def test_bool_csv_and_json():
    """Check that the CSV and jsonl readers agree on the boolean test data.

    Both files are read with the same three-column schema and
    ``pd_boolean=True``; the two results are then compared with ``.equals``.
    """
    fields = [
        ("i", pa.int8()),
        ("my_bool", pa.bool_()),
        ("my_nullable_bool", pa.bool_()),
    ]
    schema = pa.schema(fields)

    from_csv = pa_read_csv_to_pandas(
        "tests/data/bool_type.csv", schema, pd_boolean=True
    )
    from_jsonl = pa_read_json_to_pandas(
        "tests/data/bool_type.jsonl", schema, pd_boolean=True
    )

    assert from_csv.equals(from_jsonl)
# Code example #3
# 0
def test_file_reader_works_with_schema():
    """Check both readers without a schema, with one, and with a bad one.

    For the example CSV and jsonl files, reading with an explicit schema must
    give the same frame as reading with no schema. A schema that lists only
    the ``b``, ``c`` and ``d`` columns must make both readers raise
    ``ValueError``.
    """
    csv_path = "tests/data/example_data.csv"
    jsonl_path = "tests/data/example_data.jsonl"

    # CSV: schema-less read versus read with an explicit schema.
    csv_schema = pa.schema([("test", pa.string()), ("a_column", pa.string())])
    csv_plain = pa_read_csv_to_pandas(csv_path)
    csv_with_schema = pa_read_csv_to_pandas(csv_path, csv_schema)
    assert_frame_equal(csv_plain, csv_with_schema)

    # jsonl: the same comparison.
    json_fields = [
        ("a", pa.int64()),
        ("b", pa.float64()),
        ("c", pa.string()),
        ("d", pa.bool_()),
    ]
    json_schema = pa.schema(json_fields)
    json_plain = pa_read_json_to_pandas(jsonl_path)
    json_with_schema = pa_read_json_to_pandas(jsonl_path, json_schema)
    assert_frame_equal(json_plain, json_with_schema)

    # Check raises error on both readers
    missing_schema = pa.schema(
        [("b", pa.float64()), ("c", pa.string()), ("d", pa.bool_())]
    )
    with pytest.raises(ValueError):
        pa_read_json_to_pandas(jsonl_path, missing_schema)
    with pytest.raises(ValueError):
        pa_read_csv_to_pandas(csv_path, missing_schema)
def test_timestamps_as_strs():
    """Check that ``my_datetime`` can be read back as plain strings.

    The expected values come from ``pd.read_csv`` with ``dtype="string"``.
    The CSV file and its ``.jsonl`` sibling are each read with a schema that
    declares ``my_datetime`` as ``pa.string()`` and with
    ``expect_full_schema=False``; the column values must match the expected
    list in both cases.
    """
    csv_path = "tests/data/datetime_type.csv"
    jsonl_path = csv_path.replace(".csv", ".jsonl")

    expected = pd.read_csv(csv_path, dtype="string")["my_datetime"].to_list()
    schema = pa.schema([("my_datetime", pa.string())])

    from_csv = pa_read_csv_to_pandas(
        csv_path, schema, expect_full_schema=False
    )
    assert from_csv["my_datetime"].to_list() == expected

    from_jsonl = pa_read_json_to_pandas(
        jsonl_path, schema, expect_full_schema=False
    )
    assert from_jsonl["my_datetime"].to_list() == expected
# Code example #5
# 0
def test_decimal_float(arrow_type, pd_type):
    """Check the pandas dtype of ``my_decimal`` from the CSV and jsonl readers.

    Args:
        arrow_type (str): one of ``"float32"``, ``"float64"`` or
            ``"decimal"``; selects the arrow type used for ``my_decimal``
            in the schema.
        pd_type (str): expected dtype string of ``my_decimal`` in both frames.
    """
    arrow_types_by_name = dict(
        float32=pa.float32(),
        float64=pa.float64(),
        decimal=pa.decimal128(5, 3),
    )
    value_type = arrow_types_by_name[arrow_type]
    schema = pa.schema([("i", pa.int8()), ("my_decimal", value_type)])

    from_csv = pa_read_csv_to_pandas("tests/data/decimal_type.csv", schema)
    from_json = pa_read_json_to_pandas("tests/data/decimal_type.jsonl", schema)

    # Each reader must yield the expected dtype on its own ...
    for frame in (from_csv, from_json):
        assert str(frame.my_decimal.dtype) == pd_type

    # ... and the two frames must also match each other.
    assert_frame_equal(from_csv, from_json)
# Code example #6
# 0
def test_pd_to_json(boolean_args, date_args, schema):
    """Round-trip the all-types data through ``pd_to_json`` and the reader.

    The CSV file is read into a frame, written out with ``pd_to_json``, read
    back with the jsonl reader using the same options, and the two frames
    are compared with ``assert_frame_equal``.

    Args:
        boolean_args: value passed as ``pd_boolean``, ``pd_integer`` and
            ``pd_string`` to both readers.
        date_args: value passed as ``pd_date_type`` and
            ``pd_timestamp_type`` to both readers.
        schema: schema handed to both readers.
    """
    # Both reads must use exactly the same conversion options.
    reader_options = {
        "pd_boolean": boolean_args,
        "pd_integer": boolean_args,
        "pd_string": boolean_args,
        "pd_date_type": date_args,
        "pd_timestamp_type": date_args,
    }

    source_df = pa_read_csv_to_pandas(
        "tests/data/all_types.csv", schema, **reader_options
    )

    # Write to StringIO then convert to BytesIO so Arrow can read it
    text_buffer = io.StringIO()
    pd_to_json(source_df, text_buffer)
    byte_buffer = io.BytesIO(text_buffer.getvalue().encode("utf-8"))

    round_tripped = pa_read_json_to_pandas(
        byte_buffer, schema, **reader_options
    )
    assert_frame_equal(source_df, round_tripped)
# Code example #7
# 0
def test_file_reader_returns_df():
    """Check that both readers return a ``pd.DataFrame`` for the example data.

    The CSV reader is exercised first, then the jsonl reader; neither is
    given a schema.
    """
    cases = (
        (pa_read_csv_to_pandas, "tests/data/example_data.csv"),
        (pa_read_json_to_pandas, "tests/data/example_data.jsonl"),
    )
    for reader, path in cases:
        assert isinstance(reader(path), pd.DataFrame)