Example #1
0
def test_batch_read_dataframes_from_json_of_orients(df, orient):
    """Round-trip a batch of identical JSON payloads of a given orient.

    Serializes `df` three times, merges via read_dataframes_from_json_n_csv,
    and checks that each returned slice equals the source frame.
    """
    test_datas = [df.to_json(orient=orient).encode()] * 3
    test_types = ['application/json'] * 3
    # BUG FIX: the original invoked read_dataframes_from_json_n_csv twice
    # in a row and discarded the first result; a single call suffices.
    df_merged, slices = read_dataframes_from_json_n_csv(test_datas, test_types, orient)
    for s in slices:
        assert_df_equal(df_merged[s], df)
Example #2
0
def test_benchmark_load_dataframes():
    '''
    read_dataframes_from_json_n_csv should be at least 5x faster than
    pd.read_json + pd.concat.

    NOTE(fix): the docstring previously claimed "30x" while the assertion
    below only checks a 5x ratio; the text now matches the asserted bound.
    '''
    test_count = 50

    dfs = [pd.DataFrame(np.random.rand(10, 100)) for _ in range(test_count)]
    inputs = [df.to_json().encode() for df in dfs]

    # Baseline: parse each payload with pandas, then concatenate.
    time_st = time.time()
    dfs = [pd.read_json(i) for i in inputs]
    result1 = pd.concat(dfs)
    time1 = time.time() - time_st

    # Candidate: one batched call over all payloads.
    time_st = time.time()
    result2, _ = read_dataframes_from_json_n_csv(
        inputs, itertools.repeat('json'), 'columns'
    )

    time2 = time.time() - time_st
    assert_df_equal(result1, result2)

    # 5 is just an estimate on the smaller end, which should be true for most
    # development machines and Github actions CI environment, the actual ratio depends
    # on the hardware and available computing resource
    assert time1 / time2 > 5
def test_batch_read_dataframes_from_json_n_csv():
    """Decode a mixed batch of JSON and CSV payloads for several edge-case
    frames (floats, strings, NaN variants) and compare slice-by-slice.
    """
    for df in (
            pd.DataFrame(np.random.rand(2, 3)),
            pd.DataFrame(["str1", "str2", "str3"]),  # single dim string array
            pd.DataFrame([np.nan]),  # special values
            pd.DataFrame([math.nan]),  # special values
            pd.DataFrame([" "]),  # special values
            # pd.DataFrame([""]),  # TODO: -> NaN
    ):
        # Renamed from `csv_str`: this is the JSON serialization of df.
        json_str = df.to_json()
        list_str = json.dumps(df.to_numpy().tolist())
        test_datas = ([json_str.encode()] * 20 + [list_str.encode()] * 20 +
                      [df.to_csv().encode()] * 20 +
                      [df.to_csv(index=False).encode()] * 20)

        test_types = (['application/json'] * 20 + ['application/json'] * 20 +
                      ['text/csv'] * 20 + ['text/csv'] * 20)

        df_merged, slices = read_dataframes_from_json_n_csv(
            test_datas, test_types)
        for s in slices:
            left = df_merged[s].values
            right = df.values
            # BUG FIX: `np.float` (alias of builtin float) was deprecated in
            # NumPy 1.20 and removed in 1.24; use a floating-kind dtype check.
            if np.issubdtype(right.dtype, np.floating):
                np.testing.assert_array_almost_equal(left, right)
            else:
                np.testing.assert_array_equal(left, right)
Example #4
0
def test_batch_read_dataframes_from_json_with_wrong_orients(df, orient):
    """Feeding 'table'-orient JSON while requesting another orient should
    produce an empty merged result and all-falsy per-payload counts."""
    payload = df.to_json(orient='table').encode()
    test_datas = [payload, payload, payload]
    test_types = ['json'] * 3

    df_merged, counts = read_dataframes_from_json_n_csv(test_datas, test_types, orient)
    assert not df_merged
    assert all(not count for count in counts)
Example #5
0
def test_batch_read_dataframes_from_json_of_orients(df, orient):
    """Each chunk of the merged frame (delimited by `counts`) must equal
    the source frame that was serialized with the given orient."""
    test_datas = [df.to_json(orient=orient).encode() for _ in range(3)]
    test_types = ['json'] * 3
    df_merged, counts = read_dataframes_from_json_n_csv(test_datas, test_types, orient)
    offset = 0
    for count in counts:
        assert_df_equal(df_merged[offset : offset + count], df)
        offset += count
def test_batch_read_dataframes_from_json_in_mixed_order():
    """Key/row ordering differences between JSON payloads must not affect
    the merged result."""
    # records orient: columns appear in different orders per record
    df_json = b'[{"A": 1, "B": 2, "C": 3}, {"C": 6, "A": 2, "B": 4}]'
    df_merged, slices = read_dataframes_from_json_n_csv([df_json],
                                                        ['application/json'])
    expected = pd.read_json(df_json)
    for s in slices:
        assert_df_equal(df_merged[s], expected)

    # columns orient: rows and columns appear in different orders per payload
    df_json1 = b'{"A": {"1": 1, "2": 2}, "B": {"1": 2, "2": 4}, "C": {"1": 3, "2": 6}}'
    df_json2 = b'{"B": {"1": 2, "2": 4}, "A": {"1": 1, "2": 2}, "C": {"1": 3, "2": 6}}'
    df_json3 = b'{"A": {"1": 1, "2": 2}, "B": {"2": 4, "1": 2}, "C": {"1": 3, "2": 6}}'
    df_merged, slices = read_dataframes_from_json_n_csv(
        [df_json1, df_json2, df_json3], ['application/json'] * 3)
    cols = ["A", "B", "C"]
    expected = pd.read_json(df_json1)[cols]
    for s in slices:
        assert_df_equal(df_merged[s][cols], expected)
def test_batch_read_dataframes_from_csv_other_CRLF(df):
    """CSV serialized with the opposite newline convention must still parse
    back into the original frame."""
    csv_str = df.to_csv(index=False)
    # flip the line-ending convention of the serialized CSV
    if '\r\n' in csv_str:
        flipped = '\n'.join(_csv_split(csv_str, '\r\n')).encode()
    else:
        flipped = '\r\n'.join(_csv_split(csv_str, '\n')).encode()
    df_merged, _ = read_dataframes_from_json_n_csv([flipped], ['text/csv'])
    assert_df_equal(df_merged, df)
Example #8
0
def test_batch_read_dataframes_from_csv_other_CRLF(df):
    """Parsing must be agnostic to the CSV newline convention."""
    csv_str = df.to_csv(index=False)

    # swap \r\n <-> \n before handing the payload to the reader
    joiner = '\n' if '\r\n' in csv_str else '\r\n'
    csv_str = joiner.join(csv_splitlines(csv_str))
    df_merged, _ = read_dataframes_from_json_n_csv([csv_str], ['csv'])
    assert_df_equal(df_merged, df)
Example #9
0
def test_batch_read_dataframes_from_json_in_mixed_order():
    """Mixed key/row ordering across JSON payloads must merge consistently
    (counts-based API variant)."""
    # records orient with shuffled column order per record
    df_json = b'[{"A": 1, "B": 2, "C": 3}, {"C": 6, "A": 2, "B": 4}]'
    df_merged, counts = read_dataframes_from_json_n_csv([df_json], ['json'])
    expected = pd.read_json(df_json)
    pos = 0
    for count in counts:
        assert_df_equal(df_merged[pos : pos + count], expected)
        pos += count

    # columns orient with shuffled row/column order per payload
    df_json1 = b'{"A": {"1": 1, "2": 2}, "B": {"1": 2, "2": 4}, "C": {"1": 3, "2": 6}}'
    df_json2 = b'{"B": {"1": 2, "2": 4}, "A": {"1": 1, "2": 2}, "C": {"1": 3, "2": 6}}'
    df_json3 = b'{"A": {"1": 1, "2": 2}, "B": {"2": 4, "1": 2}, "C": {"1": 3, "2": 6}}'
    df_merged, counts = read_dataframes_from_json_n_csv(
        [df_json1, df_json2, df_json3], ['json'] * 3
    )
    cols = ["A", "B", "C"]
    expected = pd.read_json(df_json1)[cols]
    pos = 0
    for count in counts:
        assert_df_equal(df_merged[pos : pos + count][cols], expected)
        pos += count
def test_batch_read_dataframes_from_mixed_json_n_csv(df):
    """Build one batch mixing JSON payloads of every round-trippable orient
    with CSV payloads, then verify every slice equals the source frame."""
    test_datas, test_types = [], []

    # test content_type=application/json with various orients
    for orient in pytest.DF_ORIENTS:
        try:
            assert_df_equal(df, pd.read_json(df.to_json(orient=orient)))
        except (AssertionError, ValueError):
            # this orient does not round-trip in official pandas; skip it
            continue

        encoded = df.to_json(orient=orient).encode()
        test_datas += [encoded] * 3
        test_types += ['application/json'] * 3
        # exercise orient auto-detection (orient=None) on the running batch
        df_merged, slices = read_dataframes_from_json_n_csv(
            test_datas, test_types, orient=None)

    csv_bytes = df.to_csv(index=False).encode()
    test_datas += [csv_bytes] * 3
    test_types += ['text/csv'] * 3

    df_merged, slices = read_dataframes_from_json_n_csv(test_datas, test_types)
    for s in slices:
        assert_df_equal(df_merged[s], df)
Example #11
0
def test_benchmark_load_dataframes():
    '''
    read_dataframes_from_json_n_csv should be at least 20x faster than
    pd.read_json + pd.concat.

    NOTE(fix): the docstring previously claimed "30x" while the assertion
    below checks a 20x ratio; the text now matches the asserted bound.
    '''
    test_count = 50

    dfs = [pd.DataFrame(np.random.rand(10, 100)) for _ in range(test_count)]
    inputs = [df.to_json().encode() for df in dfs]

    # Baseline: parse each payload individually, then concatenate.
    time_st = time.time()
    dfs = [pd.read_json(i) for i in inputs]
    result1 = pd.concat(dfs)
    time1 = time.time() - time_st

    # Candidate: one batched call over all payloads.
    time_st = time.time()
    result2, _ = read_dataframes_from_json_n_csv(
        inputs, itertools.repeat('application/json'), 'columns')
    time2 = time.time() - time_st

    assert_df_equal(result1, result2)
    # actual ratio depends on hardware and available computing resources
    assert time1 / time2 > 20
Example #12
0
def test_batch_read_dataframes_from_mixed_json_n_csv(df):
    """Merge a batch containing JSON of every round-trippable orient plus
    CSV (string-payload, counts-based API variant)."""
    test_datas, test_types = [], []

    # test content_type=application/json with various orients
    for orient in pytest.DF_ORIENTS:
        try:
            assert_df_equal(df, pd.read_json(df.to_json(orient=orient)))
        except (AssertionError, ValueError):
            # this orient is not round-trippable by official pandas; skip it
            continue

        test_datas += [df.to_json(orient=orient)] * 3
        test_types += ['json'] * 3

    test_datas += [df.to_csv(index=False)] * 3
    test_types += ['csv'] * 3

    df_merged, counts = read_dataframes_from_json_n_csv(test_datas, test_types)
    pos = 0
    for count in counts:
        assert_df_equal(df_merged[pos : pos + count], df)
        pos += count
def test_batch_read_dataframes_from_json_with_wrong_orients(df, orient):
    """Decoding a 'table'-orient payload under a mismatched orient must
    raise BadInput."""
    payload = df.to_json(orient='table').encode()
    test_datas = [payload, payload, payload]
    test_types = ['application/json'] * 3

    with pytest.raises(BadInput):
        read_dataframes_from_json_n_csv(test_datas, test_types, orient)