コード例 #1
0
def test_read_multiple_csv_s3_storage_opts(storage_options):
    """Check that ``read_csv_glob`` handles ``storage_options`` for an S3 prefix.

    The whole prefix is read in one call through ``pd.read_csv_glob`` and
    compared with ``test_data0.csv`` and ``test_data1.csv`` read one by one
    with stock pandas and concatenated.

    Parameters
    ----------
    storage_options
        Forwarded unchanged to both readers. Presumably supplied by a
        fixture or parametrization defined elsewhere -- confirm at the
        definition site.
    """
    path = "s3://modin-datasets/testing/multiple_csv/"
    # Test the fact of handling of `storage_options`
    modin_df = pd.read_csv_glob(path, storage_options=storage_options)

    # Build the reference with stock pandas end to end (`pandas.concat`, not
    # `pd.concat`) so the expected frame does not depend on the library
    # under test.
    pandas_df = pandas.concat(
        [
            pandas.read_csv(
                f"{path}test_data{i}.csv",
                storage_options=storage_options,
            )
            for i in range(2)
        ],
    ).reset_index(drop=True)

    df_equals(modin_df, pandas_df)
コード例 #2
0
def test_read_multiple_csv_s3():
    """Check that ``read_csv_glob`` on an S3 glob matches per-file pandas reads.

    The glob ``178*.csv`` is read in one call through ``pd.read_csv_glob``
    and compared with the files ``1780.csv`` .. ``1789.csv`` read
    individually with stock pandas and concatenated.
    """
    modin_df = pd.read_csv_glob("S3://noaa-ghcn-pds/csv/178*.csv")

    # We have to specify the columns because the column names are not
    # identical. Since we specified the column names, we also have to skip
    # the original column names.
    pandas_dfs = [
        pandas.read_csv(
            f"s3://noaa-ghcn-pds/csv/178{i}.csv",
            names=modin_df.columns,
            skiprows=[0],
        )
        for i in range(10)
    ]
    # Concatenate with stock pandas (`pandas.concat`, not `pd.concat`) so the
    # reference frame does not depend on the library under test.
    pandas_df = pandas.concat(pandas_dfs)

    # Indexes get messed up when concatting so we reset both.
    pandas_df = pandas_df.reset_index(drop=True)
    modin_df = modin_df.reset_index(drop=True)

    df_equals(modin_df, pandas_df)