def test_read_csv_glob_4373(self):
    """Regression test for #4373: ``usecols`` must work with ``read_csv_glob``.

    Writes a one-cell CSV, reads it back with both Modin and pandas using the
    same kwargs, and checks the results match.
    """
    import os

    columns, filename = ["col0"], "1x1.csv"
    pd.DataFrame([[1]], columns=columns).to_csv(filename)
    kwargs = {"filepath_or_buffer": filename, "usecols": columns}
    try:
        modin_df = pd.read_csv_glob(**kwargs)
        pandas_df = pandas.read_csv(**kwargs)
        df_equals(modin_df, pandas_df)
    finally:
        # The original test leaked `1x1.csv` into the CWD; always remove it.
        os.remove(filename)
def test_read_single_csv_with_parse_dates(self, parse_dates):
    """Check ``read_csv_glob`` mirrors ``pandas.read_csv`` for ``parse_dates``.

    If pandas raises, Modin must raise an exception of the same type;
    otherwise both frames must be equal.
    """
    try:
        expected_df = pandas.read_csv(time_parsing_csv_path, parse_dates=parse_dates)
    except Exception as pandas_err:
        # pandas failed, so Modin is expected to fail the same way.
        with pytest.raises(Exception) as modin_err:
            result = pd.read_csv_glob(time_parsing_csv_path, parse_dates=parse_dates)
            # Call __repr__ on the modin df to force it to materialize.
            repr(result)
        expected_type = type(pandas_err)
        actual_type = type(modin_err.value)
        assert isinstance(
            modin_err.value, expected_type
        ), "Got Modin Exception type {}, but pandas Exception type {} was expected".format(
            actual_type, expected_type
        )
    else:
        result = pd.read_csv_glob(time_parsing_csv_path, parse_dates=parse_dates)
        df_equals(result, expected_df)
def test_read_multiple_small_csv(self):  # noqa: F811
    """``read_csv_glob`` over a glob equals concatenating per-file pandas reads."""
    frames = [pandas.read_csv(name) for name in pytest.files]
    # Concatenation scrambles the index, so normalize both sides before comparing.
    expected = pandas.concat(frames).reset_index(drop=True)
    actual = pd.read_csv_glob(pytest.glob_path).reset_index(drop=True)
    df_equals(actual, expected)
def test_read_csv_empty_frame(self):
    """Reading only the index column (``usecols=[0], index_col=0``) must yield
    the same empty-columned frame in Modin and pandas."""
    read_kwargs = dict(usecols=[0], index_col=0)
    df_equals(
        pd.read_csv_glob(pytest.files[0], **read_kwargs),
        pandas.read_csv(pytest.files[0], **read_kwargs),
    )
def test_read_multiple_csv_nrows(self, request, nrows):  # noqa: F811
    """``nrows`` must limit ``read_csv_glob`` output the same way slicing
    limits the concatenated pandas frames."""
    combined = pandas.concat(pandas.read_csv(name) for name in pytest.files)
    # Keep only the first `nrows` rows; reset indexes on both sides since
    # concatenation scrambles them.
    expected = combined.iloc[:nrows, :].reset_index(drop=True)
    actual = pd.read_csv_glob(pytest.glob_path, nrows=nrows).reset_index(drop=True)
    df_equals(actual, expected)
def test_read_multiple_csv_s3_storage_opts(storage_options):
    """Check that ``read_csv_glob`` forwards ``storage_options`` to the S3 reads.

    The reference frame is built with plain pandas; the original used Modin's
    ``pd.concat``, which wraps the pandas frames in a Modin frame and makes the
    variable named ``pandas_df`` not actually a pandas object (inconsistent
    with the other tests in this file, which use ``pandas.concat``).
    """
    path = "s3://modin-datasets/testing/multiple_csv/"
    # Test the fact of handling of `storage_options`
    modin_df = pd.read_csv_glob(path, storage_options=storage_options)
    pandas_df = pandas.concat(
        [
            pandas.read_csv(
                f"{path}test_data{i}.csv",
                storage_options=storage_options,
            )
            for i in range(2)
        ],
    ).reset_index(drop=True)
    df_equals(modin_df, pandas_df)
def test_read_multiple_csv_s3():
    """Compare ``read_csv_glob`` on an S3 wildcard against per-file pandas reads.

    Uses ``pandas.concat`` (not Modin's ``pd.concat`` as the original did) so
    the reference frame stays a plain pandas DataFrame, consistent with the
    other tests in this file.
    """
    modin_df = pd.read_csv_glob("S3://noaa-ghcn-pds/csv/178*.csv")
    # We have to specify the columns because the column names are not identical.
    # Since we specified the column names, we also have to skip the original
    # column names.
    pandas_dfs = [
        pandas.read_csv(
            "s3://noaa-ghcn-pds/csv/178{}.csv".format(i),
            names=modin_df.columns,
            skiprows=[0],
        )
        for i in range(10)
    ]
    pandas_df = pandas.concat(pandas_dfs)
    # Indexes get messed up when concatting so we reset both.
    pandas_df = pandas_df.reset_index(drop=True)
    modin_df = modin_df.reset_index(drop=True)
    df_equals(modin_df, pandas_df)
def test_read_csv_without_glob(self):
    """A path with no shell-style wildcard should emit a UserWarning and then
    fail with FileNotFoundError."""
    with pytest.warns(UserWarning, match=r"Shell-style wildcard"), pytest.raises(
        FileNotFoundError
    ):
        pd.read_csv_glob("s3://nyc-tlc/trip data/yellow_tripdata_2020-")