def test_categorical(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    ddf2 = dd.read_parquet(tmp, categories='x', engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'], engine=read_engine)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    if read_engine != 'pyarrow':
        ddf2 = dd.read_parquet(tmp, engine=read_engine)
        assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

        ddf2.loc[:1000].compute()
        df.index.name = 'index'  # defaults to 'index' in this case
        assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[], engine=read_engine)

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
def test_categorical(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
    ddf = dd.from_pandas(df, npartitions=3)
    dd.to_parquet(ddf, tmp)

    ddf2 = dd.read_parquet(tmp, categories='x')
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2 = dd.read_parquet(tmp, categories=['x'])
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    # autocat
    ddf2 = dd.read_parquet(tmp)
    assert ddf2.compute().x.cat.categories.tolist() == ['a', 'b', 'c']

    ddf2.loc[:1000].compute()
    df.index.name = 'index'  # defaults to 'index' in this case
    assert assert_eq(df, ddf2)

    # dereference cats
    ddf2 = dd.read_parquet(tmp, categories=[])

    ddf2.loc[:1000].compute()
    assert (df.x == ddf2.x).all()
def test_filters(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)

    df = pd.DataFrame({'at': ['ab', 'aa', 'ba', 'da', 'bb']})
    ddf = dd.from_pandas(df, npartitions=1)

    # Ok with 1 partition and filters
    ddf.repartition(npartitions=1, force=True).to_parquet(
        fn, write_index=False, engine=write_engine)
    ddf2 = dd.read_parquet(fn, index=False, engine=read_engine,
                           filters=[('at', '==', 'aa')]).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and no filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    dd.read_parquet(fn, engine=read_engine).compute()
    assert_eq(ddf2, ddf)

    # with >1 partition and filters using base fastparquet
    if read_engine == 'fastparquet':
        ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
        df2 = fastparquet.ParquetFile(fn).to_pandas(filters=[('at', '==', 'aa')])
        assert len(df2) > 0

    # with >1 partition and filters
    ddf.repartition(npartitions=2, force=True).to_parquet(fn, engine=write_engine)
    dd.read_parquet(fn, engine=read_engine, filters=[('at', '==', 'aa')]).compute()
    assert len(ddf2) > 0
def test_writing_parquet_with_kwargs(tmpdir, engine):
    fn = str(tmpdir)
    path1 = os.path.join(fn, 'normal')
    path2 = os.path.join(fn, 'partitioned')
    pytest.importorskip("snappy")

    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    ddf = dd.from_pandas(df, npartitions=3)

    engine_kwargs = {
        'pyarrow': {
            'compression': 'snappy',
            'coerce_timestamps': None,
            'use_dictionary': True
        },
        'fastparquet': {
            'compression': 'snappy',
            'times': 'int64',
            'fixed_text': None
        }
    }

    ddf.to_parquet(path1, engine=engine, **engine_kwargs[engine])
    out = dd.read_parquet(path1, engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(out, ddf, check_index=(engine != 'fastparquet'),
              check_divisions=should_check_divs(engine))

    # Avoid race condition in pyarrow 0.8.0 on writing partitioned datasets
    with dask.config.set(scheduler='sync'):
        ddf.to_parquet(path2, engine=engine, partition_on=['a'],
                       **engine_kwargs[engine])
    out = dd.read_parquet(path2, engine=engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])
def test_read_from_fastparquet_parquetfile(tmpdir):
    check_fastparquet()
    fn = str(tmpdir)

    df = pd.DataFrame({
        'a': np.random.choice(['A', 'B', 'C'], size=100),
        'b': np.random.random(size=100),
        'c': np.random.randint(1, 5, size=100)
    })
    d = dd.from_pandas(df, npartitions=2)
    d.to_parquet(fn, partition_on=['a'], engine='fastparquet')

    pq_f = fastparquet.ParquetFile(fn)

    # OK with no filters
    out = dd.read_parquet(pq_f).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])

    # OK with filters
    out = dd.read_parquet(pq_f, filters=[('a', '==', 'B')]).compute()
    assert set(df.b[df.a == 'B']) == set(out.b)

    # Engine should not be set to 'pyarrow'
    with pytest.raises(AssertionError):
        out = dd.read_parquet(pq_f, engine='pyarrow')
def test_informative_error_messages():
    with pytest.raises(ValueError) as info:
        dd.read_parquet('foo', engine='foo')

    assert 'foo' in str(info.value)
    assert 'arrow' in str(info.value)
    assert 'fastparquet' in str(info.value)
def test_read_parquet_custom_columns(tmpdir, engine):
    import glob
    tmp = str(tmpdir)
    data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                         'f': np.arange(1000, dtype=np.float64)})
    df = dd.from_pandas(data, chunksize=50)
    df.to_parquet(tmp)

    df2 = dd.read_parquet(tmp, columns=['i32', 'f'], engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(df[['i32', 'f']], df2, check_index=False,
              check_divisions=should_check_divs(engine))

    fns = glob.glob(os.path.join(tmp, '*.parquet'))
    df2 = dd.read_parquet(fns, columns=['i32'], engine=engine).compute()
    df2.sort_values('i32', inplace=True)
    assert_eq(df[['i32']], df2, check_index=False, check_divisions=False)

    df3 = dd.read_parquet(tmp, columns=['f', 'i32'], engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(df[['f', 'i32']], df3, check_index=False,
              check_divisions=should_check_divs(engine))
def test_read_series(tmpdir, engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, columns=['x'], engine=engine,
                           infer_divisions=should_check_divs(engine))
    assert_eq(ddf[['x']], ddf2, check_divisions=should_check_divs(engine))

    ddf2 = dd.read_parquet(fn, columns='x', index='myindex', engine=engine,
                           infer_divisions=should_check_divs(engine))
    assert_eq(ddf.x, ddf2, check_divisions=should_check_divs(engine))
def test_read_series(tmpdir, engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, columns=['x'], engine=engine)
    assert_eq(df[['x']], ddf2)

    ddf2 = dd.read_parquet(fn, columns='x', index='myindex', engine=engine)
    assert_eq(df.x, ddf2)
def test_pyarrow_raises_filters_categoricals(tmpdir):
    check_pyarrow()
    tmp = str(tmpdir)
    data = pd.DataFrame({"A": [1, 2]})
    df = dd.from_pandas(data, npartitions=2)
    df.to_parquet(tmp, write_index=False, engine="pyarrow")

    with pytest.raises(NotImplementedError):
        dd.read_parquet(tmp, engine="pyarrow", filters=["A>1"])
def test_nonsense_column(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    with pytest.raises((ValueError, KeyError)):
        # fastparquet fails early, pyarrow only on compute
        dd.read_parquet(fn, columns=['nonesense'], engine=read_engine).compute()
    with pytest.raises((Exception, KeyError)):
        # fastparquet fails early, pyarrow only on compute
        dd.read_parquet(fn, columns=['nonesense'] + list(ddf.columns),
                        engine=read_engine).compute()
def test_read_parquet_custom_columns(tmpdir, engine):
    tmp = str(tmpdir)
    data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                         'f': np.arange(1000, dtype=np.float64)})
    df = dd.from_pandas(data, chunksize=50)
    df.to_parquet(tmp)

    df2 = dd.read_parquet(tmp, columns=['i32', 'f'], engine=engine)
    assert_eq(df[['i32', 'f']], df2, check_index=False)

    df3 = dd.read_parquet(tmp, columns=['f', 'i32'], engine=engine)
    assert_eq(df[['f', 'i32']], df3, check_index=False)
def test_infer_divisions_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine, write_index=False)

    if read_engine == 'pyarrow' and not check_pa_divs:
        match = 'requires pyarrow >=0.9.0'
        ex = NotImplementedError
    else:
        match = 'no index column was discovered'
        ex = ValueError

    with pytest.raises(ex, match=match):
        dd.read_parquet(fn, engine=read_engine, infer_divisions=True)
def test_infer_divisions_not_sorted(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)

    if read_engine == 'pyarrow' and not check_pa_divs:
        match = 'requires pyarrow >=0.9.0'
        ex = NotImplementedError
    else:
        match = 'not known to be sorted across partitions'
        ex = ValueError

    with pytest.raises(ex, match=match):
        dd.read_parquet(fn, index='x', engine=read_engine, infer_divisions=True)
def test_passing_parquetfile(tmpdir):
    import shutil
    fp = pytest.importorskip('fastparquet')
    path = str(tmpdir)
    df = pd.DataFrame({"x": [1, 3, 2, 4]})
    ddf = dd.from_pandas(df, npartitions=1)

    dd.to_parquet(ddf, path)
    pf = fp.ParquetFile(path)
    shutil.rmtree(path)

    # should pass, because no need to re-read metadata
    dd.read_parquet(pf)
def test_parquet_select_cats(tmpdir):
    check_fastparquet()
    fn = str(tmpdir)
    df = pd.DataFrame({
        'categories': pd.Series(
            np.random.choice(['a', 'b', 'c', 'd', 'e', 'f'], size=100),
            dtype='category'),
        'ints': pd.Series(list(range(0, 100)), dtype='int'),
        'floats': pd.Series(list(range(0, 100)), dtype='float')})

    ddf = dd.from_pandas(df, 1)
    ddf.to_parquet(fn)
    rddf = dd.read_parquet(fn, columns=['ints'])
    assert list(rddf.columns) == ['ints']
    rddf = dd.read_parquet(fn)
    assert list(rddf.columns) == list(df)
def test_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf.to_parquet(fn, write_index=False, engine=write_engine)
    ddf2 = dd.read_parquet(fn, engine=read_engine)
    assert_eq(df, ddf2, check_index=False)
def test_categories(fn):
    df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
                       'y': list('caaab')})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf['y'] = ddf.y.astype('category')
    ddf.to_parquet(fn)
    ddf2 = dd.read_parquet(fn, categories=['y'])
    with pytest.raises(NotImplementedError):
        ddf2.y.cat.categories
    assert set(ddf2.y.compute().cat.categories) == {'a', 'b', 'c'}
    cats_set = ddf2.map_partitions(lambda x: x.y.cat.categories).compute()
    assert cats_set.tolist() == ['a', 'c', 'a', 'b']
    assert_eq(ddf.y, ddf2.y, check_names=False)
    with pytest.raises(dask.async.RemoteException):
        # attempt to load as category that which is not so encoded
        ddf2 = dd.read_parquet(fn, categories=['x']).compute()
def test_read_list(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine=write_engine)
    files = sorted([os.path.join(tmpdir, f)
                    for f in os.listdir(tmpdir)
                    if not f.endswith('_metadata')],
                   key=natural_sort_key)

    # Infer divisions for engines/versions that support it
    ddf2 = dd.read_parquet(files, engine=read_engine,
                           infer_divisions=should_check_divs(write_engine) and
                           should_check_divs(read_engine))
    assert_eq(ddf, ddf2,
              check_divisions=should_check_divs(write_engine) and
              should_check_divs(read_engine))

    # No divisions
    ddf2_no_divs = dd.read_parquet(files, engine=read_engine, infer_divisions=False)
    assert_eq(ddf.clear_divisions(), ddf2_no_divs, check_divisions=True)
def test_read_list(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine=write_engine)
    files = sorted(os.path.join(tmpdir, f) for f in os.listdir(tmpdir)
                   if not f.endswith('_metadata'))
    ddf2 = dd.read_parquet(files, engine=read_engine)
    assert_eq(df, ddf2)
def test_timestamp_index(tmpdir, engine):
    fn = str(tmpdir)
    df = tm.makeTimeDataFrame()
    df.index.name = 'foo'
    ddf = dd.from_pandas(df, npartitions=5)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, engine=engine)
    assert_eq(df, ddf2)
def test_timestamp_index(tmpdir, engine):
    fn = str(tmpdir)
    df = tm.makeTimeDataFrame()
    df.index.name = 'foo'
    ddf = dd.from_pandas(df, npartitions=5)
    ddf.to_parquet(fn, engine=engine)
    ddf2 = dd.read_parquet(fn, engine=engine,
                           infer_divisions=should_check_divs(engine))
    assert_eq(ddf, ddf2, check_divisions=should_check_divs(engine))
def test_read_glob(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    if os.path.exists(os.path.join(fn, '_metadata')):
        os.unlink(os.path.join(fn, '_metadata'))
    files = os.listdir(fn)
    assert '_metadata' not in files

    # Infer divisions for engines/versions that support it
    ddf2 = dd.read_parquet(os.path.join(fn, '*'), engine=read_engine,
                           infer_divisions=should_check_divs(write_engine) and
                           should_check_divs(read_engine))
    assert_eq(ddf, ddf2,
              check_divisions=should_check_divs(write_engine) and
              should_check_divs(read_engine))

    # No divisions
    ddf2_no_divs = dd.read_parquet(os.path.join(fn, '*'), engine=read_engine,
                                   infer_divisions=False)
    assert_eq(ddf.clear_divisions(), ddf2_no_divs, check_divisions=True)
def test_partition_on(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    d = dd.from_pandas(df, npartitions=2)
    d.to_parquet(tmpdir, partition_on=['a'], engine=write_engine)
    out = dd.read_parquet(tmpdir, engine=read_engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])
def test_empty(tmpdir, write_engine, read_engine, index):
    fn = str(tmpdir)
    df = pd.DataFrame({'a': ['a', 'b', 'b'], 'b': [4, 5, 6]})[:0]
    if index:
        df.set_index('a', inplace=True, drop=True)
    ddf = dd.from_pandas(df, npartitions=2)

    ddf.to_parquet(fn, write_index=index, engine=write_engine)
    read_df = dd.read_parquet(fn, engine=read_engine)
    assert_eq(ddf, read_df)
def test_read_glob(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    os.unlink(os.path.join(fn, '_metadata'))
    files = os.listdir(fn)
    assert '_metadata' not in files
    ddf2 = dd.read_parquet(os.path.join(fn, '*'), engine=read_engine)
    assert_eq(df, ddf2)
def test_timestamp96(tmpdir):
    check_fastparquet()
    fn = str(tmpdir)
    df = pd.DataFrame({'a': ['now']}, dtype='M8[ns]')
    ddf = dd.from_pandas(df, 1)
    ddf.to_parquet(fn, write_index=False, times='int96')
    pf = fastparquet.ParquetFile(fn)
    assert pf._schema[1].type == fastparquet.parquet_thrift.Type.INT96
    out = dd.read_parquet(fn).compute()
    assert_eq(out, df)
def test_roundtrip(tmpdir, df, write_kwargs, read_kwargs):
    check_fastparquet()
    tmp = str(tmpdir)
    if df.index.name is None:
        df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)

    dd.to_parquet(ddf, tmp, **write_kwargs)
    ddf2 = dd.read_parquet(tmp, index=df.index.name, **read_kwargs)
    assert_eq(ddf, ddf2)
def test_empty_partition(tmpdir, engine):
    fn = str(tmpdir)
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    ddf = dd.from_pandas(df, npartitions=5)

    ddf2 = ddf[ddf.a <= 5]
    ddf2.to_parquet(fn, engine=engine)

    ddf3 = dd.read_parquet(fn, engine=engine)
    sol = ddf2.compute()
    assert_eq(sol, ddf3, check_names=False, check_index=False)
def test_columns_no_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)
    ddf2 = ddf.reset_index()

    # No Index
    # --------
    # All columns, none as index
    assert_eq(dd.read_parquet(fn, index=False, engine=read_engine,
                              infer_divisions=False),
              ddf2, check_index=False, check_divisions=True)

    # Two columns, none as index
    assert_eq(dd.read_parquet(fn, index=False, columns=['x', 'y'],
                              engine=read_engine, infer_divisions=False),
              ddf2[['x', 'y']], check_index=False, check_divisions=True)

    # One column and one index, all as columns
    assert_eq(dd.read_parquet(fn, index=False, columns=['myindex', 'x'],
                              engine=read_engine, infer_divisions=False),
              ddf2[['myindex', 'x']], check_index=False, check_divisions=True)
def extend_meta(self, df):
    '''Add data to the metadata by passing a dataframe with htid and the new
    columns.'''
    with ProgressBar():
        new_ddf = self.ddf.join(df, on='htid')
        new_ddf.to_parquet(self.data_path + '.new')

    print('Extended files created. Deleting old files')
    for file in os.listdir(self.data_path):
        fname = os.path.join(self.data_path, file)
        os.remove(fname)
    os.removedirs(self.data_path)

    os.rename(self.data_path + '.new', self.data_path)
    self.ddf = dd.read_parquet(self.data_path, compression='snappy')
def test_read_list(tmpdir, write_engine, read_engine):
    tmpdir = str(tmpdir)
    ddf.to_parquet(tmpdir, engine=write_engine)
    files = sorted([
        os.path.join(tmpdir, f)
        for f in os.listdir(tmpdir)
        if not f.endswith('_metadata')
    ], key=natural_sort_key)

    # Infer divisions for engines/versions that support it
    ddf2 = dd.read_parquet(files, engine=read_engine,
                           infer_divisions=should_check_divs(write_engine) and
                           should_check_divs(read_engine))
    assert_eq(ddf, ddf2,
              check_divisions=should_check_divs(write_engine) and
              should_check_divs(read_engine))

    # No divisions
    ddf2_no_divs = dd.read_parquet(files, engine=read_engine, infer_divisions=False)
    assert_eq(ddf.clear_divisions(), ddf2_no_divs, check_divisions=True)
def run(self):
    dsk = dd.read_parquet(
        os.getenv('local_location') + 'trading_history/*.parquet')
    # dsk['time'] = dd.to_datetime(dsk['time'])
    dsk['time'] = dsk['time'].astype("M8[D]")
    dsk = dsk.set_index('time')
    dsk = dsk[dsk['type'].isin(['DAILY_FINANCING'])]
    dsk = dsk[['amount', 'accountBalance', 'financing']]
    dsk['financing'] = dsk['financing'].fillna(0.0)
    dsk['financing'] = dsk['financing'].astype('float64')
    dsk['accountBalance'] = dsk['accountBalance'].astype('float64')
    df = dsk.compute()
    df['financing'] = df['financing'].cumsum(axis=0)
    # print(df.head())
    self.create_graph(df)
async def fetch_ship_data(data_type: ShipDataTypes):
    valid_types = ['profile', 'discrete']
    if data_type.value in valid_types:
        df = dd.read_parquet(SHIP_S3_MAP[data_type]).compute()
        df_json = json.loads(df.to_json(orient='records'))
        return {"status": "success", "result": df_json, "msg": ""}
    else:
        return {
            "status": "error",
            "result": None,
            "msg": f"{data_type} is invalid. Valid values: {', '.join(valid_types)}",
        }
def test_empty_partition(fn):
    df = pd.DataFrame({"a": range(10), "b": range(10)})
    ddf = dd.from_pandas(df, npartitions=5)

    # fails as there are empty partitions
    ddf2 = ddf[ddf.a <= 5]
    ddf2.to_parquet(fn)

    ddf3 = dd.read_parquet(fn)
    assert_eq(ddf2.compute(), ddf3.compute(), check_names=False,
              check_index=False)

    ddf2 = ddf[ddf.a <= -5]
    with pytest.raises(ValueError):
        ddf2.to_parquet(fn)
def test_to_parquet_lazy(tmpdir, scheduler, engine):
    tmpdir = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3, 4],
                       'b': [1., 2., 3., 4.]})
    df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)
    value = ddf.to_parquet(tmpdir, compute=False, engine=engine)

    assert hasattr(value, 'dask')
    value.compute(scheduler=scheduler)
    assert os.path.exists(tmpdir)

    ddf2 = dd.read_parquet(tmpdir, engine=engine,
                           infer_divisions=should_check_divs(engine))
    assert_eq(ddf, ddf2, check_divisions=should_check_divs(engine))
def test_to_parquet_lazy(tmpdir, get):
    tmpdir = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3, 4],
                       'b': [1., 2., 3., 4.]})
    df.index.name = 'index'
    ddf = dd.from_pandas(df, npartitions=2)
    value = ddf.to_parquet(tmpdir, compute=False)

    assert hasattr(value, 'dask')
    # assert not os.path.exists(tmpdir)
    value.compute(get=get)
    assert os.path.exists(tmpdir)

    ddf2 = dd.read_parquet(tmpdir)
    assert_eq(ddf, ddf2)
def _read_numeric_file(fname):
    try:
        return dd.read_parquet(fname)
    except:
        pass

    try:
        return dd.read_csv(fname)
    except:
        pass

    try:
        return np.load(fname)
    except:
        pass
def post(self, request):
    data = request.data

    project_slug = data['project']
    pipeline_slug = data['pipeline']

    fns = get_protein_quant_fn(project_slug, pipeline_slug)

    if len(fns) == 0:
        return JsonResponse({})

    cols = ['Majority protein IDs', 'Score', 'Intensity']

    ddf = dd.read_parquet(fns, engine="pyarrow")[cols]
    res = ddf.groupby(['Majority protein IDs']).mean().sort_values('Score').compute()

    response = {}
    response['protein_names'] = list(res.index)
    for col in res.columns:
        response[col] = res[col].to_list()

    return JsonResponse(response)
def opendamir_reduce_by_month(path2file):
    """
    Reduce one month of opendamir into a database indexed by PRS_NAT_REF and
    with expenditure indicators as variables.

    :param path2file: String, Monthly compressed file
    :return: pandas DataFrame, Monthly expenditure database with summed columns by act
    """
    df = dd.read_parquet(path2file)
    # Filters: this can be adapted to the specific problem
    df = df.loc[df["ETE_IND_TAA"] != 1, :]
    df = df.groupby(["PRS_NAT", "PSE_ACT_CAT"]).agg({
        "PRS_PAI_MNT": "sum",
        "PRS_REM_MNT": "sum"
    }).compute()
    return df
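# A hypothetical usage sketch for the function above, not part of the original
# source: the file name is illustrative and assumes a monthly OPEN DAMIR
# extract already converted to parquet and reachable from this script.
monthly_spending = opendamir_reduce_by_month('A201601.parquet')
print(monthly_spending.head())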
def _export(self, name):
    # Read the data
    feature_path = self._full_feature_path(name)
    try:
        ddf = dd.read_parquet(
            feature_path,
            engine="pyarrow",
            storage_options=self._clean_dict(self.storage_options),
        )
        # Repartition to optimise files on exported dataset
        ddf = ddf.repartition(partition_size="25MB")
        return ddf
    except Exception as e:
        # No data available
        return None
def test_partition_on_cats(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    d = pd.DataFrame({'a': np.random.rand(50),
                      'b': np.random.choice(['x', 'y', 'z'], size=50),
                      'c': np.random.choice(['x', 'y', 'z'], size=50)})
    d = dd.from_pandas(d, 2)
    d.to_parquet(tmp, partition_on=['b'], engine='fastparquet')
    df = dd.read_parquet(tmp, engine='fastparquet')
    assert set(df.b.cat.categories) == {'x', 'y', 'z'}

    d.to_parquet(tmp, partition_on=['b', 'c'], engine='fastparquet')
    df = dd.read_parquet(tmp, engine='fastparquet')
    assert set(df.b.cat.categories) == {'x', 'y', 'z'}
    assert set(df.c.cat.categories) == {'x', 'y', 'z'}

    df = dd.read_parquet(tmp, columns=['a', 'c'], engine='fastparquet')
    assert set(df.c.cat.categories) == {'x', 'y', 'z'}
    assert 'b' not in df.columns

    df = dd.read_parquet(tmp, index='c', engine='fastparquet')
    assert set(df.index.categories) == {'x', 'y', 'z'}
    assert 'c' not in df.columns

    # series
    df = dd.read_parquet(tmp, columns='b', engine='fastparquet')
    assert set(df.cat.categories) == {'x', 'y', 'z'}
def download_rtd(self):
    """
    Pull the Rtd.__tablename__ table from db, parse it and save it on disk.
    """
    with ProgressBar():
        rtd = dd.read_sql_table(self.__tablename__, DB_CONNECT_STRING,
                                index_col='hash_id', meta=self.meta,
                                npartitions=200)
        rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow',
                       schema='infer')  # write_metadata_file=False)

        rtd = dd.read_parquet(self.DATA_CACHE_PATH, engine='pyarrow')
        rtd = self._parse(rtd)
        self._save_encoders(rtd)

        # Save data to parquet. We have to use pyarrow as fastparquet does not
        # support pd.Int64
        rtd.to_parquet(self.DATA_CACHE_PATH, engine='pyarrow', schema='infer')
def _read(
    self, name, from_date=None, to_date=None, freq=None, time_travel=None, **kwargs
):
    # Identify which partitions to read
    filters = []
    if from_date:
        filters.append(("time", ">=", pd.Timestamp(from_date)))
    if to_date:
        filters.append(("time", "<=", pd.Timestamp(to_date)))
    if kwargs.get("partitions"):
        for p in kwargs.get("partitions"):
            filters.append(("partition", "==", p))
    filters = [filters] if filters else None

    # Read the data
    feature_path = self._full_feature_path(name)
    try:
        ddf = dd.read_parquet(
            feature_path,
            engine="pyarrow",
            filters=filters,
            storage_options=self._clean_dict(self.storage_options),
        )
        ddf = ddf.repartition(partition_size="25MB")
    except PermissionError as e:
        raise e
    except Exception as e:
        # No data available
        empty_df = pd.DataFrame(
            columns=["time", "created_time", "value", "partition"]
        ).set_index("time")
        ddf = dd.from_pandas(empty_df, chunksize=1)

    if "partition" in ddf.columns:
        ddf = ddf.drop(columns="partition")

    # Apply time-travel
    if time_travel:
        ddf = ddf.reset_index()
        ddf = ddf[ddf.created_time <= ddf.time + pd.Timedelta(time_travel)]
        ddf = ddf.set_index("time")

    # De-serialize from JSON if required
    if kwargs.get("serialized"):
        ddf = ddf.map_partitions(
            lambda df: df.assign(value=df.value.apply(pd.io.json.loads)),
            meta={
                "value": "object",
                "created_time": "datetime64[ns]",
            },
        )

    return ddf
def read_df(pattern, dbsystem='dask', sqlContext=None):
    """
    Reads a set of data contained in a folder as a spark or dask DataFrame

    Parameters
    ----------
    pattern : str
        Unix style wildcard pattern pointing to the files, for example
        /store/msrad/folder/*.csv will read all csv files in that folder
    dbsystem : str
        Either "dask" if you want a Dask DataFrame or "spark" if you want a
        spark dataframe
    sqlContext : sqlContext instance
        sqlContext to use, required only if dbsystem = 'spark'

    Returns
    -------
    A spark or dask DataFrame instance
    """
    if dbsystem not in ['spark', 'dask']:
        raise NotImplementedError(
            'Only dbsystem = "spark" or "dask" are supported!')
    if dbsystem == 'spark' and sqlContext is None:
        raise ValueError('sqlContext must be provided if dbsystem = "spark"!')

    files = glob.glob(pattern)
    df = None
    if '.parq' in files[0] or '.parquet' in files[0]:
        # For some reason wildcards are not accepted with parquet
        if dbsystem == 'spark':
            df = sqlContext.read.parquet(*files)
        else:
            df = dd.read_parquet(pattern)
    elif '.csv' in files[0]:
        if dbsystem == 'spark':
            df = sqlContext.read.csv(pattern, header=True, inferSchema=True)
        else:
            if '.gz' in files[0]:
                df = dd.read_csv(pattern, compression='gzip')
            else:
                df = dd.read_csv(pattern)
    else:
        logging.error("""Invalid data, only csv and parquet files are accepted.
        Make sure that they have a valid suffix (.csv, .csv.gz, .parquet, .parq)""")
    return df
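# A hypothetical usage sketch for read_df above, not part of the original
# source: the pattern is illustrative and assumes gzipped CSV files readable
# by a local Dask installation.
daily = read_df('/store/msrad/folder/*.csv.gz', dbsystem='dask')
print(daily.head())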
def _import_dask(storage, merge_records=False, dm_name=None,
                 import_type='staging', return_dask_graph=False,
                 connector_id=None, staging_name=None, view_name=None,
                 columns=None, max_hits=None, mapping_columns=None):
    if columns:
        columns = list(set(columns))
        columns += __STAGING_FIELDS
        columns = list(set(columns))

    if import_type == 'golden':
        url = [storage.build_url_parquet_golden(dm_name=dm_name)]
    elif import_type == 'staging':
        url = []
        url1 = storage.build_url_parquet_staging(staging_name=staging_name,
                                                 connector_id=connector_id)
        if url1 is not None:
            url.append(url1)

        url2 = storage.build_url_parquet_staging_master(
            staging_name=staging_name, connector_id=connector_id)
        if url2 is not None:
            url.append(url2)

        url3 = storage.build_url_parquet_staging_rejected(
            staging_name=staging_name, connector_id=connector_id)
        if url3 is not None:
            url.append(url3)
    elif import_type == 'view':
        url = [storage.build_url_parquet_view(view_name=view_name)]
    else:
        raise KeyError('import_type should be `golden`,`staging` or `view`')

    d = dd.read_parquet(url, storage_options=storage.get_dask_options(),
                        columns=columns)
    d = d.rename(columns=mapping_columns)

    if return_dask_graph:
        return d
    else:
        return d.compute()
def test_ordering(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp)

    pf = fastparquet.ParquetFile(tmp)
    assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex')
    assert_eq(ddf, ddf2)
def to_dask(self):
    """
    Create a lazy dask-dataframe from the parquet data
    """
    # More efficient to call dask function directly.
    self._load_metadata()
    columns = self._kwargs.get('columns', None)
    index = self._kwargs.get('index', None)
    filters = self._kwargs.get('filters', [])
    self._df = dd.read_parquet(self._urlpath, columns=columns, index=index,
                               filters=filters)
    self._schema = None
    self.discover()  # resets schema to dask's better version
    return self._df
def _to_dask(self):
    """
    Create a lazy dask-dataframe from the parquet data
    """
    import dask.dataframe as dd
    urlpath = self._get_cache(self._urlpath)[0]
    kw = dict(columns=self._kwargs.get('columns', None),
              index=self._kwargs.get('index', None),
              engine=self._kwargs.get('engine', 'auto'))
    if 'filters' in self._kwargs:
        kw['filters'] = self._kwargs['filters']
    self._df = dd.read_parquet(urlpath,
                               storage_options=self._storage_options, **kw)
    self._load_metadata()
    return self._df
def _persist_to_file(dataset: Union[str, dd.DataFrame], stage_i, stage_name,
                     cache_dir):
    assert cache_dir is not None, \
        "When using dask dataframe, cache dir must be provided"
    assert '/' not in stage_name \
        and '\\' not in stage_name \
        and ':' not in stage_name \
        and '..' not in stage_name, "Unsafe stage symbols"

    cache_path = _get_cache_path(cache_dir, stage_i, stage_name)
    if isinstance(dataset, dd.DataFrame):
        dataset.to_parquet(cache_path, engine='fastparquet')
    elif isinstance(dataset, str):
        logger.debug("Moving {} to {}".format(dataset, cache_path))
        shutil.move(dataset, cache_path)
    else:
        raise NotImplementedError()
    return dd.read_parquet(cache_path)
def combine_from_input(input_file, extractor, field, simulations_desc,
                       num_procs=1, sample_size=1000, noise=True, error=True):
    temp_dir = tempfile.mkdtemp(prefix='/dev/shm/')
    try:
        levels_names = exec_in_subprocess(load_normalize_and_dump_data,
                                          input_file, extractor,
                                          simulations_desc, num_procs,
                                          temp_dir)
        df_dask = dd.read_parquet(temp_dir)
        return combine_sorted(df_dask, field, simulations_desc, levels_names,
                              num_procs=num_procs, sample_size=sample_size,
                              noise=noise, error=True)
    finally:
        shutil.rmtree(temp_dir)
def test_columns_index(tmpdir, write_engine, read_engine):
    fn = str(tmpdir)
    ddf.to_parquet(fn, engine=write_engine)

    # With Index
    # ----------
    # ### Empty columns ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, columns=[], engine=read_engine,
                              infer_divisions=should_check_divs(read_engine)),
              ddf[[]], check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, columns=[], engine=read_engine,
                              infer_divisions=False),
              ddf[[]].clear_divisions(), check_divisions=True)

    # ### Single column, auto select index ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, columns=['x'], engine=read_engine,
                              infer_divisions=should_check_divs(read_engine)),
              ddf[['x']], check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, columns=['x'], engine=read_engine,
                              infer_divisions=False),
              ddf[['x']].clear_divisions(), check_divisions=True)

    # ### Single column, specify index ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x'],
                              engine=read_engine,
                              infer_divisions=should_check_divs(read_engine)),
              ddf[['x']], check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x'],
                              engine=read_engine, infer_divisions=False),
              ddf[['x']].clear_divisions(), check_divisions=True)

    # ### Two columns, specify index ###
    # With divisions if supported
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x', 'y'],
                              engine=read_engine,
                              infer_divisions=should_check_divs(read_engine)),
              ddf, check_divisions=should_check_divs(read_engine))

    # No divisions
    assert_eq(dd.read_parquet(fn, index='myindex', columns=['x', 'y'],
                              engine=read_engine, infer_divisions=False),
              ddf.clear_divisions(), check_divisions=True)
def test_ordering(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3],
                       'b': [10, 20, 30],
                       'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    if read_engine == 'fastparquet':
        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(ddf, ddf2, check_divisions=False)
def test_copy_dask_to_gcs_dir(
    tmp_path,
    tmp_gcs_url_prefix,
    expected_dask_df,
    dask_flow,
    override_gcs_for_copy_if_fake_gcp,
    gcs_fs,
):
    cloud_url = tmp_gcs_url_prefix + "output"
    local_path = tmp_path / "output"

    dask_flow.get("dask_df", mode="FileCopier").copy(destination=cloud_url)
    gcs_fs.get(cloud_url, str(local_path), recursive=True)

    actual = dd.read_parquet(local_path)
    assert equal_frame_and_index_content(actual.compute(), expected_dask_df.compute())
def clean_cabs_at_path(special: bool, s3_in_url: str, s3_out_url: str,
                       s3_options: Dict) -> bool:
    try:
        df = dd.read_parquet(path=s3_in_url,
                             storage_options=s3_options,
                             engine='fastparquet')

        # add cab zones
        if not special:
            print('In data clean tasks for cabs. Field dolocationid not found')
            # fetch cab zones
            taxi_zones_df: GeoDataFrame = fetch_cab_zones()
            df['dolocationid'] = df.map_partitions(
                partial(add_cab_zone,
                        taxi_zone_df=taxi_zones_df,
                        lon_var='dolongitude',
                        lat_var='dolatitude',
                        locid_var='dolocationid'),
                meta=('dolocationid', int64))
            df['pulocationid'] = df.map_partitions(
                partial(add_cab_zone,
                        taxi_zone_df=taxi_zones_df,
                        lon_var='pulongitude',
                        lat_var='pulatitude',
                        locid_var='pulocationid'),
                meta=('pulocationid', int64))
            del taxi_zones_df

        df = df[['pudatetime', 'dodatetime', 'passengers', 'distance',
                 'dolocationid', 'pulocationid']]

        dd.to_parquet(df=df,
                      path=s3_out_url,
                      engine='fastparquet',
                      compute=True,
                      compression='GZIP',
                      storage_options=s3_options)
        del df

    except Exception as err:
        print('error in clean_cabs_at_path %s' % str(err))
        raise err
    else:
        return True
def test_hive_partitioned_data(tmpdir, cpu):
    # Initial timeseries dataset (in cpu memory).
    # Round the full "timestamp" to the day for partitioning.
    ddf = dask.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-03",
        freq="600s",
        partition_freq="6h",
        seed=42,
    ).reset_index()
    ddf["timestamp"] = ddf["timestamp"].dt.round("D").dt.day
    ds = nvt.Dataset(ddf, engine="parquet")

    # Write the dataset to disk
    path = str(tmpdir)
    partition_keys = ["timestamp", "name"]
    ds.to_parquet(path, partition_on=partition_keys)

    # Make sure the directory structure is hive-like
    df_expect = ddf.compute()
    df_expect = df_expect.sort_values(["id", "x", "y"]).reset_index(drop=True)
    timestamp_check = df_expect["timestamp"].iloc[0]
    name_check = df_expect["name"].iloc[0]
    assert glob.glob(
        os.path.join(
            path,
            f"timestamp={timestamp_check}/name={name_check}/*",
        )
    )

    # Read back with dask.dataframe and check the data
    df_check = dd.read_parquet(path).compute()
    df_check["name"] = df_check["name"].astype("object")
    df_check["timestamp"] = df_check["timestamp"].astype("int64")
    df_check = df_check.sort_values(["id", "x", "y"]).reset_index(drop=True)
    for col in df_expect:
        # Order of columns can change after round-trip partitioning
        assert_eq(df_expect[col], df_check[col], check_index=False)

    # Read back with NVT and check the data
    df_check = nvt.Dataset(path, engine="parquet").to_ddf().compute()
    df_check["name"] = df_check["name"].astype("object")
    df_check["timestamp"] = df_check["timestamp"].astype("int64")
    df_check = df_check.sort_values(["id", "x", "y"]).reset_index(drop=True)
    for col in df_expect:
        # Order of columns can change after round-trip partitioning
        assert_eq(df_expect[col], df_check[col], check_index=False)
def to_zarr(input_path: str, output_path: str, dictionary_path: str):
    import dask.dataframe as dd
    import fsspec
    import xarray as xr
    from dask.diagnostics import ProgressBar

    logger.info(f"Converting parquet at {input_path} to {output_path}")

    df = dd.read_parquet(input_path)
    trait_columns = df.columns[df.columns.to_series().str.match(r"^\d+")]
    # 41210_Z942 -> 41210 (UKB field id)
    trait_group_ids = [c.split("_")[0] for c in trait_columns]
    # 41210_Z942 -> Z942 (Data coding value as one-hot encoding in phenotype, e.g.)
    trait_code_ids = ["_".join(c.split("_")[1:]) for c in trait_columns]
    trait_values = df[trait_columns].astype("float").to_dask_array()
    trait_values.compute_chunk_sizes()
    trait_id_to_name = (
        pd.read_csv(
            dictionary_path,
            sep=",",
            usecols=["FieldID", "Field"],
            dtype={"FieldID": str, "Field": str},
        )
        .set_index("FieldID")["Field"]
        .to_dict()
    )
    trait_name = [trait_id_to_name.get(v) for v in trait_group_ids]
    ds = xr.Dataset(
        dict(
            id=("samples", np.asarray(df["userId"], dtype=int)),
            trait=(("samples", "traits"), trait_values),
            trait_id=("traits", np.asarray(trait_columns.values, dtype=str)),
            trait_group_id=("traits", np.array(trait_group_ids, dtype=int)),
            trait_code_id=("traits", np.array(trait_code_ids, dtype=str)),
            trait_name=("traits", np.array(trait_name, dtype=str)),
        )
    )
    # Keep chunks small in trait dimension for faster per-trait processing
    ds["trait"] = ds["trait"].chunk(dict(samples="auto", traits=100))
    ds = ds.rename_vars({v: f"sample_{v}" for v in ds})

    logger.info(f"Saving dataset to {output_path}:\n{ds}")
    with ProgressBar():
        ds.to_zarr(fsspec.get_mapper(output_path), consolidated=True, mode="w")
    logger.info("Done")
def _parquet(self):
    """
    Import parquet file

    :return: dask DataFrame
    """
    return dd.read_parquet(
        path=self.full_path,
        columns=None,
        filters=self.kwargs.get('filters'),
        categories=self.kwargs.get('categories'),
        index=self.kwargs.get('index'),
        storage_options=self.kwargs.get('storage_options'),
        engine='pyarrow',
        gather_statistics=self.kwargs.get('gather_statistics'),
        split_row_groups=self.kwargs.get('split_row_groups'),
        chunksize=self.kwargs.get('chunksize'))
def run(self) -> None:
    self.init()

    if self.dpath.suffix == ".parquet":
        df = dd.read_parquet(self.dpath)
    elif self.dpath.suffix == ".csv":
        df = dd.read_csv(self.dpath)

    times = []
    cols = []
    for col in df.columns:
        then = time()
        self.bench(col)
        times.append(time() - then)
        cols.append(col)

    result = {"name": self.__class__.__name__, "times": times, "columns": cols}
    print(jdumps(result))
def test_arrow_partitioning(tmpdir):
    # Issue #3518
    pytest.importorskip('pyarrow')
    path = str(tmpdir)
    data = {
        'p': np.repeat(np.arange(3), 2).astype(np.int8),
        'b': np.repeat(-1, 6).astype(np.int16),
        'c': np.repeat(-2, 6).astype(np.float32),
        'd': np.repeat(-3, 6).astype(np.float64),
    }
    pdf = pd.DataFrame(data)
    ddf = dd.from_pandas(pdf, npartitions=2)
    ddf.to_parquet(path, engine='pyarrow', partition_on='p')

    ddf = dd.read_parquet(path, engine='pyarrow')
    ddf.astype({'b': np.float32}).compute()