import os
from pathlib import Path
from typing import IO, AnyStr, List, Optional, Union

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
# MLRun types; import paths may differ between MLRun versions
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem


def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load parquet dataset into dask cluster
    
    If no cluster is found loads a new one and persist the data to it. It
    shouold not be necessary to create a new cluster when the function
    is run as a 'dask' job.
    
    :param context:         the function context
    :param parquet_url:     url of the parquet file or partitioned dataset as either
                            artifact DataItem, string, or path object (see pandas read_csv)
    :param inc_cols:        include only these columns (very fast)
    :param index_cols:      list of index column names (can be a long-running process)
    :param shards:          number of workers to launch
    :param threads_per:     number of threads per worker
    :param processes:       
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    df = dd.read_parquet(parquet_url)

    if persist and context:
        df = dask_client.persist(df)
        # publish the dataframe under the name given by dask_key
        dask_client.publish_dataset(**{dask_key: df})
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

        print(df.head())
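
# Usage sketch (assumption, not part of the original function): a separate
# process could attach to the cluster via the scheduler file written above and
# fetch the published dataframe. 'scheduler.json' and 'my_dask_dataframe' are
# the default names used by parquet_to_dask.
from dask.distributed import Client

consumer = Client(scheduler_file='scheduler.json')
shared_df = consumer.get_dataset('my_dask_dataframe')
print(shared_df.head())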
Example #2
import os
import numpy as np
import pandas as pd
import cudf
import dask_cudf
import dask.dataframe as dd
from dask.distributed import Client

np.random.seed(12)
df = cudf.DataFrame({'a': list(range(20)),
                     'b': list(reversed(range(20))),
                     'c': list(range(20))})
print(df)

ddf = dask_cudf.from_cudf(df, npartitions=2) 
print(ddf.persist())

client = Client("127.0.0.1:8487")
client.publish_dataset(shared_dataset1=ddf)
print(client.list_datasets())
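
# Sketch (assumption): any other client connected to the same scheduler can pull
# the published collection back by name; head() triggers a small computation.
other_client = Client("127.0.0.1:8487")
shared = other_client.get_dataset("shared_dataset1")
print(shared.head())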

# df = dd.read_csv('s3://dask-data/nyc-taxi/2015/*.csv',
#                  parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
#                  storage_options={'anon': True})

print('read file')
# https://www.gov.uk/guidance/about-the-price-paid-data#download-options
column_names = [
    'id', 'price', 'transfer_date', 'postcode', 'property_type', 'old_new',
    'duration', 'primary_address_obj', 'secondary_address_obj', 'street',
    'locality', 'city_town', 'district', 'county', 'ppd_cat', 'record_stat'
]

### don't set an index when saving in parquet format
# df = dd.read_csv('./propdata/pp-complete.csv', header=None, names=column_names).set_index('id')

df = dd.read_csv('./propdata/pp-complete.csv', header=None, names=column_names)

# print(df.head())

print('persist dataframe')
df = client.persist(df)
#
print('publish dataframe')
# publish_dataset returns None, so don't rebind df to its result
client.publish_dataset(prop_paid=df)

# print('run calc')
# x = client.submit(lambda a: a.shape, df).result()
#
# # print(df)
# print(x)
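
# Sketch (assumption): a separate consumer attached to the same scheduler can
# reuse the published 'prop_paid' dataframe without re-reading the CSV; the row
# count below stands in for the commented-out shape calculation above.
consumer = Client("127.0.0.1:8487")
prop_df = consumer.get_dataset('prop_paid')
print(prop_df.shape[0].compute(), 'rows')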