def getDataFrame(self, start_time, end_time, npartitions=10):
    # Pull the raw records for the window into a bag, convert to a dataframe,
    # then pivot so each key becomes its own column indexed by timestamp.
    bag = self.bag(start_time=start_time, end_time=end_time,
                   npartitions=npartitions)
    df = bag.to_dataframe(meta={'ts': int, 'key': str, 'dbl_v': float})
    df = df.compute().pivot_table(index='ts', columns='key', values='dbl_v')
    return df
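# Hedged, self-contained illustration of the same bag -> dataframe -> pivot
# flow using an in-memory toy bag; the sample records and key names below are
# made up for demonstration, the real data comes from self.bag().
import dask.bag

sample = [
    {'ts': 1000, 'key': 'temperature', 'dbl_v': 21.5},
    {'ts': 1000, 'key': 'humidity', 'dbl_v': 40.0},
    {'ts': 2000, 'key': 'temperature', 'dbl_v': 22.1},
    {'ts': 2000, 'key': 'humidity', 'dbl_v': 41.3},
]
toy_bag = dask.bag.from_sequence(sample, npartitions=2)
toy_df = toy_bag.to_dataframe(meta={'ts': int, 'key': str, 'dbl_v': float})
wide = toy_df.compute().pivot_table(index='ts', columns='key', values='dbl_v')
print(wide)  # one row per timestamp, one column per key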
for num_workers in [2, 3, 4]:
    test_name = "dsk_filter_cnt_{}_{}".format('threaded', num_workers)
    LOGGER.info('BEGIN: Running test: {}'.format(test_name))

    LOGGER.info('START: Creating dask bag with filter')
    bag = dask.bag.read_avro(
        URLPATH1,
        storage_options={'config_kwargs': {'max_pool_connections': 500}},
        blocksize=None)
    bag = bag.filter(filter_func)
    LOGGER.info('FINISH: Dask bag created')

    LOGGER.info('START: Creating dask dataframe')
    df = bag.to_dataframe(meta={'payload': 'object', 'metadata': 'object'})
    LOGGER.info('FINISH: Dask dataframe created')

    LOGGER.info('START: Starting count')
    # compute(get=...) was removed from dask; scheduler= is the supported way
    # to request the threaded scheduler with a worker count.
    cnt = df.payload.count().compute(scheduler='threads',
                                     num_workers=num_workers)
    LOGGER.info('FINISH: Count is %s', cnt)
    LOGGER.info('COMPLETE: Running test: {}'.format(test_name))

test_name = "dsk_filter_cnt_{}_{}".format('synchronous', 1)
LOGGER.info('BEGIN: Running test: {}'.format(test_name))

LOGGER.info('START: Creating dask bag with filter')
bag = dask.bag.read_avro(
    URLPATH1,
    storage_options={'config_kwargs': {'max_pool_connections': 500}},
    blocksize=None)
bag = bag.filter(filter_func)
LOGGER.info('FINISH: Dask bag created')

LOGGER.info('START: Creating dask dataframe')
df = bag.to_dataframe(meta={'payload': 'object', 'metadata': 'object'})
LOGGER.info('FINISH: Dask dataframe created')

LOGGER.info('START: Starting count')
cnt = df.payload.count().compute(scheduler='synchronous')
LOGGER.info('FINISH: Count is %s', cnt)
LOGGER.info('COMPLETE: Running test: {}'.format(test_name))
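# The benchmark above assumes URLPATH1, LOGGER, and filter_func are defined
# earlier in the script. A hedged sketch of that setup; the bucket glob, the
# logger name, and the predicate tested are illustrative assumptions only.
import logging

URLPATH1 = "s3://dask-avro-data/application-data/app-100*.avro"
LOGGER = logging.getLogger("dsk_filter_cnt")


def filter_func(record):
    # Placeholder predicate: keep records that carry a payload; the real
    # benchmark may filter on something else entirely.
    return record.get('payload') is not None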
import sys
import os

import dask.bag

import logger

MODULE_NAME = os.path.basename(sys.modules['__main__'].__file__)
TEST_NAME = os.path.splitext(MODULE_NAME)[0]
LOGGER = logger.get_logger(TEST_NAME)

# Specify some constants
URLPATH = "s3://dask-avro-data/application-data/app-100*.avro"

# Start
LOGGER.info('START: Creating dask bag')
bag = dask.bag.read_avro(URLPATH)
LOGGER.info('FINISH: Dask bag created')

LOGGER.info('START: Creating dask dataframe')
df = bag.to_dataframe()
LOGGER.info('FINISH: Dask dataframe created')

LOGGER.info('START: Starting count')
cnt = len(df.payload)
LOGGER.info('FINISH: Count is %s', cnt)
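# The scripts above import a local logger module and call logger.get_logger().
# A minimal sketch of what that helper might look like, assuming plain stdlib
# logging to stdout; the real module may configure handlers differently.
import logging
import sys


def get_logger(name):
    log = logging.getLogger(name)
    if not log.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        log.addHandler(handler)
        log.setLevel(logging.INFO)
    return log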
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('sqldump', type=str,
                        help='Path to Robinhood MySQL database dump file')
    parser.add_argument(
        '-b', '--blocksize', type=str, default='128MiB',
        help='Block size for CSV ingest; passed to dask.bag.read_text '
             '(default: 128MiB)')
    parser.add_argument(
        '--save-csv', type=str, default=None,
        help='Path to which full list of (type, size) entries should be '
             'saved as CSV (default: do not save)')
    parser.add_argument(
        '--dask-scheduler', type=str, default='processes',
        help='Dask scheduler to use (threads, processes, or single-threaded; '
             'default: processes)')
    parser.add_argument(
        '-o', '--output', type=str, default='histogram.csv',
        help='Path to CSV file in which inode size histograms should be '
             'saved (default: histogram.csv)')
    args = parser.parse_args(argv)

    # Parse the SQL dump into (size, type) records and build a dataframe.
    bag = (dask.bag.read_text(args.sqldump, blocksize=args.blocksize)
           .map(parse_values)
           .flatten()
           .map(lambda x: {k: x[k] for k in ('size', 'type')}))
    df = bag.to_dataframe(meta={'size': numpy.int64, 'type': str})

    if args.save_csv:
        with dask.config.set(scheduler=args.dask_scheduler):
            df.to_csv(args.save_csv, index=False, single_file=True)
        print("Saved full inode size and type list to %s" % args.save_csv)

    # Bin each inode size, then count inodes per (type, bin).
    df['bin'] = df['size'].map(bin_inode_size)
    histograms = df.groupby(['type', 'bin']).count()
    with dask.config.set(scheduler=args.dask_scheduler):
        histograms_df = histograms.compute().unstack('type')
    histograms_df.columns = histograms_df.columns.droplevel()

    # Fill in empty bins (including the top bin) and map bin indices back to
    # byte sizes for the CSV output.
    max_idx = max(histograms_df.index.values)
    new_index = range(0, max_idx + 1)
    histograms_df = histograms_df.reindex(new_index).fillna(0).astype(
        numpy.int64)
    histograms_df.index = [
        bin_inode_size_inv(x) for x in histograms_df.index.values
    ]
    histograms_df.index.name = 'size'

    histograms_df.to_csv(args.output)
    print("Saved the following histograms to %s:" % args.output)
    print(histograms_df)
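# main() above also assumes module-level imports plus parse_values,
# bin_inode_size, and bin_inode_size_inv defined elsewhere in the script.
# Hedged sketch of the entry point and of a plausible binning pair; the log2
# bucketing below is an assumption, not necessarily the original scheme.
import argparse
import math

import dask.bag
import dask.config
import numpy


def bin_inode_size(size):
    # Bucket 0 holds empty inodes; every other bucket covers a power-of-two
    # size range.
    return int(math.log2(size)) + 1 if size > 0 else 0


def bin_inode_size_inv(idx):
    # Lower bound (in bytes) of the size range covered by bucket idx.
    return 2 ** (idx - 1) if idx > 0 else 0


if __name__ == '__main__':
    main()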