Example #1
def getDataFrame(self, start_time, end_time, npartitions=10):
    # Build a dask bag of records for the requested time window
    bag = self.bag(start_time=start_time,
                   end_time=end_time,
                   npartitions=npartitions)
    # Give the dataframe an explicit schema, then compute to pandas and
    # pivot so each key becomes a column indexed by timestamp
    df = bag.to_dataframe(meta={'ts': int, 'key': str, 'dbl_v': float})
    df = df.compute().pivot_table(index='ts',
                                  columns='key',
                                  values='dbl_v')
    return df
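
A hypothetical call site for this method, where TelemetryStore stands in for the class that provides the bag() factory (the class name and timestamp values are illustrative assumptions, not from the original snippet):

# Hypothetical usage; TelemetryStore is an assumed host class
store = TelemetryStore()
df = store.getDataFrame(start_time=1609459200000,
                        end_time=1609545600000,
                        npartitions=20)
print(df.head())  # one column per key, rows indexed by 'ts'
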
Example #2
for num_workers in [2, 3, 4]:
    test_name = "dsk_filter_cnt_{}_{}".format('threaded', num_workers)
    LOGGER.info('BEGIN: Running test: {}'.format(test_name))

    LOGGER.info('START: Creating dask bag with filter')
    bag = dask.bag.read_avro(
        URLPATH1,
        storage_options={'config_kwargs': {
            'max_pool_connections': 500
        }},
        blocksize=None)
    bag = bag.filter(filter_func)
    LOGGER.info('FINISH: Dask bag created')

    LOGGER.info('START: Creating dask dataframe')
    df = bag.to_dataframe(meta={'payload': 'object', 'metadata': 'object'})
    LOGGER.info('FINISH: Dask dataframe created')

    LOGGER.info('START: Starting count')
    # Count with the threaded scheduler and the requested worker count
    # (scheduler= replaces the get= keyword removed from modern dask)
    cnt = df.payload.count().compute(scheduler='threads',
                                     num_workers=num_workers)
    LOGGER.info('FINISH: Count is %s', cnt)

    LOGGER.info('COMPLETE: Running test: {}'.format(test_name))

test_name = "dsk_filter_cnt_{}_{}".format('synchronous', 1)
LOGGER.info('BEGIN: Running test: {}'.format(test_name))

LOGGER.info('START: Creating dask bag with filter')
bag = dask.bag.read_avro(
    URLPATH1,
    storage_options={'config_kwargs': {
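
The benchmark above relies on URLPATH1 and filter_func, which are defined in a part of the script not shown in this excerpt. A minimal sketch of plausible stand-ins, mirroring the URLPATH constant from Example #3 below, is:

# Illustrative stand-ins only; the real definitions are not in the excerpt
URLPATH1 = "s3://dask-avro-data/application-data/app-100*.avro"

def filter_func(record):
    # Keep records that carry a non-empty payload
    return record.get('payload') is not None
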
Example #3

import sys
import os
import dask.bag

import logger

MODULE_NAME = os.path.basename(sys.modules['__main__'].__file__)
TEST_NAME = os.path.splitext(MODULE_NAME)[0]
LOGGER = logger.get_logger(TEST_NAME)

# Specify some constants
URLPATH = "s3://dask-avro-data/application-data/app-100*.avro"

# Start
LOGGER.info('START: Creating dask bag')
bag = dask.bag.read_avro(URLPATH)
LOGGER.info('FINISH: Dask bag created')

LOGGER.info('START: Creating dask dataframe')
df = bag.to_dataframe()
LOGGER.info('FINISH: Dask dataframe created')

LOGGER.info('START: Starting count')
# len() on a dask series forces an implicit compute
cnt = len(df.payload)
LOGGER.info('FINISH: Count is %s', cnt)
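
An equivalent explicit form, useful when you want to pick the scheduler yourself (and identical in result only when the payload column has no nulls, since count() skips them):

cnt = df.payload.count().compute(scheduler='threads')
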
Example #4

import argparse

import dask.bag
import dask.config
import numpy


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('sqldump',
                        type=str,
                        help='Path to Robinhood MySQL database dump file')
    parser.add_argument('-b', '--blocksize',
                        type=str,
                        default='128MiB',
                        help='Block size for CSV ingest; passed to '
                             'dask.bag.read_text (default: 128MiB)')
    parser.add_argument('--save-csv',
                        type=str,
                        default=None,
                        help='Path to which the full list of (type, size) '
                             'entries should be saved as CSV '
                             '(default: do not save)')
    parser.add_argument('--dask-scheduler',
                        type=str,
                        default='processes',
                        help='Dask scheduler to use (threads, processes, '
                             'or single-threaded; default: processes)')
    parser.add_argument('-o', '--output',
                        type=str,
                        default='histogram.csv',
                        help='Path to CSV file in which inode size '
                             'histograms should be saved '
                             '(default: histogram.csv)')
    args = parser.parse_args(argv)

    # Parse the SQL dump line by line, keeping only the fields we need
    bag = dask.bag.read_text(args.sqldump, blocksize=args.blocksize)
    bag = bag.map(parse_values).flatten()
    bag = bag.map(lambda x: {k: x[k] for k in ('size', 'type')})
    df = bag.to_dataframe(meta={'size': numpy.int64, 'type': str})

    if args.save_csv:
        with dask.config.set(scheduler=args.dask_scheduler):
            df.to_csv(args.save_csv, index=False, single_file=True)
        print("Saved full inode size and type list to %s" % args.save_csv)

    # Bin each inode size, then count inodes per (type, bin) pair
    df['bin'] = df['size'].map(bin_inode_size)

    histograms = df.groupby(['type', 'bin']).count()

    with dask.config.set(scheduler=args.dask_scheduler):
        histograms_df = histograms.compute().unstack('type')

    histograms_df.columns = histograms_df.columns.droplevel()
    max_idx = max(histograms_df.index.values)
    # Reindex over every bin up to and including max_idx so that empty
    # bins appear as zero counts rather than being dropped
    new_index = range(max_idx + 1)
    histograms_df = histograms_df.reindex(new_index).fillna(0).astype(
        numpy.int64)
    histograms_df.index = [
        bin_inode_size_inv(x) for x in histograms_df.index.values
    ]
    histograms_df.index.name = 'size'
    histograms_df.to_csv(args.output)
    print("Saved the following histograms to %s:" % args.output)
    print(histograms_df)
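
The helpers parse_values, bin_inode_size, and bin_inode_size_inv are defined elsewhere in the original script. A plausible sketch of the binning pair, assuming power-of-two size bins (an assumption consistent with the reindex and bin_inode_size_inv usage above, not the original code):

import math

def bin_inode_size(size):
    # Assumed log2 binning: bin 0 holds empty files, bin n holds sizes
    # in [2**(n-1), 2**n)
    return 0 if size <= 0 else 1 + int(math.log2(size))

def bin_inode_size_inv(bin_index):
    # Inverse mapping: the smallest size that falls into the given bin
    return 0 if bin_index == 0 else 2 ** (bin_index - 1)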