Code example #1
def main_script(plugin_name, start_date, end_date, plugin_dir, data_dir,
                worker_num,
                results_dir='./stats',
                events_limit=0):
    """
    Run the pyAloha stats processing pipeline.

    0. Load worker, aggregator, processor classes from a specified plugin
    1. Run workers (data preprocessors) on aloha files within specified range
    2. Accumulate [and postprocess] worker results with an aggregator instance
    3. Run stats processor and print results to stdout
    """
    aggregator = aggregate_raw_data(
        data_dir, results_dir, plugin_dir, plugin_name,
        start_date, end_date, events_limit,
        worker_num=worker_num
    )

    stats = load_plugin(
        plugin_name, plugin_dir=plugin_dir
    ).StatsProcessor(aggregator)

    logger = multiprocessing.get_logger()

    logger.info('Stats: processing')
    stats.process_stats()

    logger.info('Stats: outputting')
    stats.print_stats()

    logger.info('Stats: done')
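
The variants below are snapshots of the same pipeline entry points from pyaloha's main.py. For orientation, a driver script might invoke main_script() roughly as follows; the plugin name, directories, worker count and date format are illustrative assumptions, not values taken from the source.

# Hypothetical invocation of main_script(); every argument value here is an
# illustrative assumption (the real command-line wiring is not part of these
# snippets).
if __name__ == '__main__':
    main_script(
        plugin_name='my_stats_plugin',   # plugin module for load_plugin() to import
        start_date='20180101',           # date range filter for the aloha data files
        end_date='20180131',             # (date format is an assumption)
        plugin_dir='./plugins',          # directory load_plugin() searches
        data_dir='/mnt/disk1/alohalytics/by_date',
        worker_num=4,                    # size of the multiprocessing pool
        results_dir='./stats',
        events_limit=0
    )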
Code example #2
File: main.py Project: vicpopov/Alohalytics
def run(plugin_name, start_date, end_date, plugin_dir,
        data_dir='/mnt/disk1/alohalytics/by_date',
        results_dir='./stats',
        events_limit=0):
    """
    Pyaloha stats processing pipeline:
0. Load worker, aggregator, processor classes from a specified plugin (script)
1. Run workers (data preprocessors) on alohalytics files within specified range
2. Accumulate [and postprocess] worker results with an aggregator instance
3. Run stats processor and print results to stdout
    """

    aggregator = aggregate_raw_data(
        data_dir, results_dir, plugin_dir, plugin_name,
        start_date, end_date, events_limit
    )

    stats = load_plugin(
        plugin_name, plugin_dir=plugin_dir
    ).StatsProcessor(aggregator)

    logger = multiprocessing.get_logger()

    logger.info('Stats: processing')
    stats.process_stats()

    logger.info('Stats: outputting')
    stats.print_stats()

    logger.info('Stats: done')
Code example #3
def run(plugin_name,
        start_date,
        end_date,
        plugin_dir,
        data_dir='/mnt/disk1/alohalytics/by_date',
        results_dir='./stats',
        events_limit=0):
    """
    Pyaloha stats processing pipeline:
0. Load worker, aggregator, processor classes from a specified plugin (script)
1. Run workers (data preprocessors) on alohalytics files within specified range
2. Accumulate [and postprocess] worker results with an aggregator instance
3. Run stats processor and print results to stdout
    """

    aggregator = aggregate_raw_data(data_dir, results_dir, plugin_dir,
                                    plugin_name, start_date, end_date,
                                    events_limit)

    stats = load_plugin(plugin_name,
                        plugin_dir=plugin_dir).StatsProcessor(aggregator)

    logger = multiprocessing.get_logger()

    logger.info('Stats: processing')
    stats.process_stats()

    logger.info('Stats: outputting')
    stats.print_stats()

    logger.info('Stats: done')
Code example #4
File: main.py Project: vicpopov/Alohalytics
def aggregate_raw_data(
        data_dir, results_dir, plugin_dir, plugin,
        start_date=None, end_date=None,
        events_limit=0,
        worker_num=3 * multiprocessing.cpu_count() // 4):
    """
    Workers-aggregator subpipeline:
0. Load worker, aggregator classes from a specified plugin
1. Run workers in parallel (based on the server stats files)
2. Accumulate results by an aggregator
3. Run aggregator post processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()

    pool = multiprocessing.Pool(worker_num)

    try:
        files = [
            os.path.join(data_dir, fname)
            for fname in os.listdir(data_dir)
            if check_fname(fname, start_date, end_date)
        ]

        items = (
            (plugin_dir, plugin, fpath, events_limit)
            for fpath in files
        )

        aggregator = load_plugin(
            plugin, plugin_dir=plugin_dir
        ).DataAggregator(results_dir)

        logger.info('Aggregator: aggregate')
        for i, results in enumerate(pool.imap(invoke_cmd_worker, items)):
            try:
                aggregator.aggregate(WorkerResults.loads_object(results))
            except Exception:
                logger.error(
                    'Aggregator: processing of %s failed: %s' % (
                        files[i],
                        traceback.format_exc()
                    )
                )

        logger.info('Aggregator: post_aggregate')
        aggregator.post_aggregate(pool)

        logger.info('Aggregator: done')
    finally:
        pool.terminate()
        pool.join()

    return aggregator
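
The pipeline touches the plugin only through the calls visible in these snippets: load_plugin(...).DataAggregator(results_dir), aggregator.aggregate(worker_results), aggregator.post_aggregate(pool), load_plugin(...).StatsProcessor(aggregator), stats.process_stats() and stats.print_stats(). A minimal plugin skeleton consistent with those calls could look like the sketch below; the base classes pyaloha actually provides and the shape of the worker results are assumptions here, shown only to make the contract concrete.

# Minimal plugin skeleton inferred from the calls made by the pipeline code
# above. Attribute names and the shape of `results` are illustrative
# assumptions, not the real pyaloha interfaces.
class DataAggregator(object):
    def __init__(self, results_dir):
        self.results_dir = results_dir
        self.counters = {}

    def aggregate(self, results):
        # Merge one worker's partial counters into the running totals.
        for key, value in getattr(results, 'counters', {}).items():
            self.counters[key] = self.counters.get(key, 0) + value

    def post_aggregate(self, pool=None):
        # Optional heavier post-processing; may use the supplied pool.
        pass


class StatsProcessor(object):
    def __init__(self, aggregator):
        self.aggregator = aggregator
        self.top = []

    def process_stats(self):
        self.top = sorted(self.aggregator.counters.items(),
                          key=lambda kv: kv[1], reverse=True)

    def print_stats(self):
        for key, value in self.top:
            print('%s\t%d' % (key, value))

The worker class mentioned in the docstrings is not sketched here: it runs inside the separate processes started through invoke_cmd_worker(), so this code never calls it directly.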
Code example #5
def aggregate_raw_data(data_dir,
                       results_dir,
                       plugin_dir,
                       plugin,
                       start_date=None,
                       end_date=None,
                       events_limit=0,
                       worker_num=3 * multiprocessing.cpu_count() // 4):
    """
    Workers-aggregator subpipeline:
0. Load worker, aggregator classes from a specified plugin
1. Run workers in parallel (based on the server stats files)
2. Accumulate results by an aggregator
3. Run aggregator post processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()

    pool = multiprocessing.Pool(worker_num)

    try:
        files = [
            os.path.join(data_dir, fname) for fname in os.listdir(data_dir)
            if check_fname(fname, start_date, end_date)
        ]

        items = ((plugin_dir, plugin, fpath, events_limit) for fpath in files)

        aggregator = load_plugin(
            plugin, plugin_dir=plugin_dir).DataAggregator(results_dir)

        logger.info('Aggregator: aggregate')
        for i, results in enumerate(pool.imap(invoke_cmd_worker, items)):
            try:
                aggregator.aggregate(WorkerResults.loads_object(results))
            except Exception:
                logger.error('Aggregator: processing of %s failed: %s' %
                             (files[i], traceback.format_exc()))

        logger.info('Aggregator: post_aggregate')
        aggregator.post_aggregate(pool)

        logger.info('Aggregator: done')
    finally:
        pool.terminate()
        pool.join()

    return aggregator
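
Each variant filters the contents of data_dir with check_fname(fname, start_date, end_date) before building the worker tasks. That helper is not part of these snippets; the stand-in below is only a hypothetical sketch consistent with how it is called (a bare file name plus optional range bounds, returning a boolean), assuming date-stamped file names, which may not match the project's real naming scheme.

import re

# Hypothetical stand-in for check_fname(), shown only to illustrate the call
# contract used above; the YYYYMMDD-in-the-name convention is an assumption.
def check_fname(fname, start_date=None, end_date=None):
    match = re.search(r'(\d{8})', fname)
    if not match:
        return False
    fdate = match.group(1)
    if start_date is not None and fdate < str(start_date):
        return False
    if end_date is not None and fdate > str(end_date):
        return False
    return True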
Code example #6
def aggregate_raw_data(data_dir,
                       results_dir,
                       plugin_dir,
                       plugin,
                       start_date=None,
                       end_date=None,
                       events_limit=0,
                       worker_num=3 * multiprocessing.cpu_count() // 4):
    """
    Workers-aggregator subpipeline:
0. Load worker, aggregator classes from a specified plugin
1. Run workers in parallel (based on the server stats files)
2. Accumulate results by an aggregator
3. Run aggregator post processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()

    pool = multiprocessing.Pool(worker_num)

    try:
        items = ((plugin_dir, plugin, os.path.join(data_dir,
                                                   fname), events_limit)
                 for fname in os.listdir(data_dir)
                 if check_fname(fname, start_date, end_date))

        aggregator = load_plugin(
            plugin, plugin_dir=plugin_dir).DataAggregator(results_dir)

        logger.info('Aggregator: aggregate')
        for results in pool.imap_unordered(invoke_cmd_worker, items):
            aggregator.aggregate(WorkerResults.loads_object(results))

        logger.info('Aggregator: post_aggregate')
        aggregator.post_aggregate(pool)

        logger.info('Aggregator: done')
    finally:
        pool.terminate()
        pool.join()

    return aggregator
Code example #7
File: main.py Project: mapsme/Alohalytics
def aggregate_raw_data(
        data_dir, results_dir, plugin_dir, plugin,
        start_date=None, end_date=None,
        events_limit=0,
        worker_num=3 * multiprocessing.cpu_count() // 4):
    """
    Workers-aggregator subpipeline:
0. Load worker, aggregator classes from a specified plugin
1. Run workers in parallel (based on the server stats files)
2. Accumulate results by an aggregator
3. Run aggregator post processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()

    pool = multiprocessing.Pool(worker_num)

    try:
        items = (
            (plugin_dir, plugin, os.path.join(data_dir, fname), events_limit)
            for fname in os.listdir(data_dir)
            if check_fname(fname, start_date, end_date)
        )

        aggregator = load_plugin(
            plugin, plugin_dir=plugin_dir
        ).DataAggregator(results_dir)

        logger.info('Aggregator: aggregate')
        for results in pool.imap_unordered(invoke_cmd_worker, items):
            aggregator.aggregate(WorkerResults.loads_object(results))

        logger.info('Aggregator: post_aggregate')
        aggregator.post_aggregate(pool)

        logger.info('Aggregator: done')
    finally:
        pool.terminate()
        pool.join()

    return aggregator
Code example #8
File: main.py Project: vicpopov/Alohalytics
def aggregate_raw_data(data_dir,
                       results_dir,
                       plugin_dir,
                       plugin,
                       start_date=None,
                       end_date=None,
                       events_limit=0,
                       worker_num=DEFAULT_WORKER_NUM):
    """Workers-aggregator subpipeline.

    0. Load worker, aggregator classes from a specified plugin
    1. Run workers in parallel (based on the server stats files)
    2. Accumulate results by an aggregator
    3. Run aggregator post processing
    """
    setup_logs()
    logger = multiprocessing.get_logger()

    files = [
        os.path.join(data_dir, fname) for fname in sorted(os.listdir(data_dir))
        if check_fname(fname, start_date, end_date)
    ]

    tasks = [(plugin_dir, plugin, fpath, events_limit) for fpath in files]

    aggregator = load_plugin(plugin,
                             plugin_dir=plugin_dir).DataAggregator(results_dir)

    logger.info('Aggregator: start workers')

    # Create the pools before the main process consumes more memory,
    # and let workers live forever (the default) to avoid spontaneous forking.
    worker_pool = multiprocessing.Pool(worker_num)
    # To be safe against leaks, use separate pools for the work-aggregate
    # phase and for post-aggregation. Also, most post-aggregation tasks are
    # heavily disk-IO bound, so we do not need as many workers there.
    post_aggregator_pool = multiprocessing.Pool(worker_num // 2)
    try:
        engine = worker_pool.imap_unordered
        batch_size = 2 * worker_num
        batch_number = len(tasks) // batch_size + 1
        for batch_no in range(batch_number):
            batch_start = batch_no * batch_size
            batch_tasks = tasks[batch_start:batch_start + batch_size]
            logger.info('Aggregator: batch %d is being aggregated: %s' %
                        (batch_no, batch_tasks))
            for file_name, results in engine(invoke_cmd_worker, batch_tasks):
                try:
                    results = WorkerResults.loads_object(results)
                    logger.info('Aggregator: task %s is being aggregated' %
                                file_name)
                    aggregator.aggregate(results)
                    logger.info('Aggregator: task %s done' % file_name)
                except Exception as e:
                    logger.exception('Aggregator: task %s failed:\n%s',
                                     file_name, e)
    finally:
        worker_pool.terminate()
        worker_pool.join()

    logger.info('Aggregator: post_aggregate')

    try:
        aggregator.post_aggregate(pool=post_aggregator_pool)
    finally:
        post_aggregator_pool.terminate()
        post_aggregator_pool.join()

    logger.info('Aggregator: done')

    return aggregator
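
Example #8 differs from the earlier variants by feeding the pool in explicit batches of 2 * worker_num tasks and by using a second, smaller pool for post-aggregation. The batch arithmetic can be checked in isolation with toy data, as in the sketch below; note that len(tasks) // batch_size + 1 produces one extra, empty batch when the task count is an exact multiple of the batch size, which is harmless because the final slice is empty.

# Standalone check of the batching arithmetic used in example #8; the task
# list is toy data standing in for (plugin_dir, plugin, fpath, limit) tuples.
tasks = list(range(10))
worker_num = 2
batch_size = 2 * worker_num              # 4 tasks in flight per batch
batch_number = len(tasks) // batch_size + 1

for batch_no in range(batch_number):
    batch_start = batch_no * batch_size
    print(batch_no, tasks[batch_start:batch_start + batch_size])
# prints: 0 [0, 1, 2, 3] / 1 [4, 5, 6, 7] / 2 [8, 9]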