Code example #1 (score: 0)
File: generator.py — Project: zta6/eventkit-cloud
def get_accessors():
    """Return a mapping of metric name to a normalizing accessor.

    Each accessor takes ``(obj, area_km)`` and produces a per-square-km
    value: ``'size'`` reads ``obj.result.size`` (task objects),
    ``'duration'`` parses ``obj.duration`` (defaulting to 0 when the
    attribute is absent), and ``'area'`` simply echoes the supplied area.
    """
    def size_per_km(task, area_km):
        # Size in MBs per unit area (valid for task objects).
        return task.result.size / area_km

    def duration_per_km(obj, area_km):
        # Duration per unit area (valid for export_run,
        # data_provider_task_records, or export_task_records).
        return parse_duration(getattr(obj, "duration", 0)) / area_km

    def area_of(obj, area_km):
        # Area from the run, or the parent's area.
        return area_km

    return {
        "size": size_per_km,
        "duration": duration_per_km,
        "area": area_of,
    }
Code example #2 (score: 0)
 # Assertion helper: checks that parse_duration round-trips the str() form
 # of a timedelta back to its .seconds value.
 # NOTE(review): `self` comes from an enclosing test-case scope not visible
 # here; also compares against td.seconds (not total_seconds()), so this
 # presumably only sees deltas shorter than one day — confirm with callers.
 def with_timedelta(td):
     self.assertEqual(td.seconds, parse_duration(str(td)))
Code example #3 (score: 0)
def compute_statistics(export_task_records,
                       get_group,
                       tile_grid=None,
                       filename=None):
    """
    :param export_task_records: ExporTaskRecords is a list of all export tasks
    :param get_group: Function to generate a group id given a DataExportProviderTask
    :param tile_grid: Calculate statistics for each tile in the tile grid
        (defaults to ``get_default_tile_grid()``, resolved per call)
    :param filename: Serializes the intermediate data-sample data so it can be shared btw different deployments
    :return: A dict with statistics including area, duration, and package size per sq. kilometer
    """
    # Resolve the default lazily: a call placed in the signature would be
    # evaluated once at import time and the same grid shared by every call.
    if tile_grid is None:
        tile_grid = get_default_tile_grid()

    # Method to pull normalized data values off of the run, provider_task, or provider_task.task objects
    accessors = {
        # Get the size in MBs per unit area (valid for tasks objects)
        'size': lambda t, area_km: t.result.size / area_km,
        # Get the duration per unit area (valid for runs, provider_tasks, or tasks)
        'duration': lambda o, area_km: parse_duration(o.duration) / area_km,
        # Get the area from the run or use the parent's area
        'area': lambda o, area_km: area_km,
    }

    # TODO: Better way for select distinct on etr??
    processed_runs = {}   # run.id -> True once sampled at the global level
    processed_dptr = {}   # dptr.id -> True once sampled at the group level
    tid_cache = {}
    geom_cache = {}
    export_task_count = 0   # records that passed the status/validity filter
    processed_count = 0     # all records visited (for progress logging)
    total_count = export_task_records.count()
    all_stats = {}
    # Template for a fresh stats entry; get_child_entry is presumably
    # responsible for copying it rather than aliasing — confirm upstream.
    default_stat = {'duration': [], 'area': [], 'size': [], 'mpp': []}

    logger.debug('Prefetching geometry data from all Jobs')
    prefetch_geometry_cache(geom_cache)

    logger.info('Beginning collection of statistics for %d ExportTaskRecords',
                total_count)
    for etr in export_task_records:
        if processed_count % 500 == 0:
            logger.debug('Processed %d of %d using %d completed',
                         processed_count, total_count, export_task_count)
        processed_count += 1

        # Only fully-successful task/provider/run triples with a usable
        # result contribute samples.
        if etr.status != "SUCCESS" \
                or etr.export_provider_task.status != "COMPLETED" \
                or etr.export_provider_task.run.status != "COMPLETED" \
                or not is_valid_result(etr.result):
            continue

        export_task_count += 1

        dptr = etr.export_provider_task
        run = etr.export_provider_task.run

        gce = lookup_cache_geometry(run, geom_cache)
        area = gce['area']

        group_name = get_group(dptr)
        global_stats = get_child_entry(all_stats, 'GLOBAL', default_stat)
        group_stats = get_child_entry(all_stats, group_name, default_stat)
        task_stats = get_child_entry(group_stats, etr.name, default_stat)

        if has_tiles(etr.name):
            affected_tile_stats = get_tile_stats(group_stats, tile_grid,
                                                 gce['bbox'], True, tid_cache,
                                                 run.id)
        else:
            affected_tile_stats = []

        # Sample each run / provider-task only once, regardless of how many
        # of its export tasks appear in the record set.
        if run.id not in processed_runs:
            processed_runs[run.id] = True
            collect_samples(run, [global_stats], ['duration', 'area'],
                            accessors, area)
        if dptr.id not in processed_dptr:
            processed_dptr[dptr.id] = True
            collect_samples(dptr, [group_stats], ['duration', 'area'],
                            accessors, area)

        collect_samples(etr, affected_tile_stats + [task_stats],
                        ['duration', 'area', 'size'], accessors, area)

        sz = accessors['size'](etr, area)
        group_stats['size'] += [sz]  # Roll-up into provider_task level
        global_stats['size'] += [sz]  # Roll-up into global level

        # Collect a sample of the megabytes per pixel
        if has_tiles(etr.name):
            try:
                provider = DataProvider.objects.get(name=dptr.name)
                mpp = compute_mpp(provider, gce['bbox'], etr.result.size)
                if len(group_stats['mpp']) < MAX_SAMPLES_PER_TARGET:
                    group_stats['mpp'] += [mpp]
                if len(global_stats['mpp']) < MAX_SAMPLES_PER_TARGET:
                    global_stats['mpp'] += [mpp]
                for ts in affected_tile_stats:
                    if len(ts['mpp']) < MAX_SAMPLES_PER_TARGET:
                        ts['mpp'] += [mpp]

            except ObjectDoesNotExist:
                # No matching DataProvider; mpp simply isn't sampled here.
                pass

    logger.info(
        'Computing statistics across %d completed ExportTaskRecords (geom_cache_misses=%d)',
        export_task_count, _dbg_geom_cache_misses)

    # TODO: Merge in any auxiliary sample data?

    if filename is not None:
        # isoformat() keeps the payload JSON-serializable; a raw datetime
        # would make json.dump raise TypeError.
        all_stats['timestamp'] = datetime.datetime.now().isoformat()
        with open(filename, 'w') as stats_file:  # avoid shadowing the os module
            json.dump(all_stats, stats_file)

    totals = {
        'run_count': len(processed_runs),
        'data_provider_task_count': len(processed_dptr),
        'export_task_count': export_task_count
    }

    for group_name in all_stats:
        if group_name in ['timestamp']:
            continue

        totals[group_name] = get_summary_stats(
            all_stats[group_name], ('area', 'duration', 'size', 'mpp'))
        tile_count = 0

        for task_name in all_stats[group_name]:
            if task_name in ['duration', 'area', 'size', 'mpp']:
                # These are properties on the roll'ed up statistics
                continue
            elif task_name.startswith('tile_'):
                # Two-level map, index by y then x+z
                y_s = all_stats[group_name][task_name]
                total_ys = {}
                totals[group_name][task_name] = total_ys
                for xz_s in y_s:
                    total_ys[xz_s] = get_summary_stats(
                        y_s[xz_s], ('area', 'duration', 'size', 'mpp'))
                    total_ys[xz_s]['tile_coord'] = y_s[xz_s]['tile_coord']
                    tile_count += 1
            else:
                totals[group_name][task_name] = get_summary_stats(
                    all_stats[group_name][task_name],
                    ('area', 'duration', 'size'))

        totals[group_name]['tile_count'] = tile_count
        logger.info('Generated statistics for %d tiles for group %s',
                    tile_count, group_name)

    return totals