def get_accessors():
    return {
        # Get the size in MBs per unit area (valid for tasks objects)
        "size": lambda t, area_km: t.result.size / area_km,
        # Get the duration per unit area (valid for export_run, data_provider_task_records, or export_task_records)
        "duration": lambda o, area_km: parse_duration(getattr(o, "duration", 0)) / area_km,
        # Get the area from the run or use the parent's area
        "area": lambda o, area_km: area_km,
    }
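
# Illustrative usage sketch (not part of the original module): how the accessor lambdas
# returned above might be applied to a completed export task record to produce the
# per-square-kilometer samples collected by the statistics code. `example_task_record`
# and `example_area_km` are hypothetical placeholder names.
#
#   accessors = get_accessors()
#   size_per_km = accessors["size"](example_task_record, example_area_km)          # MB per km^2
#   duration_per_km = accessors["duration"](example_task_record, example_area_km)  # seconds per km^2
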
def with_timedelta(td):
    self.assertEqual(td.seconds, parse_duration(str(td)))
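
# Hedged example of how the helper above could be exercised in a test (it assumes
# parse_duration returns whole seconds for an "H:MM:SS" string, which is exactly what
# the assertion checks):
#
#   from datetime import timedelta
#   with_timedelta(timedelta(minutes=30))          # str(td) == "0:30:00" -> 1800 seconds
#   with_timedelta(timedelta(hours=2, seconds=5))  # str(td) == "2:00:05" -> 7205 seconds
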
def compute_statistics(export_task_records, get_group, tile_grid=get_default_tile_grid(), filename=None):
    """
    :param export_task_records: ExportTaskRecords is a list of all export tasks
    :param get_group: Function to generate a group id given a DataExportProviderTask
    :param tile_grid: Calculate statistics for each tile in the tile grid
    :param filename: Serializes the intermediate data-sample data so it can be shared between different deployments
    :return: A dict with statistics including area, duration, and package size per sq. kilometer
    """
    # Method to pull normalized data values off of the run, provider_task, or provider_task.task objects
    accessors = {
        # Get the size in MBs per unit area (valid for tasks objects)
        'size': lambda t, area_km: t.result.size / area_km,
        # Get the duration per unit area (valid for runs, provider_tasks, or tasks)
        'duration': lambda o, area_km: parse_duration(o.duration) / area_km,
        # Get the area from the run or use the parent's area
        'area': lambda o, area_km: area_km,
    }

    # TODO: Better way for select distinct on etr??
    processed_runs = {}
    processed_dptr = {}
    tid_cache = {}
    geom_cache = {}
    export_task_count = 0
    processed_count = 0
    total_count = export_task_records.count()
    all_stats = {}
    default_stat = {'duration': [], 'area': [], 'size': [], 'mpp': []}

    logger.debug('Prefetching geometry data from all Jobs')
    prefetch_geometry_cache(geom_cache)

    logger.info('Beginning collection of statistics for %d ExportTaskRecords', total_count)
    for etr in export_task_records:
        if processed_count % 500 == 0:
            logger.debug('Processed %d of %d using %d completed', processed_count, total_count, export_task_count)
        processed_count += 1

        if etr.status != "SUCCESS" \
                or etr.export_provider_task.status != "COMPLETED" \
                or etr.export_provider_task.run.status != "COMPLETED" \
                or not is_valid_result(etr.result):
            continue

        export_task_count += 1

        dptr = etr.export_provider_task
        run = etr.export_provider_task.run

        gce = lookup_cache_geometry(run, geom_cache)
        area = gce['area']

        group_name = get_group(dptr)
        global_stats = get_child_entry(all_stats, 'GLOBAL', default_stat)
        group_stats = get_child_entry(all_stats, group_name, default_stat)
        task_stats = get_child_entry(group_stats, etr.name, default_stat)

        if has_tiles(etr.name):
            affected_tile_stats = get_tile_stats(group_stats, tile_grid, gce['bbox'], True, tid_cache, run.id)
        else:
            affected_tile_stats = []

        if run.id not in processed_runs:
            processed_runs[run.id] = True
            collect_samples(run, [global_stats], ['duration', 'area'], accessors, area)

        if dptr.id not in processed_dptr:
            processed_dptr[dptr.id] = True
            collect_samples(dptr, [group_stats], ['duration', 'area'], accessors, area)

        collect_samples(etr, affected_tile_stats + [task_stats], ['duration', 'area', 'size'], accessors, area)

        sz = accessors['size'](etr, area)
        group_stats['size'] += [sz]   # Roll-up into provider_task level
        global_stats['size'] += [sz]  # Roll-up into global level

        # Collect a sample of the megabytes per pixel
        if has_tiles(etr.name):
            try:
                provider = DataProvider.objects.get(name=dptr.name)
                mpp = compute_mpp(provider, gce['bbox'], etr.result.size)
                if len(group_stats['mpp']) < MAX_SAMPLES_PER_TARGET:
                    group_stats['mpp'] += [mpp]
                if len(global_stats['mpp']) < MAX_SAMPLES_PER_TARGET:
                    global_stats['mpp'] += [mpp]
                for ts in affected_tile_stats:
                    if len(ts['mpp']) < MAX_SAMPLES_PER_TARGET:
                        ts['mpp'] += [mpp]
            except ObjectDoesNotExist:
                pass

    logger.info('Computing statistics across %d completed ExportTaskRecords (geom_cache_misses=%d)',
                export_task_count, _dbg_geom_cache_misses)

    # TODO: Merge in any auxiliary sample data?
    if filename is not None:
        all_stats['timestamp'] = datetime.datetime.now()
        with open(filename, 'w') as stats_file:
            json.dump(all_stats, stats_file)

    totals = {
        'run_count': len(processed_runs),
        'data_provider_task_count': len(processed_dptr),
        'export_task_count': export_task_count
    }

    for group_name in all_stats:
        if group_name in ['timestamp']:
            continue

        totals[group_name] = get_summary_stats(all_stats[group_name], ('area', 'duration', 'size', 'mpp'))
        tile_count = 0

        for task_name in all_stats[group_name]:
            if task_name in ['duration', 'area', 'size', 'mpp']:
                # These are properties on the rolled-up statistics
                continue
            elif task_name.startswith('tile_'):
                # Two-level map, index by y then x+z
                y_s = all_stats[group_name][task_name]
                total_ys = {}
                totals[group_name][task_name] = total_ys
                for xz_s in y_s:
                    total_ys[xz_s] = get_summary_stats(y_s[xz_s], ('area', 'duration', 'size', 'mpp'))
                    total_ys[xz_s]['tile_coord'] = y_s[xz_s]['tile_coord']
                    tile_count += 1
            else:
                totals[group_name][task_name] = get_summary_stats(
                    all_stats[group_name][task_name], ('area', 'duration', 'size'))

        totals[group_name]['tile_count'] = tile_count
        logger.info('Generated statistics for %d tiles for group %s', tile_count, group_name)

    return totals
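
# Illustrative sketch (not part of the original module) of one way compute_statistics might be
# invoked. The queryset filter, the select_related path, and the group-by lambda are assumptions
# made for this example; only the keys read from the returned dict ('run_count',
# 'export_task_count') come from the code above.
#
#   export_task_records = ExportTaskRecord.objects.filter(status="SUCCESS") \
#       .select_related("export_provider_task__run")
#   stats = compute_statistics(export_task_records,
#                              get_group=lambda dptr: dptr.name)  # hypothetical grouping key
#   logger.info("Statistics cover %d runs and %d export tasks",
#               stats["run_count"], stats["export_task_count"])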