def construct_userday_quadtree(
        initial_bounding_box, raw_photo_csv_table, cache_dir,
        max_points_per_node):
    """Construct a spatial quadtree for fast querying of userday points.

    The quadtree is cached on disk under a filename derived from a hash of
    the input table.  If that pickle file already exists it is reused and
    nothing is rebuilt.

    Parameters:
        initial_bounding_box (list of int): bounding box handed to
            ``OutOfCoreQuadTree`` as the extent of the tree.
            NOTE(review): coordinate order is not visible here -- confirm
            against ``OutOfCoreQuadTree``.
        raw_photo_csv_table (string): path to the CSV file of photo
            points.  The first line is treated as a header and skipped.
        cache_dir (string): path to a directory that can be used to cache
            the quadtree files on disk
        max_points_per_node (int): maximum number of points to allow per
            node of the quadtree.  A larger amount will cause the
            quadtree to subdivide.

    Returns:
        Path to the pickle file of the (possibly pre-existing) quadtree.

    """
    LOGGER.info('hashing input file')
    LOGGER.info(raw_photo_csv_table)
    csv_hash = _hashfile(raw_photo_csv_table, fast_hash=True)

    ooc_qt_picklefilename = os.path.join(cache_dir, csv_hash + '.pickle')
    if os.path.isfile(ooc_qt_picklefilename):
        # a quadtree for this exact input was already built; reuse it
        return ooc_qt_picklefilename

    LOGGER.info(
        '%s not found, constructing quadtree', ooc_qt_picklefilename)
    LOGGER.info('counting lines in input file')
    total_lines = _file_len(raw_photo_csv_table)
    LOGGER.info('%d lines', total_lines)
    ooc_qt = out_of_core_quadtree.OutOfCoreQuadTree(
        initial_bounding_box, max_points_per_node, GLOBAL_DEPTH,
        cache_dir, pickle_filename=ooc_qt_picklefilename)

    # leave one core for this process, but always run at least one parser
    n_parse_processes = max(1, multiprocessing.cpu_count() - 1)

    block_offset_size_queue = multiprocessing.Queue(n_parse_processes * 2)
    numpy_array_queue = multiprocessing.Queue(n_parse_processes * 2)

    LOGGER.info('starting parsing processes')
    parse_process_list = []
    for _ in range(n_parse_processes):
        parse_input_csv_process = multiprocessing.Process(
            target=_parse_input_csv, args=(
                block_offset_size_queue, raw_photo_csv_table,
                numpy_array_queue))
        parse_input_csv_process.daemon = True
        parse_input_csv_process.start()
        parse_process_list.append(parse_input_csv_process)

    # rush through file and determine reasonable offsets and blocks
    def _populate_offset_queue(block_offset_size_queue):
        """Queue (offset, size) blocks, then one 'STOP' per parser."""
        with open(raw_photo_csv_table, 'rb') as csv_file:
            csv_file.readline()  # skip the csv header
            while True:
                start = csv_file.tell()
                csv_file.seek(BLOCKSIZE, 1)
                line = csv_file.readline()  # skip to end of line
                block_offset_size_queue.put(
                    (start, csv_file.tell() - start))
                if not line:
                    # nothing left to read past this block
                    break
        for _ in range(n_parse_processes):
            block_offset_size_queue.put('STOP')

    LOGGER.info('starting offset queue population thread')
    populate_thread = threading.Thread(
        target=_populate_offset_queue, args=(block_offset_size_queue,))
    populate_thread.start()

    LOGGER.info("add points to the quadtree as they are ready")
    last_time = time.time()
    start_time = last_time
    n_points = 0

    # A separate countdown keeps `n_parse_processes` constant for the
    # populate thread, which reads it to decide how many STOPs to send.
    n_parsers_running = n_parse_processes
    while n_parsers_running > 0:
        point_array = numpy_array_queue.get()
        if isinstance(point_array, str) and point_array == 'STOP':
            # each parser sends one 'STOP' when it is finished
            n_parsers_running -= 1
            continue

        n_points += len(point_array)
        ooc_qt.add_points(point_array, 0, point_array.size)
        current_time = time.time()
        time_elapsed = current_time - last_time
        if time_elapsed > 5.0:
            LOGGER.info(
                '%.2f%% complete, %d points skipped, %d nodes in qt in '
                'only %.2fs', n_points * 100.0 / total_lines,
                n_points - ooc_qt.n_points(), ooc_qt.n_nodes(),
                current_time-start_time)
            last_time = time.time()

    # save quadtree to disk
    ooc_qt.flush()
    LOGGER.info(
        '100.00%% complete, %d points skipped, %d nodes in qt in '
        'only %.2fs', n_points - ooc_qt.n_points(), ooc_qt.n_nodes(),
        time.time()-start_time)

    quad_tree_shapefile_name = os.path.join(
        cache_dir, 'quad_tree_shape.shp')

    lat_lng_ref = osr.SpatialReference()
    lat_lng_ref.ImportFromEPSG(4326)  # EPSG 4326 is lat/lng
    LOGGER.info("building quadtree shapefile overview")
    build_quadtree_shape(quad_tree_shapefile_name, ooc_qt, lat_lng_ref)

    populate_thread.join()
    # wait on every parser, not only the last one that was started
    for parse_process in parse_process_list:
        parse_process.join()

    LOGGER.info('took %f seconds', (time.time() - start_time))
    return ooc_qt_picklefilename
def _calc_aggregated_points_in_aoi(
        self, aoi_path, workspace_path, date_range, out_vector_filename):
    """Aggregate the PUD in the AOI.

    Parameters:
        aoi_path (string): a path to an OGR compatible vector.
        workspace_path (string): path to a directory where working files
            can be created.  A ``local_qt`` subdirectory is created in
            it with ``os.mkdir``, so that subdirectory must not exist
            yet.
        date_range (datetime 2-tuple): a tuple that contains the
            inclusive start and end date.
            NOTE(review): each element must support
            ``.tolist().timetuple()`` -- presumably numpy.datetime64;
            confirm against the caller.
        out_vector_filename (string): base filename of output vector

    Returns:
        a tuple ``(out_aoi_pud_path, monthly_table_path)`` where
        ``out_aoi_pud_path`` is a path to an ESRI shapefile copy of
        `aoi_path` updated with ``PUD_*`` fields (yearly average and
        one per month) per polygon, and ``monthly_table_path`` is a
        path to a CSV table with one row per polygon and one column
        per year-month in `date_range`.

    """
    aoi_vector = gdal.OpenEx(aoi_path, gdal.OF_VECTOR)
    # the output vector lives in the workspace under the requested name
    out_aoi_pud_path = os.path.join(workspace_path, out_vector_filename)

    # queues shared with the polygon testing worker processes
    poly_test_queue = multiprocessing.Queue()
    pud_poly_feature_queue = multiprocessing.Queue(4)
    n_polytest_processes = multiprocessing.cpu_count()

    # NOTE(review): unpickling is only safe for trusted files; this one
    # is expected to be a quadtree pickle written by this server.
    with open(self.qt_pickle_filename, 'rb') as qt_pickle_file:
        global_qt = pickle.load(qt_pickle_file)

    aoi_layer = aoi_vector.GetLayer()
    aoi_extent = aoi_layer.GetExtent()
    aoi_ref = aoi_layer.GetSpatialRef()

    # coordinate transformation to convert AOI points to and from lat/lng
    lat_lng_ref = osr.SpatialReference()
    lat_lng_ref.ImportFromEPSG(4326)  # EPSG 4326 is lat/lng

    to_lat_trans = osr.CoordinateTransformation(aoi_ref, lat_lng_ref)
    from_lat_trans = osr.CoordinateTransformation(lat_lng_ref, aoi_ref)

    # Transform each corner of the AOI extent once.  aoi_extent is
    # indexed as [0]/[1] for the x bounds and [2]/[3] for the y bounds.
    x_lo_y_lo = to_lat_trans.TransformPoint(aoi_extent[0], aoi_extent[2])
    x_lo_y_hi = to_lat_trans.TransformPoint(aoi_extent[0], aoi_extent[3])
    x_hi_y_lo = to_lat_trans.TransformPoint(aoi_extent[1], aoi_extent[2])
    x_hi_y_hi = to_lat_trans.TransformPoint(aoi_extent[1], aoi_extent[3])

    # an edge of the extent need not stay axis aligned once transformed,
    # so take the outermost coordinate of the two corners on each edge
    x_min = min(x_lo_y_lo[0], x_lo_y_hi[0])
    x_max = max(x_hi_y_lo[0], x_hi_y_hi[0])
    y_min = min(x_lo_y_lo[1], x_hi_y_lo[1])
    y_max = max(x_lo_y_hi[1], x_hi_y_hi[1])

    global_b_box = [x_min, y_min, x_max, y_max]

    local_b_box = [
        aoi_extent[0], aoi_extent[2], aoi_extent[1], aoi_extent[3]]

    LOGGER.info(
        'querying global quadtree against %s', str(global_b_box))
    local_points = global_qt.get_intersecting_points_in_bounding_box(
        global_b_box)
    LOGGER.info('found %d points', len(local_points))

    local_qt_cache_dir = os.path.join(workspace_path, 'local_qt')
    local_qt_pickle_filename = os.path.join(
        local_qt_cache_dir, 'local_qt.pickle')
    os.mkdir(local_qt_cache_dir)

    LOGGER.info('building local quadtree in bounds %s', str(local_b_box))
    local_qt = out_of_core_quadtree.OutOfCoreQuadTree(
        local_b_box, LOCAL_MAX_POINTS_PER_NODE, LOCAL_DEPTH,
        local_qt_cache_dir, pickle_filename=local_qt_pickle_filename)

    LOGGER.info(
        'building local quadtree with %d points', len(local_points))
    last_time = time.time()
    for point_list_slice_index in range(
            0, len(local_points), POINTS_TO_ADD_PER_STEP):
        time_elapsed = time.time() - last_time
        last_time = recmodel_client.delay_op(
            last_time, LOGGER_TIME_DELAY, lambda: LOGGER.info(
                '%d out of %d points added to local_qt so far, and '
                ' n_nodes in qt %d in %.2fs', local_qt.n_points(),
                len(local_points), local_qt.n_nodes(), time_elapsed))

        projected_point_list = local_points[
            point_list_slice_index:
            point_list_slice_index+POINTS_TO_ADD_PER_STEP]
        # reproject every point of this slice in place from lat/lng to
        # the AOI's spatial reference
        for point_index in range(len(projected_point_list)):
            current_point = projected_point_list[point_index]
            # convert to python float types rather than numpy.float32
            lng_coord = float(current_point[2])
            lat_coord = float(current_point[3])
            x_coord, y_coord, _ = from_lat_trans.TransformPoint(
                lng_coord, lat_coord)
            projected_point_list[point_index] = (
                current_point[0], current_point[1], x_coord, y_coord)

        local_qt.add_points(
            projected_point_list, 0, len(projected_point_list))
    LOGGER.info('saving local qt to %s', local_qt_pickle_filename)
    local_qt.flush()

    local_quad_tree_shapefile_name = os.path.join(
        local_qt_cache_dir, 'local_qt.shp')

    build_quadtree_shape(
        local_quad_tree_shapefile_name, local_qt, aoi_ref)

    # Start several testing processes
    polytest_process_list = []
    for _ in range(n_polytest_processes):
        polytest_process = multiprocessing.Process(
            target=_calc_poly_pud, args=(
                local_qt_pickle_filename, aoi_path, date_range,
                poly_test_queue, pud_poly_feature_queue))
        polytest_process.daemon = True
        polytest_process.start()
        polytest_process_list.append(polytest_process)

    # Copy the input shapefile into the designated output folder
    LOGGER.info('Creating a copy of the input shapefile')
    driver = gdal.GetDriverByName('ESRI Shapefile')
    pud_aoi_vector = driver.CreateCopy(out_aoi_pud_path, aoi_vector)
    pud_aoi_layer = pud_aoi_vector.GetLayer()

    # release the input vector; only the copy is modified from here on
    aoi_layer = None
    gdal.Dataset.__swig_destroy__(aoi_vector)
    aoi_vector = None

    pud_id_suffix_list = [
        'YR_AVG', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG',
        'SEP', 'OCT', 'NOV', 'DEC']
    for field_suffix in pud_id_suffix_list:
        field_id = 'PUD_%s' % field_suffix
        # delete the field if it already exists
        field_index = pud_aoi_layer.FindFieldIndex(str(field_id), 1)
        if field_index >= 0:
            pud_aoi_layer.DeleteField(field_index)
        field_defn = ogr.FieldDefn(field_id, ogr.OFTReal)
        field_defn.SetWidth(24)
        field_defn.SetPrecision(11)
        pud_aoi_layer.CreateField(field_defn)

    last_time = time.time()
    LOGGER.info('testing polygons against quadtree')

    # Load up the test queue with polygons
    for poly_feat in pud_aoi_layer:
        poly_test_queue.put(poly_feat.GetFID())

    # Fill the queue with STOPs for each process
    for _ in range(n_polytest_processes):
        poly_test_queue.put('STOP')

    # Read the result until every worker has reported its 'STOP'
    n_processes_alive = n_polytest_processes
    n_poly_tested = 0

    monthly_table_path = os.path.join(workspace_path, 'monthly_table.csv')
    date_range_year = [
        range_date.tolist().timetuple().tm_year
        for range_date in date_range]
    table_headers = [
        '%s-%s' % (year, month) for year in range(
            int(date_range_year[0]), int(date_range_year[1])+1)
        for month in range(1, 13)]

    # the context manager guarantees the table is flushed and closed
    # before its path is handed back to the caller
    with open(monthly_table_path, 'w') as monthly_table:
        monthly_table.write('poly_id,' + ','.join(table_headers) + '\n')

        while n_processes_alive > 0:
            result_tuple = pud_poly_feature_queue.get()
            if result_tuple == 'STOP':
                n_processes_alive -= 1
                continue
            # only real results count towards the reported progress
            n_poly_tested += 1
            last_time = recmodel_client.delay_op(
                last_time, LOGGER_TIME_DELAY, lambda: LOGGER.info(
                    '%.2f%% of polygons tested', 100 * float(n_poly_tested) /
                    pud_aoi_layer.GetFeatureCount()))
            poly_id, pud_list, pud_monthly_set = result_tuple
            poly_feat = pud_aoi_layer.GetFeature(poly_id)
            for pud_index, pud_id in enumerate(pud_id_suffix_list):
                poly_feat.SetField('PUD_%s' % pud_id, pud_list[pud_index])
            pud_aoi_layer.SetFeature(poly_feat)

            # one row per polygon: its id, then a count per year-month
            monthly_counts = ",".join(
                '%s' % len(pud_monthly_set[header])
                for header in table_headers)
            monthly_table.write('%s,%s\n' % (poly_id, monthly_counts))

    LOGGER.info('done with polygon test, syncing to disk')
    pud_aoi_layer = None
    pud_aoi_vector.FlushCache()
    gdal.Dataset.__swig_destroy__(pud_aoi_vector)
    pud_aoi_vector = None

    for polytest_process in polytest_process_list:
        polytest_process.join()

    LOGGER.info('returning out shapefile path')
    return out_aoi_pud_path, monthly_table_path