Example #1
def construct_userday_quadtree(
        initial_bounding_box, raw_photo_csv_table, cache_dir,
        max_points_per_node):
    """Construct a spatial quadtree for fast querying of userday points.

    Parameters:
        initial_bounding_box (list): bounding box for the root node of the
            quadtree, as [x_min, y_min, x_max, y_max]
        raw_photo_csv_table (string): path to the CSV file of raw photo
            point records to index
        cache_dir (string): path to a directory that can be used to cache
            the quadtree files on disk
        max_points_per_node (int): maximum number of points to allow per
            node of the quadtree.  A node that accumulates more points than
            this will subdivide.

    Returns:
        path to the pickle file of the constructed (or cached) quadtree
    """
    LOGGER.info('hashing input file')
    start_time = time.time()
    LOGGER.info(raw_photo_csv_table)
    csv_hash = _hashfile(raw_photo_csv_table, fast_hash=True)

    ooc_qt_picklefilename = os.path.join(cache_dir, csv_hash + '.pickle')
    if os.path.isfile(ooc_qt_picklefilename):
        return ooc_qt_picklefilename
    else:
        LOGGER.info(
            '%s not found, constructing quadtree', ooc_qt_picklefilename)
        LOGGER.info('counting lines in input file')
        total_lines = _file_len(raw_photo_csv_table)
        LOGGER.info('%d lines', total_lines)
        ooc_qt = out_of_core_quadtree.OutOfCoreQuadTree(
            initial_bounding_box, max_points_per_node, GLOBAL_DEPTH,
            cache_dir, pickle_filename=ooc_qt_picklefilename)

        n_parse_processes = max(1, multiprocessing.cpu_count() - 1)

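        # bounded queues so neither the offset producer nor the parser
        # processes can run far ahead of the consumer (simple backpressure)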
        block_offset_size_queue = multiprocessing.Queue(n_parse_processes * 2)
        numpy_array_queue = multiprocessing.Queue(n_parse_processes * 2)

        LOGGER.info('starting parsing processes')
        parse_process_list = []
        for _ in range(n_parse_processes):
            parse_input_csv_process = multiprocessing.Process(
                target=_parse_input_csv, args=(
                    block_offset_size_queue, raw_photo_csv_table,
                    numpy_array_queue))
            parse_input_csv_process.daemon = True
            parse_input_csv_process.start()
            parse_process_list.append(parse_input_csv_process)

        # scan through the file and enqueue line-aligned (offset, size) blocks
        def _populate_offset_queue(block_offset_size_queue):
            csv_file = open(raw_photo_csv_table, 'rb')
            csv_file.readline()  # skip the csv header
            while True:
                start = csv_file.tell()
                csv_file.seek(BLOCKSIZE, 1)
                line = csv_file.readline()  # skip to end of line
                bounds = (start, csv_file.tell() - start)
                block_offset_size_queue.put(bounds)
                if not line:
                    break
            csv_file.close()
            for _ in range(n_parse_processes):
                block_offset_size_queue.put('STOP')

        LOGGER.info('starting offset queue population thread')
        populate_thread = threading.Thread(
            target=_populate_offset_queue, args=(block_offset_size_queue,))
        populate_thread.start()

        LOGGER.info('adding points to the quadtree as they are parsed')
        last_time = time.time()
        start_time = last_time
        n_points = 0

        while True:
            point_array = numpy_array_queue.get()
            if (isinstance(point_array, str) and
                    point_array == 'STOP'):  # count 'n cpu' STOPs
                n_parse_processes -= 1
                if n_parse_processes == 0:
                    break
                continue

            n_points += len(point_array)
            ooc_qt.add_points(point_array, 0, point_array.size)
            current_time = time.time()
            time_elapsed = current_time - last_time
            if time_elapsed > 5.0:
                LOGGER.info(
                    '%.2f%% complete, %d points skipped, %d nodes in qt in '
                    'only %.2fs', n_points * 100.0 / total_lines,
                    n_points - ooc_qt.n_points(), ooc_qt.n_nodes(),
                    current_time-start_time)
                last_time = time.time()

        # save quadtree to disk
        ooc_qt.flush()
        LOGGER.info(
            '100.00%% complete, %d points skipped, %d nodes in qt in '
            'only %.2fs', n_points - ooc_qt.n_points(), ooc_qt.n_nodes(),
            time.time()-start_time)

        quad_tree_shapefile_name = os.path.join(
            cache_dir, 'quad_tree_shape.shp')

        lat_lng_ref = osr.SpatialReference()
        lat_lng_ref.ImportFromEPSG(4326)  # EPSG 4326 is lat/lng
        LOGGER.info("building quadtree shapefile overview")
        build_quadtree_shape(quad_tree_shapefile_name, ooc_qt, lat_lng_ref)

    populate_thread.join()
    for parse_input_csv_process in parse_process_list:
        parse_input_csv_process.join()

    LOGGER.info('took %f seconds', (time.time() - start_time))
    return ooc_qt_picklefilename
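A minimal usage sketch for the function above; the bounding box, file paths, and max_points_per_node value are illustrative assumptions, not values taken from the source:

# hypothetical invocation (paths and constants are assumptions)
import logging

logging.basicConfig(level=logging.INFO)

# a global lat/lng bounding box as [x_min, y_min, x_max, y_max]
global_bounding_box = [-180.0, -90.0, 180.0, 90.0]
qt_pickle_path = construct_userday_quadtree(
    global_bounding_box, 'photos.csv', 'quadtree_cache', 100)
print('quadtree pickled at', qt_pickle_path)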
Example #2
    def _calc_aggregated_points_in_aoi(
            self, aoi_path, workspace_path, date_range, out_vector_filename):
        """Aggregate the PUD in the AOI.

        Parameters:
            aoi_path (string): a path to an OGR compatible vector.
            workspace_path (string): path to a directory where working files
                can be created
            date_range (datetime 2-tuple): a tuple that contains the inclusive
                start and end date
            out_vector_filename (string): base filename of output vector

        Returns:
            a (out_aoi_pud_path, monthly_table_path) tuple: the first is a
            path to an ESRI shapefile copy of `aoi_path` updated with "PUD"
            fields containing the metric per polygon; the second is a path
            to a CSV table of monthly counts per polygon.

        """
        aoi_vector = gdal.OpenEx(aoi_path, gdal.OF_VECTOR)
        # construct the output path for the PUD copy of the AOI vector
        out_aoi_pud_path = os.path.join(workspace_path, out_vector_filename)

        # start the workers now, because they have to load a quadtree and
        # it will take some time
        poly_test_queue = multiprocessing.Queue()
        pud_poly_feature_queue = multiprocessing.Queue(4)
        n_polytest_processes = multiprocessing.cpu_count()

        with open(self.qt_pickle_filename, 'rb') as qt_pickle_file:
            global_qt = pickle.load(qt_pickle_file)
        aoi_layer = aoi_vector.GetLayer()
        aoi_extent = aoi_layer.GetExtent()
        aoi_ref = aoi_layer.GetSpatialRef()

        # coordinate transformation to convert AOI points to and from lat/lng
        lat_lng_ref = osr.SpatialReference()
        lat_lng_ref.ImportFromEPSG(4326)  # EPSG 4326 is lat/lng

        to_lat_trans = osr.CoordinateTransformation(aoi_ref, lat_lng_ref)
        from_lat_trans = osr.CoordinateTransformation(lat_lng_ref, aoi_ref)

        # calculate x_min transformed by comparing the x coordinate at both
        # the top and bottom of the aoi extent and taking the minimum
        x_min_y_min, _, _ = to_lat_trans.TransformPoint(
            aoi_extent[0], aoi_extent[2])
        x_min_y_max, _, _ = to_lat_trans.TransformPoint(
            aoi_extent[0], aoi_extent[3])
        x_min = min(x_min_y_min, x_min_y_max)

        # calculate x_max transformed by comparing the x coordinate at both
        # the top and bottom of the aoi extent and taking the maximum
        x_max_y_min, _, _ = to_lat_trans.TransformPoint(
            aoi_extent[1], aoi_extent[2])
        x_max_y_max, _, _ = to_lat_trans.TransformPoint(
            aoi_extent[1], aoi_extent[3])
        x_max = max(x_max_y_min, x_max_y_max)

        # calculate y_min transformed by comparing the y coordinate at both
        # the left and right sides of the aoi extent and taking the minimum
        _, y_min_x_min, _ = to_lat_trans.TransformPoint(
            aoi_extent[0], aoi_extent[2])
        _, y_min_x_max, _ = to_lat_trans.TransformPoint(
            aoi_extent[1], aoi_extent[2])
        y_min = min(y_min_x_min, y_min_x_max)

        # calculate y_max transformed by comparing the y coordinate at both
        # the left and right sides of the aoi extent and taking the maximum
        _, y_max_x_min, _ = to_lat_trans.TransformPoint(
            aoi_extent[0], aoi_extent[3])
        _, y_max_x_max, _ = to_lat_trans.TransformPoint(
            aoi_extent[1], aoi_extent[3])
        y_max = max(y_max_x_min, y_max_x_max)

        global_b_box = [x_min, y_min, x_max, y_max]

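        # OGR's GetExtent() orders coordinates (x_min, x_max, y_min, y_max);
        # reorder to the [x_min, y_min, x_max, y_max] convention used for
        # the quadtree bounding boxes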
        local_b_box = [
            aoi_extent[0], aoi_extent[2], aoi_extent[1], aoi_extent[3]]

        LOGGER.info(
            'querying global quadtree against %s', str(global_b_box))
        local_points = global_qt.get_intersecting_points_in_bounding_box(
            global_b_box)
        LOGGER.info('found %d points', len(local_points))

        local_qt_cache_dir = os.path.join(workspace_path, 'local_qt')
        local_qt_pickle_filename = os.path.join(
            local_qt_cache_dir, 'local_qt.pickle')
        os.makedirs(local_qt_cache_dir, exist_ok=True)

        LOGGER.info('building local quadtree in bounds %s', str(local_b_box))
        local_qt = out_of_core_quadtree.OutOfCoreQuadTree(
            local_b_box, LOCAL_MAX_POINTS_PER_NODE, LOCAL_DEPTH,
            local_qt_cache_dir, pickle_filename=local_qt_pickle_filename)

        LOGGER.info(
            'building local quadtree with %d points', len(local_points))
        last_time = time.time()
        time_elapsed = None
        for point_list_slice_index in range(
                0, len(local_points), POINTS_TO_ADD_PER_STEP):
            time_elapsed = time.time() - last_time
            last_time = recmodel_client.delay_op(
                last_time, LOGGER_TIME_DELAY, lambda: LOGGER.info(
                    '%d out of %d points added to local_qt so far, and '
                    ' n_nodes in qt %d in %.2fs', local_qt.n_points(),
                    len(local_points), local_qt.n_nodes(), time_elapsed))

            projected_point_list = local_points[
                point_list_slice_index:
                point_list_slice_index+POINTS_TO_ADD_PER_STEP]
            for point_index in range(len(projected_point_list)):
                current_point = projected_point_list[point_index]
                # convert to python float types rather than numpy.float32
                lng_coord = float(current_point[2])
                lat_coord = float(current_point[3])
                x_coord, y_coord, _ = from_lat_trans.TransformPoint(
                    lng_coord, lat_coord)
                projected_point_list[point_index] = (
                    current_point[0], current_point[1], x_coord, y_coord)

            local_qt.add_points(
                projected_point_list, 0, len(projected_point_list))
        LOGGER.info('saving local qt to %s', local_qt_pickle_filename)
        local_qt.flush()

        local_quad_tree_shapefile_name = os.path.join(
            local_qt_cache_dir, 'local_qt.shp')

        build_quadtree_shape(
            local_quad_tree_shapefile_name, local_qt, aoi_ref)

        # Start several testing processes
        polytest_process_list = []
        for _ in range(n_polytest_processes):
            polytest_process = multiprocessing.Process(
                target=_calc_poly_pud, args=(
                    local_qt_pickle_filename, aoi_path, date_range,
                    poly_test_queue, pud_poly_feature_queue))
            polytest_process.daemon = True
            polytest_process.start()
            polytest_process_list.append(polytest_process)

        # Copy the input shapefile into the designated output folder
        LOGGER.info('Creating a copy of the input shapefile')
        driver = gdal.GetDriverByName('ESRI Shapefile')
        pud_aoi_vector = driver.CreateCopy(out_aoi_pud_path, aoi_vector)
        pud_aoi_layer = pud_aoi_vector.GetLayer()

        aoi_layer = None
        gdal.Dataset.__swig_destroy__(aoi_vector)
        aoi_vector = None

        pud_id_suffix_list = [
            'YR_AVG', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG',
            'SEP', 'OCT', 'NOV', 'DEC']
        for field_suffix in pud_id_suffix_list:
            field_id = 'PUD_%s' % field_suffix
            # delete the field if it already exists
            field_index = pud_aoi_layer.FindFieldIndex(str(field_id), 1)
            if field_index >= 0:
                pud_aoi_layer.DeleteField(field_index)
            field_defn = ogr.FieldDefn(field_id, ogr.OFTReal)
            field_defn.SetWidth(24)
            field_defn.SetPrecision(11)
            pud_aoi_layer.CreateField(field_defn)

        last_time = time.time()
        LOGGER.info('testing polygons against quadtree')

        # Load up the test queue with polygons
        for poly_feat in pud_aoi_layer:
            poly_test_queue.put(poly_feat.GetFID())

        # Fill the queue with STOPs for each process
        for _ in range(n_polytest_processes):
            poly_test_queue.put('STOP')

        # Read results until we've seen a 'STOP' from every worker process
        n_processes_alive = n_polytest_processes
        n_poly_tested = 0

        monthly_table_path = os.path.join(workspace_path, 'monthly_table.csv')
        monthly_table = open(monthly_table_path, 'w')
        date_range_year = [
            date.tolist().timetuple().tm_year for date in date_range]
        table_headers = [
            '%s-%s' % (year, month) for year in range(
                int(date_range_year[0]), int(date_range_year[1])+1)
            for month in range(1, 13)]
        monthly_table.write('poly_id,' + ','.join(table_headers) + '\n')

        while True:
            result_tuple = pud_poly_feature_queue.get()
            if result_tuple == 'STOP':  # count one sentinel per worker
                n_processes_alive -= 1
                if n_processes_alive == 0:
                    break
                continue
            n_poly_tested += 1
            last_time = recmodel_client.delay_op(
                last_time, LOGGER_TIME_DELAY, lambda: LOGGER.info(
                    '%.2f%% of polygons tested', 100 * float(n_poly_tested) /
                    pud_aoi_layer.GetFeatureCount()))
            poly_id, pud_list, pud_monthly_set = result_tuple
            poly_feat = pud_aoi_layer.GetFeature(poly_id)
            for pud_index, pud_id in enumerate(pud_id_suffix_list):
                poly_feat.SetField('PUD_%s' % pud_id, pud_list[pud_index])
            pud_aoi_layer.SetFeature(poly_feat)

            line = '%s,' % poly_id
            line += (
                ",".join(['%s' % len(pud_monthly_set[header])
                          for header in table_headers]))
            line += '\n'  # final newline
            monthly_table.write(line)

        monthly_table.close()
        LOGGER.info('done with polygon test, syncing to disk')
        pud_aoi_layer = None
        pud_aoi_vector.FlushCache()
        gdal.Dataset.__swig_destroy__(pud_aoi_vector)
        pud_aoi_vector = None

        for polytest_process in polytest_process_list:
            polytest_process.join()

        LOGGER.info('returning output shapefile and monthly table paths')
        return out_aoi_pud_path, monthly_table_path
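Both examples shut their worker pools down the same way: the producer enqueues one 'STOP' sentinel per worker, and the consumer keeps reading results until it has counted a 'STOP' from every worker. A minimal self-contained sketch of that pattern, assuming nothing from the source beyond the sentinel convention (all names here are illustrative):

import multiprocessing


def _worker(task_queue, result_queue):
    # consume tasks until this worker's 'STOP' sentinel arrives, then
    # forward one 'STOP' so the consumer can count finished workers
    for task in iter(task_queue.get, 'STOP'):
        result_queue.put(task * 2)
    result_queue.put('STOP')


if __name__ == '__main__':
    n_workers = 2
    task_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    workers = [
        multiprocessing.Process(
            target=_worker, args=(task_queue, result_queue))
        for _ in range(n_workers)]
    for worker in workers:
        worker.start()
    for task in range(10):
        task_queue.put(task)
    for _ in range(n_workers):
        task_queue.put('STOP')  # one sentinel per worker
    n_alive = n_workers
    while n_alive > 0:
        result = result_queue.get()
        if result == 'STOP':
            n_alive -= 1
            continue
        print(result)
    for worker in workers:
        worker.join()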