Example #1
class StreetIndex(object):
    def __init__(self, streets_file):
        self.idx = Index()
        with open(streets_file) as f:
            for line in f:
                street = json.loads(line)
                street_id = street['properties']['id']
                street_shape = asShape(street['geometry'])
                for i in range(len(street_shape.geoms)):
                    seg_id = self.encode_seg_id(i, street_id)
                    self.idx.insert(seg_id, street_shape.geoms[i].coords[0])
                    self.idx.insert(-seg_id, street_shape.geoms[i].coords[-1])

        self.bb_idx = Index()
        with open(streets_file) as f:
            for line in f:
                street = json.loads(line)
                street_id = int(street['properties']['id'])
                street_shape = asShape(street['geometry'])
                self.bb_idx.insert(street_id, list(street_shape.bounds))

    def encode_seg_id(self, i, street_id):
        return i * 1000000 + int(street_id)

    def decode_seg_id(self, seg_id):
        # invert encode_seg_id: use floor division and subtract the
        # segment component to recover the street id
        i = abs(seg_id) // 1000000
        return abs(seg_id) - i * 1000000

    def find_nearest_street(self, shape):
        shape = asShape(shape['geometry'])
        shape_type = shape.geom_type
        if shape_type == 'Polygon' or shape_type == 'MultiPolygon':
            ref_point = (
                float(shape.centroid.coords.xy[0][0]),
                float(shape.centroid.coords.xy[1][0])
            )
        else:
            ref_point = (
                float(shape.coords.xy[0][0]),
                float(shape.coords.xy[1][0])
            )
        street_id = list(self.bb_idx.nearest(ref_point))[0]
        return str(street_id)

    def find_connected_street(self, street):
        street_id = int(street['properties']['id'])
        street_shape = asShape(street['geometry'])
        street_start = street_shape.geoms[0].coords[0]
        street_end = street_shape.geoms[-1].coords[-1]
        seg_ids = list(self.idx.intersection(street_start))
        seg_ids += list(self.idx.intersection(street_end))
        street_ids = set(map(self.decode_seg_id, seg_ids))
        if street_id in street_ids:
            street_ids.remove(street_id)
        return street_ids
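A minimal usage sketch (not part of the original example), assuming the snippet's implied imports (json, rtree.index.Index, shapely.geometry.asShape) and a hypothetical newline-delimited GeoJSON file of MultiLineString streets:

# Hypothetical driver for StreetIndex; 'streets.geojsonl' and its schema are assumptions.
import json

index = StreetIndex('streets.geojsonl')
with open('streets.geojsonl') as f:
    street = json.loads(f.readline())
print(index.find_nearest_street(street))    # id of the street nearest the first feature
print(index.find_connected_street(street))  # ids of streets sharing an endpoint with it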
Example #2
def demo_delete():
    seed = 1  # Seed for random points

    countries = get_countries()

    country_id_to_remove = 170  # United States of America
    country_uuids_to_remove = []  # Polygons' ids to remove from the index

    properties = Property()
    # properties.writethrough = True
    # properties.leaf_capacity = 1000
    # properties.fill_factor = 0.5
    index = Index(properties=properties)

    points_per_polygon = 1
    points = []

    # Inserts countries data to the index
    for i, (country_name, geometry) in enumerate(countries):
        for polygon in get_polygons(geometry):
            temp_uuid = uuid.uuid1().int
            index.insert(temp_uuid, polygon.bounds, country_name)

            if i == country_id_to_remove:
                # Saves index ids of the polygon to be removed later
                country_uuids_to_remove.append(temp_uuid)

            # Generates random points in every polygon and saves them
            random_points = gen_random_point(points_per_polygon, polygon, seed)
            points.append((country_name, random_points))

    # Checks every generated point has matches
    for (country_name, country_points) in points:
        for point in country_points:
            hits = list(index.intersection(point.bounds, objects=True))
            assert any(hit.object == country_name for hit in hits)

    # Remove geometry
    geometry = countries[country_id_to_remove][1]
    for i, polygon in enumerate(get_polygons(geometry)):
        index.delete(country_uuids_to_remove[i], polygon.bounds)

    points_missing = []

    # Checks (again) if every generated point has matches
    for (country_name, country_points) in points:
        for point in country_points:
            hits = list(index.intersection(point.bounds, objects=True))
            # Save any point without matches
            if not any(hit.object == country_name for hit in hits):
                points_missing.append(str(point) + " - " + country_name)

    # Print missing points
    for point in points_missing:
        print(point)
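For reference, one plausible shape for the helpers this demo assumes (gen_random_point and the other helpers are not defined in the snippet); a hedged sketch using rejection sampling with shapely:

# Hypothetical helper, not the original: sample uniform points inside a polygon.
import random
from shapely.geometry import Point

def gen_random_point(count, polygon, seed):
    random.seed(seed)
    minx, miny, maxx, maxy = polygon.bounds
    points = []
    while len(points) < count:
        p = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if polygon.contains(p):
            points.append(p)
    return points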
Example #3
    def test_tpr(self):
        # TODO : this freezes forever on some windows cloud builds
        if os.name == 'nt':
            return

        # Track the live Cartesian objects for brute-force comparison
        objects = dict()
        tpr_tree = Index(properties=Property(type=RT_TPRTree))

        for operation, t_now, object_ in data_generator():
            if operation == "INSERT":
                tpr_tree.insert(object_.id, object_.get_coordinates())
                objects[object_.id] = object_
            elif operation == "DELETE":
                tpr_tree.delete(object_.id, object_.get_coordinates(t_now))
                del objects[object_.id]
            elif operation == "QUERY":
                tree_intersect = set(
                    tpr_tree.intersection(object_.get_coordinates()))

                # Brute intersect
                brute_intersect = set()
                for tree_object in objects.values():
                    x_low, y_low = tree_object.getXY(object_.start_time)
                    x_high, y_high = tree_object.getXY(object_.end_time)

                    if intersects(
                            x_low, y_low, x_high, y_high,  # Line
                            object_.x, object_.y, object_.dx, object_.dy):  # Rect
                        brute_intersect.add(tree_object.id)

                # Tree should match brute force approach
                assert tree_intersect == brute_intersect
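For orientation, a standalone sketch of the TPR-tree coordinate convention this test relies on, assuming the ((bounds), (velocities), time) form from rtree's TPR-tree support; treat the exact tuples as illustrative:

from rtree.index import Index, Property, RT_TPRTree

tpr = Index(properties=Property(type=RT_TPRTree))
# A point at (0, 0) moving with velocity (1, 1), inserted at time 0
tpr.insert(1, ((0, 0, 0, 0), (1, 1, 1, 1), 0))
# Query a static box over the time interval [2, 3]
hits = list(tpr.intersection(((1, 1, 4, 4), (0, 0, 0, 0), (2, 3))))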
Example #4
def compute_indicatormatrix(orig,
                            dest,
                            orig_proj='latlong',
                            dest_proj='latlong'):
    """
    Compute the indicatormatrix

    The indicatormatrix I[i,j] is a sparse representation of the ratio
    of the area in orig[j] lying in dest[i], where orig and dest are
    collections of polygons, i.e.

    A value of I[i,j] = 1 indicates that the shape orig[j] is fully
    contained in shape dest[i].

    Note that the polygons must be in the same crs.

    Parameters
    ----------
    orig : Collection of shapely polygons
    dest : Collection of shapely polygons

    Returns
    -------
    I : sp.sparse.lil_matrix
      Indicatormatrix
    """

    dest = reproject_shapes(dest, dest_proj, orig_proj)
    indicator = sp.sparse.lil_matrix((len(dest), len(orig)), dtype=float)

    try:
        from rtree.index import Index

        idx = Index()
        for j, o in enumerate(orig):
            idx.insert(j, o.bounds)

        for i, d in enumerate(dest):
            for j in idx.intersection(d.bounds):
                o = orig[j]
                area = d.intersection(o).area
                indicator[i, j] = area / o.area

    except ImportError:
        logger.warning(
            "Rtree is not available. Falling back to slower algorithm.")

        dest_prepped = list(map(prep, dest))

        for i, j in product(range(len(dest)), range(len(orig))):
            if dest_prepped[i].intersects(orig[j]):
                area = dest[i].intersection(orig[j]).area
                indicator[i, j] = area / orig[j].area

    return indicator
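A small usage sketch, assuming the surrounding module (reproject_shapes, sp, np, logger) is importable and both collections already share a CRS so the reprojection is a no-op; the unit squares are illustrative:

from shapely.geometry import box

orig = [box(0, 0, 1, 1), box(1, 0, 2, 1)]
dest = [box(0, 0, 2, 1)]  # covers both orig cells
I = compute_indicatormatrix(orig, dest)
print(I.toarray())        # [[1. 1.]] -- each orig cell lies fully inside dest[0]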
Example #5
class Mesh2D:
    """!
    The general representation of mesh in Serafin 2D.
    The basis for interpolation, volume calculations etc.
    """
    def __init__(self,
                 input_header,
                 construct_index=False,
                 iter_pbar=lambda x: x):
        """!
        @param input_header <slf.Serafin.SerafinHeader>: input Serafin header
        @param construct_index <bool>: perform the index construction
        @param iter_pbar: iterable progress bar
        """
        self.x = input_header.x[:input_header.nb_nodes_2d]
        self.y = input_header.y[:input_header.nb_nodes_2d]
        self.ikle = input_header.ikle_2d - 1  # back to 0-based indexing
        self.triangles = {}
        self.nb_points = self.x.shape[0]
        self.nb_triangles = self.ikle.shape[0]
        self.points = np.stack([self.x, self.y], axis=1)
        if not construct_index:
            self.index = Index()
        else:
            self._construct_index(iter_pbar)

    def _construct_index(self, iter_pbar):
        """!
        Separate the index construction from the constructor, allowing a GUI override
        @param iter_pbar: iterable progress bar
        """
        self.index = Index()
        for i, j, k in iter_pbar(self.ikle, unit='elements'):
            t = Polygon([self.points[i], self.points[j], self.points[k]])
            self.triangles[i, j, k] = t
            self.index.insert(i, t.bounds, obj=(i, j, k))

    def get_intersecting_elements(self, bounding_box):
        """!
        @brief Return the triangles in the mesh intersecting the bounding box
        @param bounding_box <tuple>: (left, bottom, right, top) of a 2d geometrical object
        @return <[tuple]>: The list of triangles (i,j,k) intersecting the bounding box
        """
        return list(self.index.intersection(bounding_box, objects='raw'))
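For reference, a minimal standalone illustration of the objects='raw' convention the class relies on: intersection() then yields the stored obj payloads directly rather than Item wrappers.

from rtree.index import Index

idx = Index()
idx.insert(0, (0.0, 0.0, 1.0, 1.0), obj=(0, 1, 2))
# objects='raw' yields the stored obj payloads directly
print(list(idx.intersection((0.5, 0.5, 2.0, 2.0), objects='raw')))  # [(0, 1, 2)]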
Example #6
def local_search(points, bounding_box, iterations):
    labeled_points = [p for p in points if p.text]

    items = []
    items.extend([p.label for p in labeled_points])
    items.extend(points)
    items.extend(bounding_box.border_config)

    idx = Index()
    for i, item in enumerate(items):
        item.index = i
        idx.insert(item.index, item.box)

    for i in range(iterations):
        for lp in labeled_points:
            best_candidate = None
            min_penalty = None
            for lc1 in lp.label_candidates:
                penalty = POSITION_WEIGHT * lc1.position

                # Check overlap with other labels and points
                intersecting_item_ids = idx.intersection(lc1.box)
                for item_id in intersecting_item_ids:
                    item = items[item_id]
                    if hasattr(item, "point") and lc1.point == item.point:
                        continue
                    penalty += item.overlap(lc1)

                if min_penalty is None or penalty < min_penalty:
                    min_penalty = penalty
                    best_candidate = lc1

            # Remove the old label from the index
            idx.delete(lp.label.index, lp.label.box)

            # Select the new label
            best_candidate.select()

            # Add the new label to the index and item list, recording its
            # index so it can be deleted again on a later iteration
            lp.label.index = len(items)
            idx.insert(lp.label.index, lp.label.box)
            items.append(lp.label)
Example #7
    def build_cache(self):
        label_candidates = []
        for p in self.points:
            label_candidates.extend(p.label_candidates)
        items = []
        items.extend(label_candidates)
        items.extend(self.points)
        items.extend(self.bounding_box.border_config)

        idx = Index()
        for i, item in enumerate(items):
            item.index = i
            idx.insert(i, item.box)

        for lc in label_candidates:
            lc.penalty = POSITION_WEIGHT * lc.position
            lc.label_penalties = [0 for i in range(len(label_candidates))]
            intersecting_item_ids = idx.intersection(lc.box)
            bbox_counted = False

            for item_id in intersecting_item_ids:
                item = items[item_id]

                if item == lc or item == lc.point:
                    continue

                if isinstance(item, Label):
                    if lc.point == item.point:
                        continue
                    else:
                        lc.label_penalties[item.index] = item.overlap(lc)
                        continue

                if isinstance(item, BoundingBoxBorder):
                    if bbox_counted:
                        continue
                    bbox_counted = True

                lc.penalty += item.overlap(lc)
Example #8
class Domain(object):
    '''
    A class used to facilitate computational geometry operations on a
    domain defined by a closed collection of simplices (e.g., line
    segments or triangular facets). This class can optionally also
    make use of an R-tree which can substantially reduce the
    computational complexity of some operations.

    Parameters
    ----------
    vertices : (n, d) float array
        The vertices making up the domain

    simplices : (m, d) int array
        The connectivity of the vertices
        
    '''
    def __init__(self, vertices, simplices):
        vertices = np.asarray(vertices, dtype=float)
        simplices = np.asarray(simplices, dtype=int)
        assert_shape(vertices, (None, None), 'vertices')
        dim = vertices.shape[1]
        assert_shape(simplices, (None, dim), 'simplices')

        self.vertices = vertices
        self.simplices = simplices
        self.dim = dim     
        self.rtree = None
        self.normals = geo.simplex_normals(vertices, simplices)
        
    def __repr__(self):
        return ('<Domain : '
                'vertex count=%s, '
                'simplex count=%s, '
                'using R-tree=%s>' % 
                (self.vertices.shape[0], 
                 self.simplices.shape[0], 
                 self.rtree is not None))
                
    def __getstate__(self):
        # Define how pickling behaves for this class. The __getstate__
        # and __setstate__ methods are required because `rtree` does
        # not properly pickle. So we instead save a flag indicating
        # whether we need to rebuild `rtree` upon unpickling.

        # create a shallow copy of the instance's dict so that we do
        # not mess with its attributes
        state = dict(self.__dict__)
        rtree = state.pop('rtree')
        if rtree is None:
            state['has_rtree'] = False
        else:
            logger.debug(
                'the R-tree cannot be pickled and it will be rebuilt '
                'upon unpickling')
            state['has_rtree'] = True

        return state

    def __setstate__(self, state):
        has_rtree = state.pop('has_rtree')
        self.__dict__ = state
        self.rtree = None
        if has_rtree:
            self.build_rtree()
    
    def build_rtree(self):
        '''
        Construct an R-tree for the domain. This may reduce the
        computational complexity of the methods `intersection_count`,
        `contains`, `orient_simplices`, and `snap`.
        '''
        # create a bounding box for each simplex and add those
        # bounding boxes to the R-tree
        if self.rtree is not None:
            # do nothing because the R-tree already exists
            logger.debug('R-tree already exists')
            return
            
        smp_min = self.vertices[self.simplices].min(axis=1)
        smp_max = self.vertices[self.simplices].max(axis=1)
        bounds = np.hstack((smp_min, smp_max))
        
        p = Property()
        p.dimension = self.dim
        self.rtree = Index(properties=p)
        for i, bnd in enumerate(bounds):
            self.rtree.add(i, bnd)
            
    def orient_simplices(self):
        '''
        Orient the simplices so that the normal vectors point outward.
        '''
        # length scale of the domain
        scale = self.vertices.ptp(axis=0).max()
        dx = 1e-10*scale
        # find the normal for each simplex
        norms = geo.simplex_normals(self.vertices, self.simplices)
        # find the centroid for each simplex
        points = np.mean(self.vertices[self.simplices], axis=1)
        # push points in the direction of the normals
        points += dx*norms
        # find which simplices are oriented such that their normals
        # point inside
        faces_inside = self.contains(points)
        # make a copy of simplices because we are modifying it in
        # place
        new_smp = np.array(self.simplices, copy=True)
        # flip the order of the simplices that are backwards
        flip_smp = new_smp[faces_inside]
        flip_smp[:, [0, 1]] = flip_smp[:, [1, 0]]
        new_smp[faces_inside] = flip_smp

        self.simplices = new_smp
        # remake the normal vectors with the reoriented simplices
        self.normals = geo.simplex_normals(self.vertices, new_smp)

    def intersection_count(self, start_points, end_points):
        '''
        Counts the number of times the line segments intersect the
        boundary.

        Parameters
        ----------
        start_points, end_points : (n, d) float array
            The ends of the line segments

        Returns
        -------
        (n,) int array
            The number of boundary intersections

        '''
        start_points = np.asarray(start_points, dtype=float)
        end_points = np.asarray(end_points, dtype=float)
        assert_shape(start_points, (None, self.dim), 'start_points')
        assert_shape(end_points, start_points.shape, 'end_points')
        n = start_points.shape[0]
        
        if self.rtree is None:
            return geo.intersection_count(
                start_points,
                end_points,
                self.vertices,
                self.simplices)

        else:
            out = np.zeros(n, dtype=int)
            # get the bounding boxes around each segment
            bounds = np.hstack((np.minimum(start_points, end_points),
                                np.maximum(start_points, end_points)))   
            for i, bnd in enumerate(bounds):
                # get a list of simplices which could potentially be
                # intersected by segment i
                potential_smpid = list(self.rtree.intersection(bnd))
                if not potential_smpid:
                    # if the segment bounding box does not intersect
                    # any simplex bounding boxes, then there is no
                    # intersection
                    continue
                
                out[[i]] = geo.intersection_count(
                    start_points[[i]],
                    end_points[[i]],
                    self.vertices,
                    self.simplices[potential_smpid])

            return out                    
                    
    def intersection_point(self, start_points, end_points):
        '''
        Finds the point on the boundary intersected by the line
        segments. A `ValueError` is raised if no intersection is
        found.

        Parameters
        ----------
        start_points, end_points : (n, d) float array
            The ends of the line segments

        Returns
        -------
        (n, d) float array
            The intersection point
            
        (n,) int array
            The simplex containing the intersection point

        '''        
        # don't bother using the R-tree for this one
        return geo.intersection_point(
            start_points, 
            end_points,        
            self.vertices,
            self.simplices)

    def contains(self, points):
        '''
        Identifies whether the points are within the domain

        Parameters
        ----------
        points : (n, d) float array

        Returns
        -------
        (n,) bool array
        
        '''
        points = np.asarray(points, dtype=float)
        assert_shape(points, (None, self.dim), 'points')
        # to find out if the points are inside the domain, we create
        # another set of points which are definitively outside the
        # domain, and then we count the number of boundary
        # intersections between `points` and the new points.

        # get the min value and width of the domain along axis 0
        xwidth = self.vertices[:, 0].ptp()
        xmin = self.vertices[:, 0].min()
        # the outside points are directly to the left of `points` plus
        # a small random perturbation. The subsequent bounding boxes
        # are going to be very narrow, meaning that the R-tree will
        # efficiently winnow down the potential intersecting
        # simplices.
        outside_points = np.array(points, copy=True)
        outside_points[:, 0] = xmin - xwidth
        outside_points += np.random.uniform(
            -0.001*xwidth, 
            0.001*xwidth,
            points.shape)
        count = self.intersection_count(points, outside_points)            
        # If the segment intersects the boundary an odd number of
        # times, then the point is inside the domain, otherwise it is
        # outside
        out = np.array(count % 2, dtype=bool)
        return out

    def snap(self, points, delta=0.5):
        '''
        Snaps `points` to the nearest points on the boundary if they
        are sufficiently close to the boundary. A point is
        sufficiently close if the distance to the boundary is less
        than `delta` times the distance to its nearest neighbor.

        Parameters
        ----------
        points : (n, d) float array

        delta : float, optional

        Returns
        -------
        (n, d) float array
            The new points after snapping to the boundary

        (n,) int array
            The simplex that the points are snapped to. If a point is
            not snapped to the boundary then its corresponding value
            will be -1.
        
        '''
        points = np.asarray(points, dtype=float)
        assert_shape(points, (None, self.dim), 'points')
        n = points.shape[0]

        out_smpid = np.full(n, -1, dtype=int)
        out_points = np.array(points, copy=True)
        nbr_dist = KDTree(points).query(points, 2)[0][:, 1]
        snap_dist = delta*nbr_dist

        if self.rtree is None:
            nrst_pnt, nrst_smpid = geo.nearest_point(
                points,
                self.vertices,
                self.simplices)
            nrst_dist = np.linalg.norm(nrst_pnt - points, axis=1)
            snap = nrst_dist < snap_dist
            out_points[snap] = nrst_pnt[snap]
            out_smpid[snap] = nrst_smpid[snap]

        else:
            # creating bounding boxes around the snapping regions for
            # each point
            bounds = np.hstack((points - snap_dist[:, None],
                                points + snap_dist[:, None]))
            for i, bnd in enumerate(bounds):
                # get a list of simplices which node i could
                # potentially snap to
                potential_smpid = list(self.rtree.intersection(bnd))
                # sort the list to ensure consistent output
                potential_smpid.sort()
                if not potential_smpid: 
                    # no simplices are within the snapping distance
                    continue
                
                # get the nearest point to the potential simplices and
                # the simplex containing the nearest point
                nrst_pnt, nrst_smpid = geo.nearest_point(
                    points[[i]],
                    self.vertices,
                    self.simplices[potential_smpid])
                nrst_dist = np.linalg.norm(points[i] - nrst_pnt[0])
                # if the nearest point is within the snapping distance
                # then snap
                if nrst_dist < snap_dist[i]:
                    out_points[i] = nrst_pnt[0]
                    out_smpid[i] = potential_smpid[nrst_smpid[0]]

        return out_points, out_smpid
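A usage sketch for a 2D square domain, assuming the module's geo helpers and assert_shape are importable from the surrounding package; the data below is illustrative:

import numpy as np

# Unit square traced counterclockwise; each simplex is a boundary segment.
vertices = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
simplices = np.array([[0, 1], [1, 2], [2, 3], [3, 0]])

dom = Domain(vertices, simplices)
dom.build_rtree()  # optional R-tree acceleration
print(dom.contains(np.array([[0.5, 0.5], [2.0, 2.0]])))  # [ True False]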
Example #9
    for polygon in polygons:
        index.insert(count, polygon.bounds)
        count += 1

    # recursively loop over every directory
    for root, directories, filenames in os.walk('root'):
        for filename in filenames:
            obj = None
            with open(os.path.join(root, filename), 'r') as f:
                bb = f.readline()
                tpl = eval(bb)
                r = Rect(*tpl)
                # point = Point(*r.centre_point)
                records = []
                # for j in index.nearest(r.rtree_bb(), 1):
                for j in index.intersection(r.rtree_bb()):
                    shapefile = shapefile_records[j]
                    records.append(shapefile)

                if len(records) == 1:
                    super_group = records[0]['properties']['SPRGRP']
                    group = records[0]['properties']['GRP']
                    sub_group = records[0]['properties']['SUBGRP']
                    region = records[0]['properties']['SUB_REGION']
                elif len(records) > 1:
                    # mode
                    super_groups, groups, sub_groups, regions = [], [], [], []
                    for record in records:
                        super_groups.append(record['properties']['SPRGRP'])
                        groups.append(record['properties']['GRP'])
                        sub_groups.append(record['properties']['SUBGRP'])
                        regions.append(record['properties']['SUB_REGION'])
Example #10
class RectIndex(object):
    """A R-tree that stores all tracks on a layer."""
    def __init__(self, resolution, basename=None, overwrite=False):
        # type: (float, Optional[str], bool) -> None
        self._res = resolution
        self._cnt = 0
        if basename is None:
            self._index = Index(interleaved=True)
        else:
            p = Property(overwrite=overwrite)
            self._index = Index(basename, interleaved=True, properties=p)

    @property
    def bound_box(self):
        # type: () -> BBox
        xl, yb, xr, yt = self._index.bounds
        return BBox(int(xl),
                    int(yb),
                    int(xr),
                    int(yt),
                    self._res,
                    unit_mode=True)

    def close(self):
        self._index.close()

    def record_box(self, box, dx, dy):
        # type: (BBox, int, int) -> None
        """Record the given BBox."""
        sp_box = box.expand(dx=dx, dy=dy, unit_mode=True)
        bnds = sp_box.get_bounds(unit_mode=True)
        obj = (box.left_unit, box.bottom_unit, box.right_unit, box.top_unit,
               dx, dy)
        self._index.insert(self._cnt, bnds, obj=obj)
        self._cnt += 1

    def rect_iter(self):
        # type: () -> Generator[Tuple[BBox, int, int], None, None]
        for xl, yb, xr, yt, sdx, sdy in self._index.intersection(
                self._index.bounds, objects='raw'):
            box_real = BBox(xl, yb, xr, yt, self._res, unit_mode=True)
            yield box_real, sdx, sdy

    def intersection_iter(self, box, dx=0, dy=0):
        # type: (BBox, int, int) -> Generator[BBox, None, None]
        """Finds all bounding box that intersects the given box."""
        res = self._res
        test_box = box.expand(dx=dx, dy=dy, unit_mode=True)
        box_iter = self._index.intersection(
            test_box.get_bounds(unit_mode=True), objects='raw')
        for xl, yb, xr, yt, sdx, sdy in box_iter:
            box_real = BBox(xl, yb, xr, yt, res, unit_mode=True)
            box_sp = box_real.expand(dx=sdx, dy=sdy, unit_mode=True)
            if box_sp.overlaps(box) or test_box.overlaps(box_real):
                yield box_real.expand(dx=max(dx, sdx),
                                      dy=max(dy, sdy),
                                      unit_mode=True)

    def intersection_rect_iter(self, box):
        # type: (BBox) -> Generator[BBox, None, None]
        """Finds all bounding box that intersects the given box."""
        res = self._res
        box_iter = self._index.intersection(box.get_bounds(unit_mode=True),
                                            objects='raw')
        for xl, yb, xr, yt, sdx, sdy in box_iter:
            yield BBox(xl, yb, xr, yt, res, unit_mode=True)
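For context, interleaved=True (rtree's default) means bounds are passed as (xmin, ymin, xmax, ymax); a minimal standalone illustration:

from rtree.index import Index

idx = Index(interleaved=True)
idx.insert(0, (0, 0, 10, 10))
print(list(idx.intersection((5, 5, 6, 6))))  # [0]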
Example #11
class DyClee:
    """
    Implementation roughly as per https://doi.org/10.1016/j.patcog.2019.05.024.
    """
    def __init__(self, context: DyCleeContext):
        self.context = context

        self.dense_µclusters: Set[MicroCluster] = Set()
        self.semidense_µclusters: Set[MicroCluster] = Set()
        self.outlier_µclusters: Set[MicroCluster] = Set()
        self.long_term_memory: Set[MicroCluster] = Set()
        self.eliminated: Set[MicroCluster] = Set()

        self.next_µcluster_index: int = 0
        self.next_class_label: int = 0
        self.n_steps: int = 0
        self.last_partitioning_step: int = 0
        self.last_density_step: int = 0

        if self.context.maintain_rtree:
            p = RTreeProperty(dimension=self.context.n_features)
            self.rtree = RTreeIndex(properties=p)
            # This mapping is used to retrieve microcluster objects from their hashes
            # stored with their locations in the R*-tree
            self.µcluster_map: Optional[dict[int, MicroCluster]] = {}
        else:
            self.rtree = None
            self.µcluster_map = None

    @property
    def active_µclusters(self) -> Set[MicroCluster]:
        return self.dense_µclusters | self.semidense_µclusters

    @property
    def all_µclusters(self) -> Set[MicroCluster]:
        return self.active_µclusters | self.outlier_µclusters | self.long_term_memory

    def get_next_µcluster_index(self) -> int:
        index = self.next_µcluster_index
        self.next_µcluster_index += 1
        return index

    def get_next_class_label(self) -> int:
        label = self.next_class_label
        self.next_class_label += 1
        return label

    def update_density_partitions(self, time: Timestamp) -> Set[MicroCluster]:
        densities = np.array(
            [µcluster.density(time) for µcluster in self.all_µclusters])
        mean_density = np.mean(densities)
        median_density = np.median(densities)

        dense: Set[MicroCluster] = Set()
        semidense: Set[MicroCluster] = Set()
        outliers: Set[MicroCluster] = Set()
        memory: Set[MicroCluster] = Set()
        eliminated: Set[MicroCluster] = Set()

        for µcluster in self.all_µclusters:
            density = µcluster.density(time)

            if mean_density <= density >= median_density:
                # Any may become dense
                dense.add(µcluster)
                µcluster.once_dense = True
            elif (µcluster in self.dense_µclusters
                  or µcluster in self.semidense_µclusters
                  or µcluster in self.outlier_µclusters) and (
                      density >= mean_density) != (density >= median_density):
                # Dense and outliers may become dense
                # Semi-dense may stay semi-dense
                semidense.add(µcluster)
            elif ((µcluster in self.dense_µclusters
                   or µcluster in self.semidense_µclusters)
                  and mean_density > density < median_density) or (
                      µcluster in self.outlier_µclusters
                      and density >= self.context.elimination_threshold):
                # Dense and semi-dense may become outliers
                # Outliers may stay outliers
                outliers.add(µcluster)
            elif (self.context.long_term_memory
                  and µcluster in self.outlier_µclusters
                  and µcluster.once_dense):
                # Outliers may be put into long-term memory
                memory.add(µcluster)
            else:
                # If none of the conditions are met, the microcluster is eliminated
                eliminated.add(µcluster)

                if self.context.maintain_rtree:
                    # Remove microcluster from R*-tree
                    self.rtree.delete(hash(µcluster), µcluster.bounding_box)

        # Store the final sets, sorting by index for predictable ordering
        self.dense_µclusters = Set(sorted(dense, key=lambda µ: µ.index))
        self.semidense_µclusters = Set(sorted(semidense,
                                              key=lambda µ: µ.index))
        self.outlier_µclusters = Set(sorted(outliers, key=lambda µ: µ.index))
        self.long_term_memory = Set(sorted(memory, key=lambda µ: µ.index))

        if self.context.store_elements:
            # Keep track of eliminated microclusters (to not lose elements)
            self.eliminated |= eliminated

        return eliminated

    def distance_step(self, element: Element, time: Timestamp) -> MicroCluster:
        if self.context.update_ranges:
            self.context.update_feature_ranges(element)

        if not self.all_µclusters:
            # Create new microcluster
            µcluster = MicroCluster(element,
                                    time,
                                    context=self.context,
                                    index=self.get_next_µcluster_index())
            self.outlier_µclusters.add(µcluster)

            if self.context.maintain_rtree:
                # Add microcluster to R*-tree
                self.µcluster_map[hash(µcluster)] = µcluster
                self.rtree.insert(hash(µcluster), µcluster.bounding_box)

            return µcluster
        else:
            closest: Optional[MicroCluster] = None

            if self.context.distance_index == SpatialIndexMethod.RTREE:
                # The R*-tree searches all microclusters regardless of precedence, so we
                # need to filter by priority after the index search

                # Find all reachable microclusters
                matches: Set[MicroCluster] = Set([
                    self.µcluster_map[hash_]
                    for hash_ in self.rtree.intersection((*element, *element))
                ])

                min_dist = None

                for candidate_µclusters in (self.active_µclusters,
                                            self.outlier_µclusters,
                                            self.long_term_memory):
                    # First match active microclusters, then others

                    for µcluster in matches & candidate_µclusters:
                        dist = µcluster.distance(element)

                        if (closest is None or dist < min_dist or
                            (dist == min_dist and
                             µcluster.density(time) > closest.density(time))):
                            closest = µcluster
                            min_dist = dist
            else:
                for candidate_µclusters in (self.active_µclusters,
                                            self.outlier_µclusters,
                                            self.long_term_memory):
                    # First search actives, then others for reachable microclusters

                    if not candidate_µclusters:
                        continue

                    if self.context.distance_index == SpatialIndexMethod.KDTREE:
                        # Ensure predictable order for indexability
                        candidate_µclusters = list(candidate_µclusters)

                        candidate_centroids: np.ndarray = np.row_stack([
                            µcluster.centroid
                            for µcluster in candidate_µclusters
                        ])

                        # Find potentially reachable microclusters (using L-inf norm)
                        idcs, = KDTree(
                            candidate_centroids, p=np.inf).query_radius(
                                np.reshape(element, (1, -1)),
                                self.context.potentially_reachable_radius)

                        if not len(idcs):
                            continue

                        min_dist = None

                        # Find closest (L-1 norm) microcluster among the reachable ones
                        for i in idcs:
                            µcluster = candidate_µclusters[i]

                            if not µcluster.is_reachable(element):
                                continue

                            dist = µcluster.distance(element)

                            # Higher density is tie-breaker in case of equal distances
                            if (closest is None or dist < min_dist or
                                (dist == min_dist and µcluster.density(time) >
                                 closest.density(time))):
                                closest = µcluster
                                min_dist = dist
                    else:
                        # Brute force
                        min_dist = None

                        for µcluster in candidate_µclusters:
                            if not µcluster.is_reachable(element):
                                continue

                            dist = µcluster.distance(element)

                            if (closest is None or dist < min_dist or
                                (dist == min_dist and µcluster.density(time) >
                                 closest.density(time))):
                                closest = µcluster
                                min_dist = dist

                    if closest is not None:
                        # Match found, no need to check next set
                        break

            if closest is not None:
                if self.context.maintain_rtree:
                    # Remove microcluster from R*-tree
                    self.rtree.delete(hash(closest), closest.bounding_box)

                # Add element to closest microcluster
                closest.add(element, time)

                if self.context.maintain_rtree:
                    # Add modified microcluster to R*-tree
                    self.rtree.insert(hash(closest), closest.bounding_box)

                return closest
            else:
                # Create new microcluster
                µcluster = MicroCluster(element,
                                        time,
                                        context=self.context,
                                        index=self.get_next_µcluster_index())
                self.outlier_µclusters.add(µcluster)

                if self.context.maintain_rtree:
                    # Add microcluster to R*-tree
                    self.µcluster_map[hash(µcluster)] = µcluster
                    self.rtree.insert(hash(µcluster), µcluster.bounding_box)

                return µcluster

    def global_density_step(
            self, time: Timestamp) -> tuple[list[Cluster], Set[MicroCluster]]:
        clusters: list[Cluster] = []
        seen: Set[MicroCluster] = Set()

        for µcluster in self.dense_µclusters:
            if µcluster in seen:
                continue

            seen.add(µcluster)

            if µcluster.label is None:
                µcluster.label = self.get_next_class_label()

            cluster = Cluster(µcluster, time)
            clusters.append(cluster)

            # Get dense and semi-dense directly connected neighbours
            connected = µcluster.get_neighbours(
                (self.dense_µclusters | self.semidense_µclusters) - seen,
                rtree_index=self.rtree,
                µcluster_map=self.µcluster_map)

            while connected:
                neighbour = connected.pop()

                if neighbour in seen:
                    continue

                seen.add(neighbour)

                # Outlier microclusters are ignored
                if neighbour in self.outlier_µclusters:
                    continue

                # Dense and semi-dense microclusters become part of the cluster
                neighbour.label = µcluster.label
                cluster.add(neighbour, time)

                # Semi-dense neighbours may only form the boundary
                if neighbour not in self.dense_µclusters:
                    continue

                # Get neighbour's dense and semi-dense directly connected neighbours
                # and add to set of microclusters connected to the parent
                connected |= neighbour.get_neighbours(
                    (self.dense_µclusters | self.semidense_µclusters) - seen,
                    rtree_index=self.rtree,
                    µcluster_map=self.µcluster_map)

        # Find all microclusters that were not grouped into a cluster
        unclustered = self.all_µclusters
        for cluster in clusters:
            unclustered -= cluster.µclusters

        return clusters, unclustered

    def local_density_step(
            self, time: Timestamp) -> tuple[list[Cluster], Set[MicroCluster]]:
        raise NotImplementedError("TODO")

    def density_step(
            self, time: Timestamp) -> tuple[list[Cluster], Set[MicroCluster]]:
        if self.context.multi_density:
            return self.local_density_step(time)
        else:
            return self.global_density_step(time)

    def step(
        self,
        element: Element,
        time: Timestamp,
        skip_density_step: bool = False
    ) -> tuple[MicroCluster, Optional[list[Cluster]],
               Optional[Set[MicroCluster]], Optional[Set[MicroCluster]]]:
        self.n_steps += 1

        µcluster = self.distance_step(element, time)

        if (self.n_steps >= self.last_partitioning_step +
                self.context.partitioning_interval):
            eliminated = self.update_density_partitions(time)

            self.last_partitioning_step = self.n_steps
        else:
            eliminated = None

        if (not skip_density_step and self.n_steps >=
                self.last_density_step + self.context.density_interval):
            clusters, unclustered = self.density_step(time)

            self.last_density_step = self.n_steps
        else:
            clusters = None
            unclustered = None

        return µcluster, clusters, unclustered, eliminated

    def run(self,
            elements: Iterable[Element],
            times: Optional[Iterable[Timestamp]] = None,
            progress: bool = True) -> list[Cluster]:
        if progress and tqdm is not None:
            elements = tqdm(elements)

        if times is None:
            times = count()

        for element, time in zip(elements, times):
            self.step(element, time, skip_density_step=True)

        clusters, _ = self.density_step(time)

        return clusters
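A standalone illustration of the delete-then-reinsert pattern used throughout this class: rtree has no in-place update, so an entry is removed with the same id and coordinates it was inserted with, then reinserted with its new bounds.

from rtree.index import Index

idx = Index()
old_bounds = (0.0, 0.0, 1.0, 1.0)
idx.insert(123, old_bounds)
# ...the microcluster absorbs an element and its bounding box grows...
idx.delete(123, old_bounds)            # must match the original id and bounds
idx.insert(123, (0.0, 0.0, 2.0, 2.0))  # reinsert with the updated bounds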
Example #12
class AdjacencyVersion(object):
    def __init__(self, feature_mapper):
        #self.partitions_complete = partitions_complete
        self.cid = 0
        self.disc_idxs = {}
        self.feature_mapper = feature_mapper
        self.radius = .15
        self.metric = 'hamming'

        self._rtree = None  # internal datastructure
        self._ndim = None
        self.clusters = []
        self.id2c = dict()
        self.c2id = dict()

    def to_json(self):
        data = {
            'clusters': [c and c.__dict__ or None for c in self.clusters],
            'id2c': [(key, c.__dict__) for key, c in self.id2c.items()],
            'c2id': [(c.__dict__, val) for c, val in self.c2id.items()],
            'cid': self.cid,
            '_ndim': self._ndim,
            '_rtreename': 'BLAH'
        }
        return json.dumps(data)

    def from_json(self, encoded):
        data = json.loads(encoded)
        self.clusters = [
            c and Cluster.from_dict(c) or None for c in data['clusters']
        ]
        self.id2c = dict([(key, Cluster.from_dict(val))
                          for key, val in data['id2c']])
        self.c2id = dict([(Cluster.from_dict(key), val)
                          for key, val in data['c2id']])
        self.cid = data['cid']
        self._ndim = data['_ndim']
        self._rtree = None

    def setup_rtree(self, ndim, clusters=None):
        if self._rtree:
            return self._rtree

        self._ndim = ndim
        if not ndim:

            class k(object):
                def __init__(self, graph):
                    self.graph = graph

                def insert(self, *args, **kwargs):
                    pass

                def delete(self, *args, **kwargs):
                    pass

                def intersection(self, *args, **kwargs):
                    return xrange(len(self.graph.clusters))

            self._rtree = k(self)
            return self._rtree

        p = RProp()
        p.dimension = max(2, ndim)
        p.dat_extension = 'data'
        p.idx_extension = 'index'

        if clusters:
            gen_func = ((i, self.bbox_rtree(c, enlarge=0.005), None)
                        for i, c in enumerate(clusters))
            self._rtree = RTree(gen_func, properties=p)
        else:
            self._rtree = RTree(properties=p)
        return self._rtree

    def bbox_rtree(self, cluster, enlarge=0.):
        cols = cluster.cols
        bbox = cluster.bbox
        lower, higher = map(list, bbox)
        if self._ndim == 1:
            lower.append(0)
            higher.append(1)

        if enlarge != 0:
            for idx, col in enumerate(cols):
                rng = enlarge * self.feature_mapper.ranges[col]
                lower[idx] -= rng
                higher[idx] += rng

        bbox = lower + higher
        return bbox

    def insert_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.insert(idx, self.bbox_rtree(cluster))
        return cluster

    def remove_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.delete(idx, self.bbox_rtree(cluster))
        return cluster

    def search_rtree(self, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        bbox = self.bbox_rtree(cluster, enlarge=0.01)
        return self._rtree.intersection(bbox)

    def bulk_init(self, clusters):
        if not clusters: return

        self.setup_rtree(len(clusters[0].bbox[0]), clusters)
        self.clusters = clusters
        for cid, c in enumerate(clusters):
            self.id2c[cid] = c
            self.c2id[c] = cid

        for dim in self.feature_mapper.attrs:
            Xs = []
            for cidx, c in enumerate(clusters):
                Xs.append(self.feature_mapper(c, dim))
            idx = NearestNeighbors(radius=self.radius,
                                   algorithm='ball_tree',
                                   metric=self.metric)
            self.disc_idxs[dim] = idx
            self.disc_idxs[dim].fit(np.array(Xs))

    def contains(self, cluster):
        return cluster in self.c2id

    def remove(self, cluster):
        if cluster in self.c2id:
            cid = self.c2id[cluster]
            self.remove_rtree(cid, cluster)
            del self.c2id[cluster]
            del self.id2c[cid]
            self.clusters[cid] = None
            return True
        return False

    def neighbors(self, cluster):
        ret = None
        for name, vals in cluster.discretes.iteritems():
            if name not in self.disc_idxs:
                return []
            vect = self.feature_mapper(cluster, name)
            index = self.disc_idxs[name]
            dists, idxs = index.radius_neighbors(vect, radius=self.radius)
            idxs = set(idxs[0].tolist())

            if ret is None:
                ret = idxs
            else:
                ret.intersection_update(idxs)
                #ret.update(idxs)
            if not ret: return []

        idxs = self.search_rtree(cluster)
        if ret is None:
            ret = set(idxs)
        else:
            ret.intersection_update(set(idxs))

        return filter(bool, [self.clusters[idx] for idx in ret])

    """
Example #13
def parse_mrt(database: SqliteUtil, path: str, src_epsg: int, prj_epsg: int,
        bounds: int = 30, steps: int = 96):
    log.info('Allocating tables for MRT temperature profiles.')
    create_tables(database)

    log.info('Loading network nodes from database.')
    nodes: Dict[str,Node]
    nodes = load_nodes(database)

    log.info('Loading network links from database.')
    links: Dict[str,Link]
    links = load_links(database, nodes)

    log.info(f'Searching for mrt files in {path}')
    csvfiles = iter(glob(f'{path}/**/*.csv', recursive=True))

    log.info('Handling initial dataset for profile construction.')
    points: List[Point]
    time: int 
    points, time = parse_points(next(csvfiles), src_epsg, prj_epsg)
    
    log.info('Building spatial index on MRT points.')
    index = Index((point.entry() for point in points))

    log.info('Scanning link bounds and building profiles.')
    mapping: Dict[FrozenSet[int],int] = {}
    count = 0
    empty = 0
    iter_links = counter(links.values(), 'Scanning link %s.')
    for link in iter_links:
        d = link.terminal_node.x * link.source_node.y - \
            link.source_node.x * link.terminal_node.y
        dx = link.terminal_node.x - link.source_node.x
        dy = link.terminal_node.y - link.source_node.y
        l = sqrt(dy * dy + dx * dx)

        nearby = index.intersection(link.bounds(bounds))
        contained = []
        for uuid in nearby:
            point = points[uuid]
            x = point.x
            y = point.y
            if l > 0:
                dist = abs(dy * x - dx * y + d ) / l
            else:
                px = point.x - link.source_node.x
                py = point.y - link.source_node.y
                dist = sqrt(px * px + py * py)
            if dist <= bounds:
                contained.append(point.id)
        
        if contained:
            profile = frozenset(contained)
            if profile in mapping:
                link.profile = mapping[profile]
            else:
                mapping[profile] = count
                link.profile = count
                count += 1
        else:
            empty += 1

    profiles: List[Tuple[int]]
    profiles = [tuple(key) for key in mapping.keys()]

    if empty:
        log.warning(f'Found {empty} links without any MRT temperature profile.')

    def dump_points():
        idx = time // (86400 // steps)
        for uuid, profile in enumerate(profiles):
            mrt, pet, utci = 0, 0, 0
            count = len(profile)
            for ptid in profile:
                point = points[ptid]
                mrt += point.mrt
                pet += point.pet
                utci += point.utci
            yield (uuid, idx, time, mrt / count, pet / count, utci / count)

    def dump_links():
        for link in links.values():
            yield (link.id, link.profile)

    log.info('Writing link updates and temperatures to database.')

    database.insert_values('mrt_temperatures', dump_points(), 6)
    database.insert_values('temp_links', dump_links(), 2)

    log.info('Merging, dropping and renaming old tables.')

    query = '''
        CREATE INDEX temp_links_link
        ON temp_links(link_id);
    '''
    database.cursor.execute(query)
    query = '''
        CREATE TABLE temp_links_merged
        AS SELECT
            links.link_id,
            links.source_node,
            links.terminal_node,
            links.length,
            links.freespeed,
            links.capacity,
            links.permlanes,
            links.oneway,
            links.modes,
            links.air_temperature,
            temp_links.mrt_temperature
        FROM links
        INNER JOIN temp_links
        USING(link_id);
    '''
    database.cursor.execute(query)

    original = database.count_rows('links')
    merged = database.count_rows('temp_links_merged')
    if original != merged:
        log.error('Original links and updated links tables '
            'do not align; quitting to prevent data loss.')
        raise RuntimeError
    
    database.drop_table('links', 'temp_links')
    query = '''
        ALTER TABLE temp_links_merged
        RENAME TO links;
    '''
    database.cursor.execute(query)

    database.connection.commit()

    del links
    del nodes
    del index
    del mapping
    del points

    log.info('Handling remaining temperatures with defined profiles.')

    def dump_temperatures(time: int, temperatures: List[Tuple[float,float,float]]):
        idx = time // (86400 // steps)
        for uuid, profile in enumerate(profiles):
            mrt, pet, utci = 0, 0, 0
            count = len(profile)
            for tempid in profile:
                temp = temperatures[tempid]
                mrt += temp[0]
                pet += temp[1]
                utci += temp[2]
            yield (uuid, idx, time, mrt / count, pet / count, utci / count)

    for csvfile in csvfiles:
        time: int
        temperatures: List[Tuple[float,float,float]]
        temperatures, time = parse_temperatures(csvfile)

        log.info('Writing temperature data to database.')
        database.insert_values('mrt_temperatures',
            dump_temperatures(time, temperatures), 6)
        database.connection.commit()

    log.info('Creating indexes on new/updated tables.')
    create_indexes(database)
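For reference, the stream-loading constructor used above (Index((point.entry() for point in points))) expects an iterable of (id, bounds, obj) tuples; a hedged sketch of the convention with hypothetical point entries:

from rtree.index import Index

# Hypothetical entries mimicking Point.entry(): (id, (xmin, ymin, xmax, ymax), obj)
entries = [(0, (0.0, 0.0, 0.0, 0.0), None), (1, (5.0, 5.0, 5.0, 5.0), None)]
index = Index(entries)
print(list(index.intersection((-1.0, -1.0, 1.0, 1.0))))  # [0]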
Example #14
def main(input_dir, output_dir):
    formatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]: %(message)s')
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    city_names = []
    rtree = RTreeIndex()

    cities_filename = os.path.join(tempfile.gettempdir(), 'cities.json')

    subprocess.check_call(['wget', 'https://raw.githubusercontent.com/mapzen/metroextractor-cities/master/cities.json', '-O', cities_filename])

    all_cities = json.load(open(cities_filename))

    i = 0

    for k, v in all_cities['regions'].iteritems():
        for city, data in v['cities'].iteritems():
            bbox = data['bbox']
            rtree.insert(i, (float(bbox['left']), float(bbox['bottom']), float(bbox['right']), float(bbox['top'])))        
            city_names.append(city)
            i += 1

    files = {name: open(os.path.join(output_dir, 'cities', '{}.geojson'.format(name)), 'w') for name in city_names}

    planet = open(os.path.join(output_dir, 'planet.geojson'), 'w')
    planet_addresses_only = open(os.path.join(output_dir, 'planet_addresses_only.json'), 'w')

    i = 0
    seen = set()

    for url, canonical, venues in gen_venues(input_dir):
        domain = urlparse.urlsplit(url).netloc
        if domain.startswith('www.'):
            domain = domain[4:]  # str.strip('www.') strips characters, not the prefix
        for props in venues:
            lat = props.get('latitude')
            lon = props.get('longitude')
            props['canonical'] = canonical
            props['url'] = url
            street = props.get('street_address')
            name = props.get('name')
            planet_hash = hashlib.md5(u'|'.join((name, street, str(lat), str(lon), domain)).encode('utf-8')).digest()
            address_hash = hashlib.md5(u'|'.join((name, street, domain)).encode('utf-8')).digest()
            props['guid'] = props.get('guid', random_guid())
            venue = venue_to_geojson(props)
            if lat is not None and lon is not None:
                try:
                    lat = float(lat)
                    lon = float(lon)
                except Exception:
                    lat = None
                    lon = None
            if lat is not None and lon is not None and planet_hash not in seen:
                cities = list(rtree.intersection((lon, lat, lon, lat)))
                for c in cities:
                    # write the venue to every matching city extract
                    files[city_names[c]].write(json.dumps(venue) + '\n')
                planet.write(json.dumps(venue) + '\n')
                seen.add(planet_hash)
            if address_hash not in seen:
                planet_addresses_only.write(json.dumps(props) + '\n')
                seen.add(address_hash)
            i += 1
            if i % 1000 == 0 and i > 0:
                logger.info('did {}'.format(i))

    logger.info('Creating manifest files')

    manifest_files = []

    for k, v in all_cities['regions'].iteritems():
        for city, data in v['cities'].iteritems():
            f = files[city]
            if f.tell() == 0:
                f.close()
                os.unlink(os.path.join(output_dir, 'cities', '{}.geojson'.format(city)))
                continue

            bbox = data['bbox']
            lat = midpoint(float(bbox['top']), float(bbox['bottom']))
            lon = midpoint(float(bbox['left']), float(bbox['right']))

            manifest_files.append({'latitude': lat, 'longitude': lon, 'file': '{}.geojson'.format(city), 'name': city.replace('_', ', ').replace('-', ' ').title()})

    manifest = {'files': manifest_files}

    json.dump(manifest, open(os.path.join(output_dir, 'manifest.json'), 'w'))

    logger.info('Done!')
Example #15
class AdjacencyGraph(object):
    def __init__(self, clusters, partitions_complete=True):
        self.partitions_complete = partitions_complete
        self.graph = defaultdict(set)
        self.cid = 0
        self.clusters = []
        self.id2c = dict()
        self.c2id = dict()
        self._rtree = None  # internal datastructure
        self._ndim = None

        self.bulk_init(clusters)

    def to_json(self):
        data = {
                'clusters' : [c and c.__dict__ or None for c in self.clusters],
                'id2c' : [(key, c.__dict__) for key, c in self.id2c.items()],
                'c2id' : [(c.__dict__, val) for c, val in self.c2id.items()],
                'graph' : [(key.__dict__, [val.__dict__ for val in vals]) for key, vals in self.graph.iteritems()],
                'cid' : self.cid,
                '_ndim' : self._ndim,
                '_rtreename' : 'BLAH'
                }
        return json.dumps(data)

    def from_json(self, encoded):
        data = json.loads(encoded)
        self.clusters = [c and Cluster.from_dict(c) or None for c in data['clusters']]
        self.id2c = dict([(key, Cluster.from_dict(val)) for key, val in data['id2c']])
        self.c2id = dict([(Cluster.from_dict(key), val) for key, val in data['c2id']])
        self.graph = dict([(Cluster.from_dict(key), set(map(Cluster.from_dict, vals))) for key, vals in data['graph']])
        self.cid = data['cid']
        self._ndim = data['_ndim']
        self._rtree = None

    def setup_rtree(self, ndim, clusters=None):
        if self._rtree:
            return self._rtree

        self._ndim = ndim
        if not ndim:
            class k(object):
                def __init__(self, graph):
                    self.graph = graph
                def insert(self, *args, **kwargs):
                    pass
                def delete(self, *args, **kwargs):
                    pass
                def intersection(self, *args, **kwargs):
                    return xrange(len(self.graph.clusters))
            self._rtree = k(self)
            return self._rtree
 

        p = RProp()
        p.dimension = max(2, ndim)
        p.dat_extension = 'data'
        p.idx_extension = 'index'

        if clusters:
            gen_func = ((i, self.bbox_rtree(c, enlarge=0.00001), None) for i, c in enumerate(clusters))
            self._rtree = RTree(gen_func, properties=p)
        else:
            self._rtree = RTree(properties=p)
        return self._rtree

    def bbox_rtree(self, cluster, enlarge=0.):
        bbox = cluster.bbox
        lower, higher = map(list, bbox)
        if self._ndim == 1:
            lower.append(0)
            higher.append(1)

        if enlarge != 0.:
            lower = [v - enlarge for v in lower]
            higher = [v + enlarge for v in higher]

        bbox = lower + higher
        return bbox

    def insert_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.insert(idx,self.bbox_rtree(cluster))
        return cluster

    def remove_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.delete(idx, self.bbox_rtree(cluster))
        return cluster

    def search_rtree(self, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        bbox = self.bbox_rtree(cluster, enlarge=0.00001)
        res = [self.clusters[idx] for idx in self._rtree.intersection(bbox)]
        return filter(bool, res)

    def bulk_init(self, clusters):
        if clusters:
            self.setup_rtree(len(clusters[0].bbox[0]), clusters)

        self.clusters.extend(clusters)
        for cid, c in enumerate(clusters):
            self.id2c[cid] = c
            self.c2id[c] = cid

        for idx, c in enumerate(clusters):
            for n in self.search_rtree(c):
                if self.c2id[n] <= idx: continue
                if c.discretes_contains(n) and box_completely_contained(c.bbox, n.bbox): continue
                if not c.adjacent(n, 0.8): continue
                self.graph[c].add(n)
                self.graph[n].add(c)



    def insert(self, cluster):
        if cluster in self.graph:
            return

        self.graph[cluster] = set()
        #for o in self.search_rtree(cluster):
        for o in self.graph.keys():
            if cluster == o:
                continue
            if cluster.adjacent(o, 0.8) or (volume(intersection_box(cluster.bbox, o.bbox)) > 0 and not cluster.contains(o)):
                self.graph[cluster].add(o)
                self.graph[o].add(cluster)
        

        cid = len(self.clusters)
        self.clusters.append(cluster)
        self.id2c[cid] = cluster
        self.c2id[cluster] = cid
        self.insert_rtree(cid, cluster)

    def remove(self, cluster):
        if cluster not in self.graph:
            return

        # discard() tolerates neighbors whose adjacency sets have already
        # dropped this cluster, so no error handling is needed here.
        for neigh in self.graph[cluster]:
            if neigh != cluster:
                self.graph[neigh].discard(cluster)
        del self.graph[cluster]

        cid = self.c2id[cluster]
        self.remove_rtree(cid, cluster)
        del self.c2id[cluster]
        del self.id2c[cid]
        self.clusters[cid] = None

    def neighbors(self, cluster):
        if not self.partitions_complete:
            return filter(bool, self.clusters)

        if cluster in self.graph:
            return self.graph[cluster]

        ret = set()
        intersects = self.search_rtree(cluster)
        for key in filter(cluster.adjacent, intersects):
            if box_completely_contained(key.bbox, cluster.bbox):
                continue
            ret.update(self.graph[key])
        return ret
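
One detail in setup_rtree/bbox_rtree above is worth isolating: libspatialindex requires at least two dimensions, so one-dimensional intervals are padded with a dummy [0, 1] axis before insertion. A standalone sketch of that trick, assuming only the rtree package:

from rtree.index import Index, Property

p = Property()
p.dimension = 2  # the index needs >= 2 dimensions, even for 1-D data

def pad_1d(lo, hi):
    # Embed the interval [lo, hi] as the planar box (lo, 0, hi, 1).
    return (lo, 0.0, hi, 1.0)

idx = Index(properties=p)
idx.insert(0, pad_1d(2.0, 5.0))
idx.insert(1, pad_1d(4.0, 9.0))

# Query with an interval padded the same way; the dummy axis always overlaps.
print(sorted(idx.intersection(pad_1d(4.5, 4.6))))  # -> [0, 1]
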
Example #16
def parse_parcels(database: SqliteUtil, residence_file: str,
                  commerce_file: str, parcel_file: str, cooling_file: str,
                  src_epsg: int, prj_epsg: int):
    boundaries = {}
    cooling = {}
    parcels = []
    apns = set()

    transformer = Transformer.from_crs(f'epsg:{src_epsg}',
                                       f'epsg:{prj_epsg}',
                                       always_xy=True,
                                       skip_equivalent=True)
    project = transformer.transform

    log.info('Allocating tables for parcels.')
    create_tables(database)

    log.info('Parsing parcel boundaries from shapefile.')
    parser = shapefile.Reader(parcel_file)
    iter_boundaries = counter(iter(parser), 'Parsing parcel boundary %s.')
    for parcel in iter_boundaries:
        if len(parcel.shape.points):
            apn = parcel.record['APN']
            points = (project(*pt) for pt in parcel.shape.points)
            polygon = Polygon(points)
            boundaries[apn] = polygon
    parser.close()

    log.info('Loading cooling information from csv file.')
    with open(cooling_file, 'r') as open_file:
        lines = csv.reader(open_file, delimiter=',', quotechar='"')
        next(lines)
        for desc, _, cool in lines:
            cooling[desc] = bool(cool)

    log.info('Parsing residential parcels from database file.')
    parser = shapefile.Reader(residence_file)
    iter_parcels = counter(parser.iterRecords(),
                           'Parsing residential parcel %s.')
    for record in iter_parcels:
        apn = record['APN']
        if apn in boundaries and apn not in apns:
            cool = True
            polygon = boundaries[apn]
            parcel = Parcel(apn, 'residential', cool, polygon)
            parcels.append(parcel)
            apns.add(apn)
    parser.close()

    log.info('Parsing commercial parcels from database file.')
    parser = shapefile.Reader(commerce_file)
    iter_parcels = counter(parser.iterRecords(),
                           'Parsing commercial parcel %s.')
    for record in iter_parcels:
        apn = record['APN']
        if apn in boundaries and apn not in apns:
            desc = record['DESCRIPT']
            cool = cooling[desc]
            polygon = boundaries[apn]
            parcel = Parcel(apn, 'commercial', cool, polygon)
            parcels.append(parcel)
            apns.add(apn)
    parser.close()

    log.info('Parsing extraneous parcels from shapefile.')
    other = set(boundaries.keys()) - apns
    other = counter(other, 'Parsing extraneous parcel %s.')
    for apn in other:
        polygon = boundaries[apn]
        parcel = Parcel(apn, 'other', True, polygon)
        parcels.append(parcel)

    def load():
        for idx, parcel in enumerate(parcels):
            pt = parcel.polygon.centroid
            yield (idx, (pt.x, pt.y, pt.x, pt.y), None)

    log.info('Building spatial index from parcel data.')
    index = Index(load())

    log.info('Loading network region data.')
    regions = load_regions(database)

    log.info('Scanning regions and mapping mazs to parcels.')
    iter_regions = counter(regions, 'Scanning region %s.')
    for region in iter_regions:
        apn = f'maz-{region.maz}'
        parcel = Parcel(apn, 'default', True, region.polygon)
        parcel.maz = region.maz
        parcels.append(parcel)
        result = index.intersection(region.polygon.bounds)
        for idx in result:
            parcel = parcels[idx]
            if region.polygon.contains(parcel.polygon.centroid):
                if parcel.maz is not None:
                    warning = 'Parcel %s is in both region %s and %s' \
                        '; the latter region will be kept.'
                    log.warning(warning % (parcel.apn, parcel.maz, region.maz))
                parcel.maz = region.maz
    del regions

    def dump():
        for parcel in parcels:
            yield (parcel.apn, parcel.maz, parcel.kind, int(parcel.cooling),
                   None, None, dumps(parcel.polygon.centroid),
                   dumps(parcel.polygon))

    log.info('Writing parsed parcels to database.')
    database.insert_values('parcels', dump(), 8)
    database.connection.commit()

    log.info('Creating indexes on new tables.')
    create_indexes(database)
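
The region-to-parcel mapping above is a two-stage spatial join: a cheap R-tree pass over bounding boxes, then an exact containment test on the candidates. A minimal sketch of the same pattern, assuming shapely and rtree; the geometries are illustrative:

from rtree.index import Index
from shapely.geometry import Point, Polygon

centroids = [Point(1, 1), Point(3, 3), Point(10, 10)]

# Bulk-load one degenerate box per centroid via an (id, bounds, obj) generator.
index = Index((i, (p.x, p.y, p.x, p.y), None) for i, p in enumerate(centroids))

region = Polygon([(0, 0), (4, 0), (4, 4), (0, 4)])

# Stage 1: bbox candidates from the index; stage 2: exact containment.
matches = [i for i in index.intersection(region.bounds)
           if region.contains(centroids[i])]
print(sorted(matches))  # -> [0, 1]
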
Example #17
def main(input_dir, output_dir):
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s [%(name)s]: %(message)s')
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    city_names = []
    rtree = RTreeIndex()

    cities_filename = os.path.join(tempfile.gettempdir(), 'cities.json')

    subprocess.check_call([
        'wget',
        'https://raw.githubusercontent.com/mapzen/metroextractor-cities/master/cities.json',
        '-O', cities_filename
    ])

    all_cities = json.load(open(cities_filename))

    i = 0

    for k, v in all_cities['regions'].iteritems():
        for city, data in v['cities'].iteritems():
            bbox = data['bbox']
            rtree.insert(i, (float(bbox['left']), float(bbox['bottom']),
                             float(bbox['right']), float(bbox['top'])))
            city_names.append(city)
            i += 1

    files = {
        name: open(os.path.join(output_dir, 'cities', '{}.geojson'.format(name)), 'w')
        for name in city_names
    }

    planet = open(os.path.join(output_dir, 'planet.geojson'), 'w')
    planet_addresses_only = open(
        os.path.join(output_dir, 'planet_addresses_only.json'), 'w')

    i = 0
    seen = set()

    for url, canonical, venues in gen_venues(input_dir):
        # str.strip('www.') strips *characters* from both ends, not a prefix;
        # remove a leading 'www.' explicitly instead.
        netloc = urlparse.urlsplit(url).netloc
        domain = netloc[4:] if netloc.startswith('www.') else netloc
        for props in venues:
            lat = props.get('latitude')
            lon = props.get('longitude')
            props['canonical'] = canonical
            props['url'] = url
            street = props.get('street_address')
            name = props.get('name')
            planet_hash = hashlib.md5(u'|'.join(
                (name, street, str(lat), str(lon),
                 domain)).encode('utf-8')).digest()
            address_hash = hashlib.md5(u'|'.join(
                (name, street, domain)).encode('utf-8')).digest()
            props['guid'] = props.get('guid', random_guid())
            venue = venue_to_geojson(props)
            if lat is not None and lon is not None:
                try:
                    lat = float(lat)
                    lon = float(lon)
                except Exception:
                    lat = None
                    lon = None
            if lat is not None and lon is not None and planet_hash not in seen:
                cities = list(rtree.intersection((lon, lat, lon, lat)))
                if cities:
                    for c in cities:
                        # Write the venue to every matching city file, not just
                        # the last one matched.
                        files[city_names[c]].write(json.dumps(venue) + '\n')
                planet.write(json.dumps(venue) + '\n')
                seen.add(planet_hash)
            if address_hash not in seen:
                planet_addresses_only.write(json.dumps(props) + '\n')
                seen.add(address_hash)
            i += 1
            if i % 1000 == 0 and i > 0:
                logger.info('did {}'.format(i))

    logger.info('Creating manifest files')

    manifest_files = []

    for k, v in all_cities['regions'].iteritems():
        for city, data in v['cities'].iteritems():
            f = files[city]
            if f.tell() == 0:
                f.close()
                os.unlink(
                    os.path.join(output_dir, 'cities',
                                 '{}.geojson'.format(city)))
                continue

            bbox = data['bbox']
            lat = midpoint(float(bbox['top']), float(bbox['bottom']))
            lon = midpoint(float(bbox['left']), float(bbox['right']))

            manifest_files.append({
                'latitude': lat,
                'longitude': lon,
                'file': '{}.geojson'.format(city),
                'name': city.replace('_', ', ').replace('-', ' ').title()
            })

    manifest = {'files': manifest_files}

    json.dump(manifest, open(os.path.join(output_dir, 'manifest.json'), 'w'))

    logger.info('Done!')
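
Both copies of this example deduplicate venues by hashing a tuple of identifying fields. A sketch of that technique in isolation (the field names are illustrative, and missing fields are defaulted to empty strings, which the joins above would otherwise crash on):

import hashlib

seen = set()

def record_key(name, street, domain):
    # Join the identifying fields with a separator, then hash; the 16-byte
    # digest is cheaper to keep in memory than the raw strings.
    raw = u'|'.join((name or '', street or '', domain or ''))
    return hashlib.md5(raw.encode('utf-8')).digest()

records = [('Cafe A', '1 Main St', 'example.com'),
           ('Cafe A', '1 Main St', 'example.com'),  # duplicate
           ('Cafe B', '2 Main St', 'example.com')]

for rec in records:
    key = record_key(*rec)
    if key not in seen:
        seen.add(key)
        print(rec)
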
Example #18
class AdjacencyVersion(object):

  def __init__(self, feature_mapper):
    #self.partitions_complete = partitions_complete
    self.cid = 0
    self.disc_idxs = {}
    self.feature_mapper = feature_mapper
    self.radius = .15
    self.metric = 'hamming'

    self._rtree = None  # internal datastructure
    self._ndim = None
    self.clusters = []
    self.id2c = dict()
    self.c2id = dict()

  def to_json(self):
    data = {
            'clusters' : [c and c.__dict__ or None for c in self.clusters],
            'id2c' : [(key, c.__dict__) for key, c in self.id2c.items()],
            'c2id' : [(c.__dict__, val) for c, val in self.c2id.items()],
            'cid' : self.cid,
            '_ndim' : self._ndim,
            '_rtreename' : 'BLAH'
            }
    return json.dumps(data)

  def from_json(self, encoded):
    data = json.loads(encoded)
    self.clusters = [c and Cluster.from_dict(c) or None for c in data['clusters']]
    self.id2c = dict([(key, Cluster.from_dict(val)) for key, val in data['id2c']])
    self.c2id = dict([(Cluster.from_dict(key), val) for key, val in data['c2id']])
    self.cid = data['cid']
    self._ndim = data['_ndim']
    self._rtree = None

  def setup_rtree(self, ndim, clusters=None):
    if self._rtree:
        return self._rtree

    self._ndim = ndim
    if not ndim:
        class k(object):
            def __init__(self, graph):
                self.graph = graph
            def insert(self, *args, **kwargs):
                pass
            def delete(self, *args, **kwargs):
                pass
            def intersection(self, *args, **kwargs):
                return xrange(len(self.graph.clusters))
        self._rtree = k(self)
        return self._rtree


    p = RProp()
    p.dimension = max(2, ndim)
    p.dat_extension = 'data'
    p.idx_extension = 'index'

    if clusters:
        gen_func = ((i, self.bbox_rtree(c, enlarge=0.005), None) for i, c in enumerate(clusters))
        self._rtree = RTree(gen_func, properties=p)
    else:
        self._rtree = RTree(properties=p)
    return self._rtree

  def bbox_rtree(self, cluster, enlarge=0.):
    cols = cluster.cols
    bbox = cluster.bbox
    lower, higher = map(list, bbox)
    if self._ndim == 1:
      lower.append(0)
      higher.append(1)

    if enlarge != 0:
      for idx, col in enumerate(cols):
        rng = enlarge * self.feature_mapper.ranges[col]
        lower[idx] -= rng
        higher[idx] += rng

    bbox = lower + higher
    return bbox

  def insert_rtree(self, idx, cluster):
    self.setup_rtree(len(cluster.bbox[0]))
    self._rtree.insert(idx,self.bbox_rtree(cluster))
    return cluster

  def remove_rtree(self, idx, cluster):
    self.setup_rtree(len(cluster.bbox[0]))
    self._rtree.delete(idx, self.bbox_rtree(cluster))
    return cluster

  def search_rtree(self, cluster):
    self.setup_rtree(len(cluster.bbox[0]))
    bbox = self.bbox_rtree(cluster, enlarge=0.01)
    return self._rtree.intersection(bbox)

  def bulk_init(self, clusters):
    if not clusters: return

    self.setup_rtree(len(clusters[0].bbox[0]), clusters)
    self.clusters = clusters
    for cid, c in enumerate(clusters):
      self.id2c[cid] = c
      self.c2id[c] = cid
    
    for dim in self.feature_mapper.attrs:
      Xs = []
      for cidx, c in enumerate(clusters):
        Xs.append(self.feature_mapper(c, dim))
      idx = NearestNeighbors(
          radius=self.radius, 
          algorithm='ball_tree', 
          metric=self.metric
      )
      self.disc_idxs[dim] = idx
      self.disc_idxs[dim].fit(np.array(Xs))

  def contains(self, cluster):
    return cluster in self.c2id
  
  def remove(self, cluster):
    if cluster in self.c2id:
      cid = self.c2id[cluster]
      self.remove_rtree(cid, cluster)
      del self.c2id[cluster]
      del self.id2c[cid]
      self.clusters[cid] = None
      return True
    return False


  def neighbors(self, cluster):
    ret = None
    for name, vals in cluster.discretes.iteritems():
      if name not in self.disc_idxs:
        return []
      vect = self.feature_mapper(cluster, name)
      index = self.disc_idxs[name]
      dists, idxs = index.radius_neighbors(vect, radius=self.radius)
      idxs = set(idxs[0].tolist())

      if ret is None:
        ret = idxs
      else:
        ret.intersection_update(idxs)
        #ret.update(idxs)
      if not ret: return []

    idxs = self.search_rtree(cluster)
    if ret is None:
      ret = set(idxs)
    else:
      ret.intersection_update(set(idxs))

    return filter(bool, [self.clusters[idx] for idx in ret])
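
neighbors() above intersects, per discrete attribute, the candidate ids returned by a Hamming-radius query over feature vectors. A small sketch of that building block, assuming scikit-learn and NumPy:

import numpy as np
from sklearn.neighbors import NearestNeighbors

# Four binary feature vectors; Hamming distance = fraction of differing bits.
X = np.array([[1, 0, 0, 1],
              [1, 0, 1, 1],
              [0, 1, 1, 0],
              [1, 0, 0, 0]])

nn = NearestNeighbors(radius=0.25, algorithm='ball_tree', metric='hamming')
nn.fit(X)

# Rows within one differing bit out of four (0.25 * 4) of the query.
dists, idxs = nn.radius_neighbors(np.array([[1, 0, 0, 1]]))
print(sorted(idxs[0].tolist()))  # -> [0, 1, 3]
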


  """
class SpatialIndex():
    """

    A spatial index is a type of extended index that allows you to index a
    spatial column. A spatial column is a table column that contains data of a
    spatial data type.

    Spatial indexes help to improve spatial query performance on a dataframe.
    Identifying a feature, selecting features, and joining data all have better
    performance when using spatial indexing.


    ====================     ==================================================
    Argument                 Description
    --------------------     --------------------------------------------------
    stype                    Required String. This sets the type of spatial
                             index being used by the user. The current types of
                             spatial indexes are: custom, rtree and quadtree.
    --------------------     --------------------------------------------------
    bbox                     Optional Tuple. The extent of the spatial data as:
                             (xmin, ymin, xmax, ymax). This parameter is required
                             if a QuadTree Spatial Index is being used.

                             Example:
                             bbox=(-100, -50, 100, 50)
    --------------------     --------------------------------------------------
    filename                 Optional String. The name of the spatial index
                             file. This is only supported by rtree spatial
                             indexes. For large datasets an rtree index can be
                             saved to disk and used at a later time. If this is
                             not provided the r-tree index will be in-memory.
    --------------------     --------------------------------------------------
    custom_index             Optional Object. Sometimes QuadTree and Rtree
                             indexing is not enough. A custom spatial index
                             class can be given to the SpatialIndex class and
                             used via encapsulation. The custom index must
                             have two methods: `intersect`, which accepts a
                             tuple, and `insert`, which accepts an oid and a
                             bounding box. This object is required when an
                             `stype` of 'custom' is specified.
    ====================     ==================================================


    """
    _stype = None
    _bbox = None
    _index = None
    _df = None

    #----------------------------------------------------------------------
    def __init__(self, stype, bbox=None, **kwargs):
        """initializer"""
        ci = kwargs.pop('custom_index', None)
        self._filename = kwargs.pop('filename', None)
        self._bbox = bbox
        self._stype = stype.lower()
        self._df = None
        if ci and stype.lower() == 'custom':
            self._index = ci
        elif stype.lower() == 'quadtree' and bbox:
            self._index = QIndex(bbox=bbox)
        elif RIndex and stype.lower() == 'rtree':
            self._index = RIndex(self._filename)
        else:
            raise ValueError("Could not create the spatial index.")

    #----------------------------------------------------------------------
    def intersect(self, bbox):
        """
        Returns the spatial features that intersect the bbox

        :bbox: tuple - (xmin,ymin,xmax,ymax)

        :returns: list
        """
        if self._stype.lower() in ['rtree']:
            return list(self._index.intersection(bbox))
        elif self._stype.lower() in ['quadtree']:
            return list(self._index.intersect(bbox=bbox))
        else:
            return list(self._index.intersect(bbox))

    #----------------------------------------------------------------------
    def insert(self, oid, bbox):
        """
        Inserts the entry into the spatial index

        :oid: unique id
        :bbox: tuple - (xmin,ymin,xmax,ymax)
        """
        if self._index is None:
            raise Exception(("Could not insert into a spatial index because "
                             "it does not exist."))
        if self._stype == 'rtree' and \
           HASRTREE and \
           isinstance(self._index, RIndex):

            r = self._index.insert(id=oid, coordinates=bbox, obj=None)
            self.flush()
            return r
        elif self._stype.lower() == 'quadtree':
            return self._index.insert(item=oid, bbox=bbox)
        elif self._stype.lower() == 'custom':
            r = self._index.insert(oid, bbox)
            self.flush()
            return r

    #----------------------------------------------------------------------
    def flush(self):
        """
        Saves the index to disk if a filename is given for an R-Tree Spatial Index.

        **This applies only to the R-Tree implementation of the spatial index.**

        :returns: Boolean

        """
        if hasattr(self._index, 'flush'):
            self._index.flush()
        elif self._stype == 'rtree' and \
             self._filename:
            self._index.close()
            self._index = RIndex(self._filename)
        else:
            return False
        return True
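
A usage sketch for the wrapper above, following only what its docstring promises (an in-memory R-tree when no filename is supplied); it assumes the class and its RIndex dependency are importable in the current module:

si = SpatialIndex(stype='rtree')  # in-memory R-tree, no filename given
si.insert(oid=1, bbox=(0.0, 0.0, 10.0, 10.0))
si.insert(oid=2, bbox=(20.0, 20.0, 30.0, 30.0))

# intersect() returns the ids whose boxes overlap the query box.
print(sorted(si.intersect((5.0, 5.0, 25.0, 25.0))))  # -> [1, 2]
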
Example #20
class RTreeTest(unittest.TestCase):
    def Xtest_insertion(self):
        repeat = 10
        basen = 100
        boxes = [Box(i) for i in range(basen)]
        t = timeit.Timer(lambda: self.insert_boxes(boxes),
                         setup=lambda: self.create_index_data([]))
        print(t.timeit(number=repeat) / repeat)

        n = 100000
        prior_boxes = [Box(i) for i in range(n)]
        boxes = [Box(i) for i in range(n, n + basen)]
        t = timeit.Timer(
            lambda: self.insert_boxes(boxes),
            setup=lambda: self.create_index_data(prior_boxes),
        )
        print(t.timeit(number=repeat) / repeat)

    def Xtest_creation(self):
        repeat = 10
        basen = 100
        boxes = [Box(i) for i in range(basen)]
        t = timeit.Timer(lambda: self.create_index_data(boxes))
        t0 = t.timeit(number=repeat) / repeat
        print(basen, t0)
        for i in range(6):
            m = 2**(i + 1)
            n = m * basen
            boxes = [Box(i) for i in range(n)]
            t = timeit.Timer(lambda: self.create_index_data(boxes))
            t1 = t.timeit(number=repeat) / repeat
            print(n, m, t1, t1 / t0)

    def Xtest_stream(self):
        repeat = 10
        n = 10000

        boxes = []
        for i in range(n):
            boxes.append(Box(i))

        def box_generator():
            for b in boxes:
                yield (b.index, b.box, b.index)

        t = timeit.Timer(lambda: self.create_index_data(boxes))
        print(t.timeit(number=repeat) / repeat)

        t = timeit.Timer(lambda: self.create_index_stream(box_generator()))
        print(t.timeit(number=repeat) / repeat)

    def Xtest_query(self):
        repeat = 10
        boxes = [Box(i) for i in range(100)]
        self.create_index_data(boxes)
        test_boxes = random.sample(boxes, 10)

        t = timeit.Timer(lambda: self.query_index(test_boxes))
        print(t.timeit(number=repeat) / repeat)

        boxes = [Box(i) for i in range(100000)]
        self.create_index_data(boxes)
        test_boxes = random.sample(boxes, 10)

        t = timeit.Timer(lambda: self.query_index(test_boxes))
        print(t.timeit(number=repeat) / repeat)

    def insert_boxes(self, boxes):
        for b in boxes:
            self.idx.insert(b.index, b.box)

    def create_index_data(self, data):
        self.idx = Index()
        for d in data:
            self.idx.insert(d.index, d.box, d.index)

    def create_index_stream(self, generator):
        self.idx = Index(generator)

    def query_index(self, boxes):
        for b in boxes:
            # Materialize the generator so each query actually executes.
            overlapping_boxes = list(self.idx.intersection(b.box))
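
The benchmark references a Box helper that is not shown in this excerpt. A minimal, hypothetical stand-in consistent with how it is used here (b.index as the id, b.box as interleaved bounds):

import random

class Box(object):
    # Hypothetical stand-in: an id plus a small random box
    # given as interleaved bounds (minx, miny, maxx, maxy).
    def __init__(self, index, size=1.0, extent=100.0):
        self.index = index
        x = random.uniform(0, extent)
        y = random.uniform(0, extent)
        self.box = (x, y, x + size, y + size)
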