import json

from rtree.index import Index
from shapely.geometry import asShape


class StreetIndex(object):

    def __init__(self, streets_file):
        # Index of segment endpoints: a positive id marks a segment's first
        # coordinate, the negative id its last.
        self.idx = Index()
        with open(streets_file) as f:
            for line in f:
                street = json.loads(line)
                street_id = street['properties']['id']
                street_shape = asShape(street['geometry'])
                for i in range(len(street_shape.geoms)):
                    seg_id = self.encode_seg_id(i, street_id)
                    self.idx.insert(seg_id, street_shape.geoms[i].coords[0])
                    self.idx.insert(-seg_id, street_shape.geoms[i].coords[-1])

        # Separate index of whole-street bounding boxes.
        self.bb_idx = Index()
        with open(streets_file) as f:
            for line in f:
                street = json.loads(line)
                street_id = int(street['properties']['id'])
                street_shape = asShape(street['geometry'])
                self.bb_idx.insert(street_id, list(street_shape.bounds))

    def encode_seg_id(self, i, street_id):
        return i * 1000000 + int(street_id)

    def decode_seg_id(self, seg_id):
        # Invert encode_seg_id: strip the segment index to recover the
        # street id (integer division, not float division).
        i = abs(seg_id) // 1000000
        return abs(seg_id) - i * 1000000

    def find_nearest_street(self, shape):
        shape = asShape(shape['geometry'])
        shape_type = shape.geom_type
        if shape_type == 'Polygon' or shape_type == 'MultiPolygon':
            ref_point = (float(shape.centroid.coords.xy[0][0]),
                         float(shape.centroid.coords.xy[1][0]))
        else:
            ref_point = (float(shape.coords.xy[0][0]),
                         float(shape.coords.xy[1][0]))
        street_id = list(self.bb_idx.nearest(ref_point))[0]
        return str(street_id)

    def find_connected_street(self, street):
        street_id = int(street['properties']['id'])
        street_shape = asShape(street['geometry'])
        street_start = street_shape.geoms[0].coords[0]
        street_end = street_shape.geoms[-1].coords[-1]
        seg_ids = list(self.idx.intersection(street_start))
        seg_ids += list(self.idx.intersection(street_end))
        street_ids = set(map(self.decode_seg_id, seg_ids))
        if street_id in street_ids:
            street_ids.remove(street_id)
        return street_ids
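
# A minimal round-trip sketch of the segment-id scheme above, with
# hypothetical values (segment 3 of street id 42). The sign of the packed id
# distinguishes a segment's start from its end, and decoding is only
# unambiguous while street ids stay below 1,000,000.
seg = 3 * 1000000 + 42            # encode_seg_id(3, '42')
assert abs(-seg) // 1000000 == 3  # segment index
assert abs(-seg) % 1000000 == 42  # street id, as decode_seg_id recovers it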
import uuid

from rtree.index import Index, Property


def demo_delete():
    seed = 1  # Seed for random points
    countries = get_countries()
    country_id_to_remove = 170  # United States of America
    country_uuids_to_remove = []  # Polygons' ids to remove from the index

    properties = Property()
    # properties.writethrough = True
    # properties.leaf_capacity = 1000
    # properties.fill_factor = 0.5
    index = Index(properties=properties)

    points_per_polygon = 1
    points = []

    # Insert countries data into the index
    for i, (country_name, geometry) in enumerate(countries):
        for polygon in get_polygons(geometry):
            temp_uuid = uuid.uuid1().int
            index.insert(temp_uuid, polygon.bounds, country_name)
            if i == country_id_to_remove:
                # Save index ids of the polygons to be removed later
                country_uuids_to_remove.append(temp_uuid)

            # Generate random points in every polygon and save them
            random_points = gen_random_point(points_per_polygon, polygon, seed)
            points.append((country_name, random_points))

    # Check that every generated point has matches
    for (country_name, country_points) in points:
        for point in country_points:
            hits = list(index.intersection(point.bounds, objects=True))
            assert any(hit.object == country_name for hit in hits)

    # Remove geometry
    geometry = countries[country_id_to_remove][1]
    for i, polygon in enumerate(get_polygons(geometry)):
        index.delete(country_uuids_to_remove[i], polygon.bounds)

    points_missing = []

    # Check (again) whether every generated point has matches
    for (country_name, country_points) in points:
        for point in country_points:
            hits = list(index.intersection(point.bounds, objects=True))
            # Save any point without matches
            if not any(hit.object == country_name for hit in hits):
                points_missing.append(str(point) + " - " + country_name)

    # Print missing points
    for point in points_missing:
        print(point)
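
# Standalone sketch of the rtree delete contract the demo relies on:
# Index.delete takes the id *and* the coordinates used at insertion time,
# which is why the demo recomputes polygon.bounds when removing a country.
from rtree.index import Index

_idx = Index()
_idx.insert(7, (0.0, 0.0, 1.0, 1.0))
_idx.delete(7, (0.0, 0.0, 1.0, 1.0))  # must match the inserted bounds
assert list(_idx.intersection((0.0, 0.0, 1.0, 1.0))) == []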
def test_tpr(self):
    # TODO : this freezes forever on some windows cloud builds
    if os.name == 'nt':
        return

    # Cartesians list for brute force
    objects = dict()
    tpr_tree = Index(properties=Property(type=RT_TPRTree))

    for operation, t_now, object_ in data_generator():
        if operation == "INSERT":
            tpr_tree.insert(object_.id, object_.get_coordinates())
            objects[object_.id] = object_
        elif operation == "DELETE":
            tpr_tree.delete(object_.id, object_.get_coordinates(t_now))
            del objects[object_.id]
        elif operation == "QUERY":
            tree_intersect = set(
                tpr_tree.intersection(object_.get_coordinates()))

            # Brute intersect
            brute_intersect = set()
            for tree_object in objects.values():
                x_low, y_low = tree_object.getXY(object_.start_time)
                x_high, y_high = tree_object.getXY(object_.end_time)
                if intersects(
                        x_low, y_low, x_high, y_high,  # Line
                        object_.x, object_.y, object_.dx, object_.dy):  # Rect
                    brute_intersect.add(tree_object.id)

            # Tree should match brute force approach
            assert tree_intersect == brute_intersect
def compute_indicatormatrix(orig, dest, orig_proj='latlong', dest_proj='latlong'):
    """
    Compute the indicatormatrix

    The indicatormatrix I[i,j] is a sparse representation of the ratio
    of the area in orig[j] lying in dest[i], where orig and dest are
    collections of polygons, i.e. a value of I[i,j] = 1 indicates that
    the shape orig[j] is fully contained in shape dest[i].

    Note that the polygons must be in the same crs.

    Parameters
    ----------
    orig : Collection of shapely polygons
    dest : Collection of shapely polygons

    Returns
    -------
    I : sp.sparse.lil_matrix
      Indicatormatrix
    """
    dest = reproject_shapes(dest, dest_proj, orig_proj)
    indicator = sp.sparse.lil_matrix((len(dest), len(orig)), dtype=float)

    try:
        from rtree.index import Index

        idx = Index()
        for j, o in enumerate(orig):
            idx.insert(j, o.bounds)

        for i, d in enumerate(dest):
            for j in idx.intersection(d.bounds):
                o = orig[j]
                area = d.intersection(o).area
                indicator[i, j] = area / o.area

    except ImportError:
        logger.warning(
            "Rtree is not available. Falling back to slower algorithm.")

        dest_prepped = list(map(prep, dest))

        for i, j in product(range(len(dest)), range(len(orig))):
            if dest_prepped[i].intersects(orig[j]):
                area = dest[i].intersection(orig[j]).area
                indicator[i, j] = area / orig[j].area

    return indicator
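
# Standalone sketch of the pruning pattern used above: the R-tree narrows the
# candidate pairs to those whose bounding boxes overlap before the expensive
# exact shapely intersection runs. Toy boxes below are illustrative only.
from rtree.index import Index
from shapely.geometry import box

orig = [box(0, 0, 1, 1), box(5, 5, 6, 6)]
dest = [box(0.5, 0.5, 1.5, 1.5)]
idx = Index((j, o.bounds, None) for j, o in enumerate(orig))
for i, d in enumerate(dest):
    for j in idx.intersection(d.bounds):
        print(i, j, d.intersection(orig[j]).area / orig[j].area)  # 0 0 0.25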
class Mesh2D:
    """!
    The general representation of mesh in Serafin 2D.
    The basis for interpolation, volume calculations etc.
    """

    def __init__(self, input_header, construct_index=False, iter_pbar=lambda x: x):
        """!
        @param input_header <slf.Serafin.SerafinHeader>: input Serafin header
        @param construct_index <bool>: perform the index construction
        @param iter_pbar: iterable progress bar
        """
        self.x = input_header.x[:input_header.nb_nodes_2d]
        self.y = input_header.y[:input_header.nb_nodes_2d]
        self.ikle = input_header.ikle_2d - 1  # back to 0-based indexing
        self.triangles = {}
        self.nb_points = self.x.shape[0]
        self.nb_triangles = self.ikle.shape[0]
        self.points = np.stack([self.x, self.y], axis=1)
        if not construct_index:
            self.index = Index()
        else:
            self._construct_index(iter_pbar)

    def _construct_index(self, iter_pbar):
        """!
        Separate the index construction from the constructor, allowing a GUI override
        @param iter_pbar: iterable progress bar
        """
        self.index = Index()
        for i, j, k in iter_pbar(self.ikle, unit='elements'):
            t = Polygon([self.points[i], self.points[j], self.points[k]])
            self.triangles[i, j, k] = t
            self.index.insert(i, t.bounds, obj=(i, j, k))

    def get_intersecting_elements(self, bounding_box):
        """!
        @brief Return the triangles in the mesh intersecting the bounding box
        @param bounding_box <tuple>: (left, bottom, right, top) of a 2d geometrical object
        @return <[tuple]>: The list of triangles (i,j,k) intersecting the bounding box
        """
        return list(self.index.intersection(bounding_box, objects='raw'))
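
# Sketch of the objects='raw' behaviour Mesh2D depends on: when an entry is
# inserted with obj=(i, j, k), intersection(..., objects='raw') yields those
# stored payloads directly instead of rtree Item wrappers.
from rtree.index import Index

_idx = Index()
_idx.insert(0, (0.0, 0.0, 1.0, 1.0), obj=(0, 1, 2))
assert list(_idx.intersection((0.0, 0.0, 2.0, 2.0), objects='raw')) == [(0, 1, 2)]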
def local_search(points, bounding_box, iterations):
    labeled_points = [p for p in points if p.text]

    items = []
    items.extend([p.label for p in labeled_points])
    items.extend(points)
    items.extend(bounding_box.border_config)

    idx = Index()
    for i, item in enumerate(items):
        item.index = i
        idx.insert(item.index, item.box)

    for i in range(iterations):
        for lp in labeled_points:
            best_candidate = None
            min_penalty = None

            for lc1 in lp.label_candidates:
                penalty = POSITION_WEIGHT * lc1.position

                # Check overlap with other labels and points
                intersecting_item_ids = idx.intersection(lc1.box)
                for item_id in intersecting_item_ids:
                    item = items[item_id]
                    if hasattr(item, "point") and lc1.point == item.point:
                        continue
                    penalty += item.overlap(lc1)

                if min_penalty is None or penalty < min_penalty:
                    min_penalty = penalty
                    best_candidate = lc1

            # Remove the old label from the index
            idx.delete(lp.label.index, lp.label.box)

            # Select the new label
            best_candidate.select()

            # Add the new label to the index and item list, keeping its index
            # attribute in sync so it can be deleted on the next iteration
            lp.label.index = len(items)
            idx.insert(lp.label.index, lp.label.box)
            items.append(lp.label)
def build_cache(self):
    label_candidates = []
    for p in self.points:
        label_candidates.extend(p.label_candidates)

    items = []
    items.extend(label_candidates)
    items.extend(self.points)
    items.extend(self.bounding_box.border_config)

    idx = Index()
    for i, item in enumerate(items):
        item.index = i
        idx.insert(i, item.box)

    for lc in label_candidates:
        lc.penalty = POSITION_WEIGHT * lc.position
        lc.label_penalties = [0 for i in range(len(label_candidates))]

        intersecting_item_ids = idx.intersection(lc.box)
        bbox_counted = False
        for item_id in intersecting_item_ids:
            item = items[item_id]

            if item == lc or item == lc.point:
                continue

            if isinstance(item, Label):
                if lc.point == item.point:
                    continue
                else:
                    lc.label_penalties[item.index] = item.overlap(lc)
                    continue

            if isinstance(item, BoundingBoxBorder):
                if bbox_counted:
                    continue
                bbox_counted = True

            lc.penalty += item.overlap(lc)
class Domain(object):
    '''
    A class used to facilitate computational geometry operations on a
    domain defined by a closed collection of simplices (e.g., line
    segments or triangular facets). This class can optionally also
    make use of an R-tree, which can substantially reduce the
    computational complexity of some operations.

    Parameters
    ----------
    vertices : (n, d) float array
        The vertices making up the domain

    simplices : (m, d) int array
        The connectivity of the vertices

    '''
    def __init__(self, vertices, simplices):
        vertices = np.asarray(vertices, dtype=float)
        simplices = np.asarray(simplices, dtype=int)
        assert_shape(vertices, (None, None), 'vertices')
        dim = vertices.shape[1]
        assert_shape(simplices, (None, dim), 'simplices')

        self.vertices = vertices
        self.simplices = simplices
        self.dim = dim
        self.rtree = None
        self.normals = geo.simplex_normals(vertices, simplices)

    def __repr__(self):
        return ('<Domain : '
                'vertex count=%s, '
                'simplex count=%s, '
                'using R-tree=%s>' %
                (self.vertices.shape[0],
                 self.simplices.shape[0],
                 self.rtree is not None))

    def __getstate__(self):
        # Define how pickling behaves for this class. The __getstate__
        # and __setstate__ methods are required because `rtree` does
        # not properly pickle. So we instead save a flag indicating
        # whether we need to rebuild `rtree` upon unpickling.

        # create a shallow copy of the instance's dict so that we do
        # not mess with its attributes
        state = dict(self.__dict__)
        rtree = state.pop('rtree')
        if rtree is None:
            state['has_rtree'] = False
        else:
            logger.debug(
                'the R-tree cannot be pickled and it will be rebuilt '
                'upon unpickling')
            state['has_rtree'] = True

        return state

    def __setstate__(self, state):
        has_rtree = state.pop('has_rtree')
        self.__dict__ = state
        self.rtree = None
        if has_rtree:
            self.build_rtree()

    def build_rtree(self):
        '''
        Construct an R-tree for the domain. This may reduce the
        computational complexity of the methods `intersection_count`,
        `contains`, `orient_simplices`, and `snap`.
        '''
        # create a bounding box for each simplex and add those
        # bounding boxes to the R-tree
        if self.rtree is not None:
            # do nothing because the R-tree already exists
            logger.debug('R-tree already exists')
            return

        smp_min = self.vertices[self.simplices].min(axis=1)
        smp_max = self.vertices[self.simplices].max(axis=1)
        bounds = np.hstack((smp_min, smp_max))

        p = Property()
        p.dimension = self.dim
        self.rtree = Index(properties=p)
        for i, bnd in enumerate(bounds):
            self.rtree.add(i, bnd)

    def orient_simplices(self):
        '''
        Orient the simplices so that the normal vectors point outward.
        '''
        # length scale of the domain
        scale = self.vertices.ptp(axis=0).max()
        dx = 1e-10*scale
        # find the normal for each simplex
        norms = geo.simplex_normals(self.vertices, self.simplices)
        # find the centroid for each simplex
        points = np.mean(self.vertices[self.simplices], axis=1)
        # push points in the direction of the normals
        points += dx*norms
        # find which simplices are oriented such that their normals
        # point inside
        faces_inside = self.contains(points)
        # make a copy of simplices because we are modifying it in
        # place
        new_smp = np.array(self.simplices, copy=True)
        # flip the order of the simplices that are backwards
        flip_smp = new_smp[faces_inside]
        flip_smp[:, [0, 1]] = flip_smp[:, [1, 0]]
        new_smp[faces_inside] = flip_smp

        self.simplices = new_smp
        # remake the normal vectors with the reoriented simplices
        self.normals = geo.simplex_normals(self.vertices, new_smp)

    def intersection_count(self, start_points, end_points):
        '''
        Counts the number of times the line segments intersect the
        boundary.
        Parameters
        ----------
        start_points, end_points : (n, d) float array
            The ends of the line segments

        Returns
        -------
        (n,) int array
            The number of boundary intersections

        '''
        start_points = np.asarray(start_points, dtype=float)
        end_points = np.asarray(end_points, dtype=float)
        assert_shape(start_points, (None, self.dim), 'start_points')
        assert_shape(end_points, start_points.shape, 'end_points')
        n = start_points.shape[0]

        if self.rtree is None:
            return geo.intersection_count(
                start_points, end_points,
                self.vertices, self.simplices)
        else:
            out = np.zeros(n, dtype=int)
            # get the bounding boxes around each segment
            bounds = np.hstack((np.minimum(start_points, end_points),
                                np.maximum(start_points, end_points)))
            for i, bnd in enumerate(bounds):
                # get a list of simplices which could potentially be
                # intersected by segment i
                potential_smpid = list(self.rtree.intersection(bnd))
                if not potential_smpid:
                    # if the segment bounding box does not intersect
                    # any simplex bounding boxes, then there is no
                    # intersection
                    continue

                out[[i]] = geo.intersection_count(
                    start_points[[i]], end_points[[i]],
                    self.vertices, self.simplices[potential_smpid])

            return out

    def intersection_point(self, start_points, end_points):
        '''
        Finds the point on the boundary intersected by the line
        segments. A `ValueError` is raised if no intersection is
        found.

        Parameters
        ----------
        start_points, end_points : (n, d) float array
            The ends of the line segments

        Returns
        -------
        (n, d) float array
            The intersection point

        (n,) int array
            The simplex containing the intersection point

        '''
        # don't bother using the tree for this one
        return geo.intersection_point(
            start_points, end_points,
            self.vertices, self.simplices)

    def contains(self, points):
        '''
        Identifies whether the points are within the domain

        Parameters
        ----------
        points : (n, d) float array

        Returns
        -------
        (n,) bool array

        '''
        points = np.asarray(points, dtype=float)
        assert_shape(points, (None, self.dim), 'points')
        # to find out if the points are inside the domain, we create
        # another set of points which are definitively outside the
        # domain, and then we count the number of boundary
        # intersections between `points` and the new points.

        # get the min value and width of the domain along axis 0
        xwidth = self.vertices[:, 0].ptp()
        xmin = self.vertices[:, 0].min()
        # the outside points are directly to the left of `points` plus
        # a small random perturbation. The subsequent bounding boxes
        # are going to be very narrow, meaning that the R-tree will
        # efficiently winnow down the potential intersecting
        # simplices.
        outside_points = np.array(points, copy=True)
        outside_points[:, 0] = xmin - xwidth
        outside_points += np.random.uniform(
            -0.001*xwidth, 0.001*xwidth, points.shape)
        count = self.intersection_count(points, outside_points)
        # If the segment intersects the boundary an odd number of
        # times, then the point is inside the domain, otherwise it is
        # outside
        out = np.array(count % 2, dtype=bool)
        return out

    def snap(self, points, delta=0.5):
        '''
        Snaps `points` to the nearest points on the boundary if they
        are sufficiently close to the boundary. A point is
        sufficiently close if the distance to the boundary is less
        than `delta` times the distance to its nearest neighbor.

        Parameters
        ----------
        points : (n, d) float array

        delta : float, optional

        Returns
        -------
        (n, d) float array
            The new points after snapping to the boundary

        (n,) int array
            The simplex that the points are snapped to. If a point is
            not snapped to the boundary then its corresponding value
            will be -1.
        '''
        points = np.asarray(points, dtype=float)
        assert_shape(points, (None, self.dim), 'points')
        n = points.shape[0]
        out_smpid = np.full(n, -1, dtype=int)
        out_points = np.array(points, copy=True)
        nbr_dist = KDTree(points).query(points, 2)[0][:, 1]
        snap_dist = delta*nbr_dist
        if self.rtree is None:
            nrst_pnt, nrst_smpid = geo.nearest_point(
                points, self.vertices, self.simplices)
            nrst_dist = np.linalg.norm(nrst_pnt - points, axis=1)
            snap = nrst_dist < snap_dist
            out_points[snap] = nrst_pnt[snap]
            out_smpid[snap] = nrst_smpid[snap]
        else:
            # create bounding boxes around the snapping regions for
            # each point
            bounds = np.hstack((points - snap_dist[:, None],
                                points + snap_dist[:, None]))
            for i, bnd in enumerate(bounds):
                # get a list of simplices which node i could
                # potentially snap to
                potential_smpid = list(self.rtree.intersection(bnd))
                # sort the list to ensure consistent output
                potential_smpid.sort()
                if not potential_smpid:
                    # no simplices are within the snapping distance
                    continue

                # get the nearest point to the potential simplices and
                # the simplex containing the nearest point
                nrst_pnt, nrst_smpid = geo.nearest_point(
                    points[[i]], self.vertices,
                    self.simplices[potential_smpid])
                nrst_dist = np.linalg.norm(points[i] - nrst_pnt[0])
                # if the nearest point is within the snapping distance
                # then snap
                if nrst_dist < snap_dist[i]:
                    out_points[i] = nrst_pnt[0]
                    out_smpid[i] = potential_smpid[nrst_smpid[0]]

        return out_points, out_smpid
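
# Sketch of the dimension handling build_rtree relies on: rtree indexes
# default to 2-D, so a Property with the domain's dimension must be set
# before adding 3-D simplex bounds of the form
# (xmin, ymin, zmin, xmax, ymax, zmax).
from rtree.index import Index, Property

p = Property()
p.dimension = 3
tree = Index(properties=p, interleaved=True)
tree.add(0, (0.0, 0.0, 0.0, 1.0, 1.0, 1.0))
assert list(tree.intersection((0.5, 0.5, 0.5, 2.0, 2.0, 2.0))) == [0]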
for polygon in polygons:
    index.insert(count, polygon.bounds)
    count += 1

# recursively loop over every directory
for root, directories, filenames in os.walk('root'):
    for filename in filenames:
        obj = None
        with open(os.path.join(root, filename), 'r') as f:
            bb = f.readline()
        # NOTE: eval of file contents is unsafe; ast.literal_eval would be a
        # safer choice for a plain tuple literal
        tpl = eval(bb)
        r = Rect(*tpl)
        # point = Point(*r.centre_point)
        records = []
        # for j in index.nearest(r.rtree_bb(), 1):
        for j in index.intersection(r.rtree_bb()):
            shapefile = shapefile_records[j]
            records.append(shapefile)
        if len(records) == 1:
            super_group = records[0]['properties']['SPRGRP']
            group = records[0]['properties']['GRP']
            sub_group = records[0]['properties']['SUBGRP']
            region = records[0]['properties']['SUB_REGION']
        elif len(records) > 1:
            # mode
            super_groups, groups, sub_groups, regions = [], [], [], []
            for record in records:
                super_groups.append(record['properties']['SPRGRP'])
                groups.append(record['properties']['GRP'])
                sub_groups.append(record['properties']['SUBGRP'])
class RectIndex(object):
    """An R-tree that stores all tracks on a layer."""

    def __init__(self, resolution, basename=None, overwrite=False):
        # type: (float, Optional[str], bool) -> None
        self._res = resolution
        self._cnt = 0
        if basename is None:
            self._index = Index(interleaved=True)
        else:
            p = Property(overwrite=overwrite)
            self._index = Index(basename, interleaved=True, properties=p)

    @property
    def bound_box(self):
        # type: () -> BBox
        xl, yb, xr, yt = self._index.bounds
        return BBox(int(xl), int(yb), int(xr), int(yt), self._res,
                    unit_mode=True)

    def close(self):
        self._index.close()

    def record_box(self, box, dx, dy):
        # type: (BBox, int, int) -> None
        """Record the given BBox."""
        sp_box = box.expand(dx=dx, dy=dy, unit_mode=True)
        bnds = sp_box.get_bounds(unit_mode=True)
        obj = (box.left_unit, box.bottom_unit, box.right_unit, box.top_unit,
               dx, dy)
        self._index.insert(self._cnt, bnds, obj=obj)
        self._cnt += 1

    def rect_iter(self):
        # type: () -> Generator[Tuple[BBox, int, int], None, None]
        for xl, yb, xr, yt, sdx, sdy in self._index.intersection(
                self._index.bounds, objects='raw'):
            box_real = BBox(xl, yb, xr, yt, self._res, unit_mode=True)
            yield box_real, sdx, sdy

    def intersection_iter(self, box, dx=0, dy=0):
        # type: (BBox, int, int) -> Generator[BBox, None, None]
        """Finds all bounding boxes that intersect the given box."""
        res = self._res
        test_box = box.expand(dx=dx, dy=dy, unit_mode=True)
        box_iter = self._index.intersection(
            test_box.get_bounds(unit_mode=True), objects='raw')
        for xl, yb, xr, yt, sdx, sdy in box_iter:
            box_real = BBox(xl, yb, xr, yt, res, unit_mode=True)
            box_sp = box_real.expand(dx=sdx, dy=sdy, unit_mode=True)
            if box_sp.overlaps(box) or test_box.overlaps(box_real):
                yield box_real.expand(dx=max(dx, sdx), dy=max(dy, sdy),
                                      unit_mode=True)

    def intersection_rect_iter(self, box):
        # type: (BBox) -> Generator[BBox, None, None]
        """Finds all bounding boxes that intersect the given box."""
        res = self._res
        box_iter = self._index.intersection(box.get_bounds(unit_mode=True),
                                            objects='raw')
        for xl, yb, xr, yt, sdx, sdy in box_iter:
            yield BBox(xl, yb, xr, yt, res, unit_mode=True)
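
# Minimal sketch of the on-disk mode RectIndex wraps: passing a basename makes
# rtree persist the index as <basename>.dat / <basename>.idx, and
# Property(overwrite=...) controls whether an existing pair is replaced.
# 'tracks' is a hypothetical basename.
from rtree.index import Index, Property

p = Property(overwrite=True)
disk_idx = Index('tracks', interleaved=True, properties=p)
disk_idx.insert(0, (0, 0, 100, 10))
disk_idx.close()  # flushes to tracks.dat / tracks.idx, as RectIndex.close() does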
class DyClee:
    """
    Implementation roughly as per
    https://doi.org/10.1016/j.patcog.2019.05.024.
    """

    def __init__(self, context: DyCleeContext):
        self.context = context

        self.dense_µclusters: Set[MicroCluster] = Set()
        self.semidense_µclusters: Set[MicroCluster] = Set()
        self.outlier_µclusters: Set[MicroCluster] = Set()
        self.long_term_memory: Set[MicroCluster] = Set()
        self.eliminated: Set[MicroCluster] = Set()

        self.next_µcluster_index: int = 0
        self.next_class_label: int = 0
        self.n_steps: int = 0
        self.last_partitioning_step: int = 0
        self.last_density_step: int = 0

        if self.context.maintain_rtree:
            p = RTreeProperty(dimension=self.context.n_features)
            self.rtree = RTreeIndex(properties=p)
            # This mapping is used to retrieve microcluster objects from their
            # hashes stored with their locations in the R*-tree
            self.µcluster_map: Optional[dict[int, MicroCluster]] = {}
        else:
            self.rtree = None
            self.µcluster_map = None

    @property
    def active_µclusters(self) -> Set[MicroCluster]:
        return self.dense_µclusters | self.semidense_µclusters

    @property
    def all_µclusters(self) -> Set[MicroCluster]:
        return (self.active_µclusters | self.outlier_µclusters
                | self.long_term_memory)

    def get_next_µcluster_index(self) -> int:
        index = self.next_µcluster_index
        self.next_µcluster_index += 1
        return index

    def get_next_class_label(self) -> int:
        label = self.next_class_label
        self.next_class_label += 1
        return label

    def update_density_partitions(self, time: Timestamp) -> Set[MicroCluster]:
        densities = np.array(
            [µcluster.density(time) for µcluster in self.all_µclusters])
        mean_density = np.mean(densities)
        median_density = np.median(densities)

        dense: Set[MicroCluster] = Set()
        semidense: Set[MicroCluster] = Set()
        outliers: Set[MicroCluster] = Set()
        memory: Set[MicroCluster] = Set()
        eliminated: Set[MicroCluster] = Set()

        for µcluster in self.all_µclusters:
            density = µcluster.density(time)

            if mean_density <= density >= median_density:
                # Any may become dense
                dense.add(µcluster)
                µcluster.once_dense = True
            elif (µcluster in self.dense_µclusters
                  or µcluster in self.semidense_µclusters
                  or µcluster in self.outlier_µclusters) and (
                      density >= mean_density) != (density >= median_density):
                # Dense and outliers may become semi-dense
                # Semi-dense may stay semi-dense
                semidense.add(µcluster)
            elif ((µcluster in self.dense_µclusters
                   or µcluster in self.semidense_µclusters)
                  and mean_density > density < median_density) or (
                      µcluster in self.outlier_µclusters
                      and density >= self.context.elimination_threshold):
                # Dense and semi-dense may become outliers
                # Outliers may stay outliers
                outliers.add(µcluster)
            elif (self.context.long_term_memory
                  and µcluster in self.outlier_µclusters
                  and µcluster.once_dense):
                # Outliers may be put into long-term memory
                memory.add(µcluster)
            else:
                # If none of the conditions are met, the microcluster is
                # eliminated
                eliminated.add(µcluster)

                if self.context.maintain_rtree:
                    # Remove microcluster from R*-tree
                    self.rtree.delete(hash(µcluster), µcluster.bounding_box)

        # Store the final sets, sorting by index for predictable ordering
        self.dense_µclusters = Set(sorted(dense, key=lambda µ: µ.index))
        self.semidense_µclusters = Set(sorted(semidense, key=lambda µ: µ.index))
        self.outlier_µclusters = Set(sorted(outliers, key=lambda µ: µ.index))
        self.long_term_memory = Set(sorted(memory, key=lambda µ: µ.index))

        if self.context.store_elements:
            # Keep track of eliminated microclusters (to not lose elements)
            self.eliminated |= eliminated

        return eliminated

    def distance_step(self, element: Element, time: Timestamp) -> MicroCluster:
        if self.context.update_ranges:
            self.context.update_feature_ranges(element)

        if not self.all_µclusters:
            # Create new microcluster
            µcluster = MicroCluster(element, time, context=self.context,
                                    index=self.get_next_µcluster_index())
            self.outlier_µclusters.add(µcluster)

            if self.context.maintain_rtree:
                # Add microcluster to R*-tree
                self.µcluster_map[hash(µcluster)] = µcluster
                self.rtree.insert(hash(µcluster), µcluster.bounding_box)

            return µcluster
        else:
            closest: Optional[MicroCluster] = None

            if self.context.distance_index == SpatialIndexMethod.RTREE:
                # The R*-tree searches all microclusters regardless of
                # precedence, so we need to filter by priority after the
                # index search

                # Find all reachable microclusters
                matches: Set[MicroCluster] = Set([
                    self.µcluster_map[hash_]
                    for hash_ in self.rtree.intersection((*element, *element))
                ])

                min_dist = None

                for candidate_µclusters in (self.active_µclusters,
                                            self.outlier_µclusters,
                                            self.long_term_memory):
                    # First match active microclusters, then others
                    for µcluster in matches & candidate_µclusters:
                        dist = µcluster.distance(element)

                        if (closest is None or dist < min_dist
                                or (dist == min_dist
                                    and µcluster.density(time)
                                    > closest.density(time))):
                            closest = µcluster
                            min_dist = dist
            else:
                for candidate_µclusters in (self.active_µclusters,
                                            self.outlier_µclusters,
                                            self.long_term_memory):
                    # First search actives, then others for reachable
                    # microclusters
                    if not candidate_µclusters:
                        continue

                    if self.context.distance_index == SpatialIndexMethod.KDTREE:
                        # Ensure predictable order for indexability
                        candidate_µclusters = list(candidate_µclusters)

                        candidate_centroids: np.ndarray = np.row_stack([
                            µcluster.centroid
                            for µcluster in candidate_µclusters
                        ])

                        # Find potentially reachable microclusters (using
                        # L-inf norm)
                        idcs, = KDTree(candidate_centroids, p=np.inf).query_radius(
                            np.reshape(element, (1, -1)),
                            self.context.potentially_reachable_radius)

                        if not len(idcs):
                            continue

                        min_dist = None

                        # Find closest (L-1 norm) microcluster among the
                        # reachable ones
                        for i in idcs:
                            µcluster = candidate_µclusters[i]

                            if not µcluster.is_reachable(element):
                                continue

                            dist = µcluster.distance(element)

                            # Higher density is tie-breaker in case of equal
                            # distances
                            if (closest is None or dist < min_dist
                                    or (dist == min_dist
                                        and µcluster.density(time)
                                        > closest.density(time))):
                                closest = µcluster
                                min_dist = dist
                    else:
                        # Brute force
                        min_dist = None

                        for µcluster in candidate_µclusters:
                            if not µcluster.is_reachable(element):
                                continue

                            dist = µcluster.distance(element)

                            if (closest is None or dist < min_dist
                                    or (dist == min_dist
                                        and µcluster.density(time)
                                        > closest.density(time))):
                                closest = µcluster
                                min_dist = dist

                    if closest is not None:
                        # Match found, no need to check next set
                        break

            if closest is not None:
                if self.context.maintain_rtree:
                    # Remove microcluster from R*-tree
                    self.rtree.delete(hash(closest), closest.bounding_box)

                # Add element to closest microcluster
                closest.add(element, time)

                if self.context.maintain_rtree:
                    # Add modified microcluster to R*-tree
                    self.rtree.insert(hash(closest), closest.bounding_box)

                return closest
            else:
                # Create new microcluster
                µcluster = MicroCluster(element, time, context=self.context,
                                        index=self.get_next_µcluster_index())
                self.outlier_µclusters.add(µcluster)

                if self.context.maintain_rtree:
                    # Add microcluster to R*-tree
                    self.µcluster_map[hash(µcluster)] = µcluster
                    self.rtree.insert(hash(µcluster), µcluster.bounding_box)

                return µcluster

    def global_density_step(
            self, time: Timestamp) -> tuple[list[Cluster], Set[MicroCluster]]:
        clusters: list[Cluster] = []
        seen: Set[MicroCluster] = Set()

        for µcluster in self.dense_µclusters:
            if µcluster in seen:
                continue

            seen.add(µcluster)

            if µcluster.label is None:
                µcluster.label = self.get_next_class_label()

            cluster = Cluster(µcluster, time)
            clusters.append(cluster)

            # Get dense and semi-dense directly connected neighbours
            connected = µcluster.get_neighbours(
                (self.dense_µclusters | self.semidense_µclusters) - seen,
                rtree_index=self.rtree, µcluster_map=self.µcluster_map)

            while connected:
                neighbour = connected.pop()

                if neighbour in seen:
                    continue

                seen.add(neighbour)

                # Outlier microclusters are ignored
                if neighbour in self.outlier_µclusters:
                    continue

                # Dense and semi-dense microclusters become part of the
                # cluster
                neighbour.label = µcluster.label
                cluster.add(neighbour, time)

                # Semi-dense neighbours may only form the boundary
                if neighbour not in self.dense_µclusters:
                    continue

                # Get neighbour's dense and semi-dense directly connected
                # neighbours and add to set of microclusters connected to the
                # parent
                connected |= neighbour.get_neighbours(
                    (self.dense_µclusters | self.semidense_µclusters) - seen,
                    rtree_index=self.rtree, µcluster_map=self.µcluster_map)

        # Find all microclusters that were not grouped into a cluster
        unclustered = self.all_µclusters

        for cluster in clusters:
            unclustered -= cluster.µclusters

        return clusters, unclustered

    def local_density_step(
            self, time: Timestamp) -> tuple[list[Cluster], Set[MicroCluster]]:
        raise NotImplementedError("TODO")

    def density_step(
            self, time: Timestamp) -> tuple[list[Cluster], Set[MicroCluster]]:
        if self.context.multi_density:
            return self.local_density_step(time)
        else:
            return self.global_density_step(time)

    def step(
        self, element: Element, time: Timestamp,
        skip_density_step: bool = False
    ) -> tuple[MicroCluster, Optional[list[Cluster]],
               Optional[Set[MicroCluster]], Optional[Set[MicroCluster]]]:
        self.n_steps += 1

        µcluster = self.distance_step(element, time)

        if (self.n_steps >= self.last_partitioning_step
                + self.context.partitioning_interval):
            eliminated = self.update_density_partitions(time)
            self.last_partitioning_step = self.n_steps
        else:
            eliminated = None

        if (not skip_density_step and self.n_steps
                >= self.last_density_step + self.context.density_interval):
            clusters, unclustered = self.density_step(time)
            self.last_density_step = self.n_steps
        else:
            clusters = None
            unclustered = None

        return µcluster, clusters, unclustered, eliminated

    def run(self, elements: Iterable[Element],
            times: Optional[Iterable[Timestamp]] = None,
            progress: bool = True) -> list[Cluster]:
        if progress and tqdm is not None:
            elements = tqdm(elements)

        if times is None:
            times = count()

        for element, time in zip(elements, times):
            self.step(element, time, skip_density_step=True)

        clusters, _ = self.density_step(time)
        return clusters
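
# Standalone sketch of the hash-keyed pattern DyClee uses: rtree stores only
# integer ids, so hash(obj) goes into the tree and a side dict maps hashes
# back to the Python objects after a query. Assumes hashes fit rtree's signed
# 64-bit id range, which holds for CPython's default id-based hashes.
from rtree.index import Index

class Blob:
    def __init__(self, bounds):
        self.bounds = bounds  # (xmin, ymin, xmax, ymax)

blob = Blob((0.0, 0.0, 1.0, 1.0))
tree, table = Index(), {hash(blob): blob}
tree.insert(hash(blob), blob.bounds)
hits = [table[h] for h in tree.intersection((0.5, 0.5, 0.5, 0.5))]
assert hits == [blob]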
class AdjacencyVersion(object):

    def __init__(self, feature_mapper):
        #self.partitions_complete = partitions_complete
        self.cid = 0
        self.disc_idxs = {}
        self.feature_mapper = feature_mapper
        self.radius = .15
        self.metric = 'hamming'

        self._rtree = None  # internal datastructure
        self._ndim = None
        self.clusters = []
        self.id2c = dict()
        self.c2id = dict()

    def to_json(self):
        data = {
            'clusters': [c and c.__dict__ or None for c in self.clusters],
            'id2c': [(key, c.__dict__) for key, c in self.id2c.items()],
            'c2id': [(c.__dict__, val) for c, val in self.c2id.items()],
            'cid': self.cid,
            '_ndim': self._ndim,
            '_rtreename': 'BLAH'
        }
        return json.dumps(data)

    def from_json(self, encoded):
        data = json.loads(encoded)
        self.clusters = [
            c and Cluster.from_dict(c) or None for c in data['clusters']
        ]
        self.id2c = dict([(key, Cluster.from_dict(val))
                          for key, val in data['id2c']])
        self.c2id = dict([(Cluster.from_dict(key), val)
                          for key, val in data['c2id']])
        self.cid = data['cid']
        self._ndim = data['_ndim']
        self._rtree = None

    def setup_rtree(self, ndim, clusters=None):
        if self._rtree:
            return self._rtree

        self._ndim = ndim
        if not ndim:
            class k(object):
                def __init__(self, graph):
                    self.graph = graph

                def insert(self, *args, **kwargs):
                    pass

                def delete(self, *args, **kwargs):
                    pass

                def intersection(self, *args, **kwargs):
                    return xrange(len(self.graph.clusters))

            self._rtree = k(self)
            return self._rtree

        p = RProp()
        p.dimension = max(2, ndim)
        p.dat_extension = 'data'
        p.idx_extension = 'index'

        if clusters:
            gen_func = ((i, self.bbox_rtree(c, enlarge=0.005), None)
                        for i, c in enumerate(clusters))
            self._rtree = RTree(gen_func, properties=p)
        else:
            self._rtree = RTree(properties=p)

        return self._rtree

    def bbox_rtree(self, cluster, enlarge=0.):
        cols = cluster.cols
        bbox = cluster.bbox
        lower, higher = map(list, bbox)
        if self._ndim == 1:
            lower.append(0)
            higher.append(1)

        if enlarge != 0:
            for idx, col in enumerate(cols):
                rng = enlarge * self.feature_mapper.ranges[col]
                lower[idx] -= rng
                higher[idx] += rng

        bbox = lower + higher
        return bbox

    def insert_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.insert(idx, self.bbox_rtree(cluster))
        return cluster

    def remove_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.delete(idx, self.bbox_rtree(cluster))
        return cluster

    def search_rtree(self, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        bbox = self.bbox_rtree(cluster, enlarge=0.01)
        return self._rtree.intersection(bbox)

    def bulk_init(self, clusters):
        if not clusters:
            return

        self.setup_rtree(len(clusters[0].bbox[0]), clusters)
        self.clusters = clusters
        for cid, c in enumerate(clusters):
            self.id2c[cid] = c
            self.c2id[c] = cid

        for dim in self.feature_mapper.attrs:
            Xs = []
            for cidx, c in enumerate(clusters):
                Xs.append(self.feature_mapper(c, dim))
            idx = NearestNeighbors(radius=self.radius,
                                   algorithm='ball_tree',
                                   metric=self.metric)
            self.disc_idxs[dim] = idx
            self.disc_idxs[dim].fit(np.array(Xs))

    def contains(self, cluster):
        return cluster in self.c2id

    def remove(self, cluster):
        if cluster in self.c2id:
            cid = self.c2id[cluster]
            self.remove_rtree(cid, cluster)
            del self.c2id[cluster]
            del self.id2c[cid]
            self.clusters[cid] = None
            return True
        return False

    def neighbors(self, cluster):
        ret = None
        for name, vals in cluster.discretes.iteritems():
            if name not in self.disc_idxs:
                return []
            vect = self.feature_mapper(cluster, name)
            index = self.disc_idxs[name]
            dists, idxs = index.radius_neighbors(vect, radius=self.radius)
            idxs = set(idxs[0].tolist())
            if ret is None:
                ret = idxs
            else:
                ret.intersection_update(idxs)
                #ret.update(idxs)

        if not ret:
            return []

        idxs = self.search_rtree(cluster)
        if ret is None:
            ret = set(idxs)
        else:
            ret.intersection_update(set(idxs))

        return filter(bool, [self.clusters[idx] for idx in ret])
def parse_mrt(database: SqliteUtil, path: str, src_epsg: int,
              prj_epsg: int, bounds: int = 30, steps: int = 96):
    log.info('Allocating tables for MRT temperature profiles.')
    create_tables(database)

    log.info('Loading network nodes from database.')
    nodes: Dict[str, Node]
    nodes = load_nodes(database)

    log.info('Loading network links from database.')
    links: Dict[str, Link]
    links = load_links(database, nodes)

    log.info(f'Searching for mrt files in {path}')
    csvfiles = iter(glob(f'{path}/**/*.csv', recursive=True))

    log.info('Handling initial dataset for profile construction.')
    points: List[Point]
    time: int
    points, time = parse_points(next(csvfiles), src_epsg, prj_epsg)

    log.info('Building spatial index on MRT points.')
    index = Index((point.entry() for point in points))

    log.info('Scanning link bounds and building profiles.')
    mapping: Dict[FrozenSet[int], int] = {}
    count = 0
    empty = 0
    iter_links = counter(links.values(), 'Scanning link %s.')
    for link in iter_links:
        d = link.terminal_node.x * link.source_node.y - \
            link.source_node.x * link.terminal_node.y
        dx = link.terminal_node.x - link.source_node.x
        dy = link.terminal_node.y - link.source_node.y
        l = sqrt(dy * dy + dx * dx)

        nearby = index.intersection(link.bounds(bounds))
        contained = []
        for uuid in nearby:
            point = points[uuid]
            x = point.x
            y = point.y
            if l > 0:
                # perpendicular distance from the point to the link's line
                dist = abs(dy * x - dx * y + d) / l
            else:
                # degenerate link: fall back to distance from the source node
                px = point.x - link.source_node.x
                py = point.y - link.source_node.y
                dist = sqrt(px * px + py * py)
            if dist <= bounds:
                contained.append(point.id)

        if contained:
            profile = frozenset(contained)
            if profile in mapping:
                link.profile = mapping[profile]
            else:
                mapping[profile] = count
                link.profile = count
                count += 1
        else:
            empty += 1

    profiles: List[Tuple[int]]
    profiles = [tuple(key) for key in mapping.keys()]

    if empty:
        log.warning(f'Found {empty} links without any MRT temperature profile.')

    def dump_points():
        idx = time // (86400 // steps)
        for uuid, profile in enumerate(profiles):
            mrt, pet, utci = 0, 0, 0
            count = len(profile)
            for ptid in profile:
                point = points[ptid]
                mrt += point.mrt
                pet += point.pet
                utci += point.utci
            yield (uuid, idx, time, mrt / count, pet / count, utci / count)

    def dump_links():
        for link in links.values():
            yield (link.id, link.profile)

    log.info('Writing link updates and temperatures to database.')
    database.insert_values('mrt_temperatures', dump_points(), 6)
    database.insert_values('temp_links', dump_links(), 2)

    log.info('Merging, dropping and renaming old tables.')

    query = '''
        CREATE INDEX temp_links_link
        ON temp_links(link_id);
    '''
    database.cursor.execute(query)
    query = '''
        CREATE TABLE temp_links_merged
        AS SELECT
            links.link_id,
            links.source_node,
            links.terminal_node,
            links.length,
            links.freespeed,
            links.capacity,
            links.permlanes,
            links.oneway,
            links.modes,
            links.air_temperature,
            temp_links.mrt_temperature
        FROM links
        INNER JOIN temp_links
        USING(link_id);
    '''
    database.cursor.execute(query)

    original = database.count_rows('links')
    merged = database.count_rows('temp_links_merged')
    if original != merged:
        log.error('Original links and updated links tables '
                  'do not align; quitting to prevent data loss.')
        raise RuntimeError

    database.drop_table('links', 'temp_links')
    query = '''
        ALTER TABLE temp_links_merged
        RENAME TO links;
    '''
    database.cursor.execute(query)
    database.connection.commit()

    del links
    del nodes
    del index
    del mapping
    del points

    log.info('Handling remaining temperatures with defined profiles.')

    def dump_temperatures(time: int,
                          temperatures: List[Tuple[float, float, float]]):
        idx = time // (86400 // steps)
        for uuid, profile in enumerate(profiles):
            mrt, pet, utci = 0, 0, 0
            count = len(profile)
            for tempid in profile:
                temp = temperatures[tempid]
                mrt += temp[0]
                pet += temp[1]
                utci += temp[2]
            yield (uuid, idx, time, mrt / count, pet / count, utci / count)

    for csvfile in csvfiles:
        time: int
        temperatures: List[Tuple[float, float, float]]
        temperatures, time = parse_temperatures(csvfile)

        log.info('Writing temperature data to database.')
        database.insert_values('mrt_temperatures',
                               dump_temperatures(time, temperatures), 6)
        database.connection.commit()

    log.info('Creating indexes on new/updated tables.')
    create_indexes(database)
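
# Quick check of the point-to-line distance used in parse_mrt: for a link
# from (x1, y1) to (x2, y2), |dy*x - dx*y + d| / l with d = x2*y1 - x1*y2 is
# the standard distance from a point to the link's line. Hypothetical numbers.
from math import sqrt

x1, y1, x2, y2 = 0.0, 0.0, 4.0, 0.0      # horizontal link
d = x2 * y1 - x1 * y2
dx, dy = x2 - x1, y2 - y1
l = sqrt(dx * dx + dy * dy)
assert abs(dy * 2.0 - dx * 3.0 + d) / l == 3.0  # point (2, 3) is 3 units away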
def main(input_dir, output_dir):
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s [%(name)s]: %(message)s')
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    city_names = []
    rtree = RTreeIndex()

    cities_filename = os.path.join(tempfile.gettempdir(), 'cities.json')
    subprocess.check_call([
        'wget',
        'https://raw.githubusercontent.com/mapzen/metroextractor-cities/master/cities.json',
        '-O', cities_filename
    ])

    all_cities = json.load(open(cities_filename))

    i = 0
    for k, v in all_cities['regions'].iteritems():
        for city, data in v['cities'].iteritems():
            bbox = data['bbox']
            rtree.insert(i, (float(bbox['left']), float(bbox['bottom']),
                             float(bbox['right']), float(bbox['top'])))
            city_names.append(city)
            i += 1

    files = {
        name: open(os.path.join(output_dir, 'cities',
                                '{}.geojson'.format(name)), 'w')
        for name in city_names
    }

    planet = open(os.path.join(output_dir, 'planet.geojson'), 'w')
    planet_addresses_only = open(
        os.path.join(output_dir, 'planet_addresses_only.json'), 'w')

    i = 0
    seen = set()

    for url, canonical, venues in gen_venues(input_dir):
        domain = urlparse.urlsplit(url).netloc
        # strip the 'www.' prefix; str.strip('www.') would remove
        # characters, not the prefix
        if domain.startswith('www.'):
            domain = domain[4:]
        for props in venues:
            lat = props.get('latitude')
            lon = props.get('longitude')

            props['canonical'] = canonical
            props['url'] = url

            street = props.get('street_address')
            name = props.get('name')

            planet_hash = hashlib.md5(u'|'.join(
                (name, street, str(lat), str(lon),
                 domain)).encode('utf-8')).digest()
            address_hash = hashlib.md5(u'|'.join(
                (name, street, domain)).encode('utf-8')).digest()

            props['guid'] = props.get('guid', random_guid())
            venue = venue_to_geojson(props)

            if lat is not None and lon is not None:
                try:
                    lat = float(lat)
                    lon = float(lon)
                except Exception:
                    lat = None
                    lon = None

            if lat is not None and lon is not None and planet_hash not in seen:
                cities = list(rtree.intersection((lon, lat, lon, lat)))
                if cities:
                    for c in cities:
                        f = files[city_names[c]]
                        f.write(json.dumps(venue) + '\n')

            if planet_hash not in seen:
                planet.write(json.dumps(venue) + '\n')
                seen.add(planet_hash)

            if address_hash not in seen:
                planet_addresses_only.write(json.dumps(props) + '\n')
                seen.add(address_hash)

            i += 1
            if i % 1000 == 0 and i > 0:
                logger.info('did {}'.format(i))

    logger.info('Creating manifest files')

    manifest_files = []
    for k, v in all_cities['regions'].iteritems():
        for city, data in v['cities'].iteritems():
            f = files[city]
            if f.tell() == 0:
                f.close()
                os.unlink(os.path.join(output_dir, 'cities',
                                       '{}.geojson'.format(city)))
                continue
            bbox = data['bbox']
            lat = midpoint(float(bbox['top']), float(bbox['bottom']))
            lon = midpoint(float(bbox['left']), float(bbox['right']))
            manifest_files.append({
                'latitude': lat,
                'longitude': lon,
                'file': '{}.geojson'.format(city),
                'name': city.replace('_', ', ').replace('-', ' ').title()
            })

    manifest = {'files': manifest_files}
    json.dump(manifest, open(os.path.join(output_dir, 'manifest.json'), 'w'))
    logger.info('Done!')
class AdjacencyGraph(object):

    def __init__(self, clusters, partitions_complete=True):
        self.partitions_complete = partitions_complete
        self.graph = defaultdict(set)
        self.cid = 0
        self.clusters = []
        self.id2c = dict()
        self.c2id = dict()

        self._rtree = None  # internal datastructure
        self._ndim = None

        self.bulk_init(clusters)

    def to_json(self):
        data = {
            'clusters': [c and c.__dict__ or None for c in self.clusters],
            'id2c': [(key, c.__dict__) for key, c in self.id2c.items()],
            'c2id': [(c.__dict__, val) for c, val in self.c2id.items()],
            'graph': [(key.__dict__, [val.__dict__ for val in vals])
                      for key, vals in self.graph.iteritems()],
            'cid': self.cid,
            '_ndim': self._ndim,
            '_rtreename': 'BLAH'
        }
        return json.dumps(data)

    def from_json(self, encoded):
        data = json.loads(encoded)
        self.clusters = [c and Cluster.from_dict(c) or None
                         for c in data['clusters']]
        self.id2c = dict([(key, Cluster.from_dict(val))
                          for key, val in data['id2c']])
        self.c2id = dict([(Cluster.from_dict(key), val)
                          for key, val in data['c2id']])
        self.graph = dict([(Cluster.from_dict(key),
                            map(Cluster.from_dict, vals))
                           for key, vals in data['graph']])
        self.cid = data['cid']
        self._ndim = data['_ndim']
        self._rtree = None

    def setup_rtree(self, ndim, clusters=None):
        if self._rtree:
            return self._rtree

        self._ndim = ndim
        if not ndim:
            class k(object):
                def __init__(self, graph):
                    self.graph = graph

                def insert(self, *args, **kwargs):
                    pass

                def delete(self, *args, **kwargs):
                    pass

                def intersection(self, *args, **kwargs):
                    return xrange(len(self.graph.clusters))

            self._rtree = k(self)
            return self._rtree

        p = RProp()
        p.dimension = max(2, ndim)
        p.dat_extension = 'data'
        p.idx_extension = 'index'

        if clusters:
            gen_func = ((i, self.bbox_rtree(c, enlarge=0.00001), None)
                        for i, c in enumerate(clusters))
            self._rtree = RTree(gen_func, properties=p)
        else:
            self._rtree = RTree(properties=p)

        return self._rtree

    def bbox_rtree(self, cluster, enlarge=0.):
        bbox = cluster.bbox
        lower, higher = map(list, bbox)
        if self._ndim == 1:
            lower.append(0)
            higher.append(1)

        if enlarge != 0.:
            lower = [v - enlarge for v in lower]
            higher = [v + enlarge for v in higher]

        bbox = lower + higher
        return bbox

    def insert_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.insert(idx, self.bbox_rtree(cluster))
        return cluster

    def remove_rtree(self, idx, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        self._rtree.delete(idx, self.bbox_rtree(cluster))
        return cluster

    def search_rtree(self, cluster):
        self.setup_rtree(len(cluster.bbox[0]))
        bbox = self.bbox_rtree(cluster, enlarge=0.00001)
        res = [self.clusters[idx] for idx in self._rtree.intersection(bbox)]
        return filter(bool, res)

    def bulk_init(self, clusters):
        if clusters:
            self.setup_rtree(len(clusters[0].bbox[0]), clusters)

        self.clusters.extend(clusters)
        for cid, c in enumerate(clusters):
            self.id2c[cid] = c
            self.c2id[c] = cid

        for idx, c in enumerate(clusters):
            for n in self.search_rtree(c):
                if self.c2id[n] <= idx:
                    continue
                if c.discretes_contains(n) and \
                        box_completely_contained(c.bbox, n.bbox):
                    continue
                if not c.adjacent(n, 0.8):
                    continue
                self.graph[c].add(n)
                self.graph[n].add(c)

    def insert(self, cluster):
        if cluster in self.graph:
            return

        self.graph[cluster] = set()
        #for o in self.search_rtree(cluster):
        for o in self.graph.keys():
            if cluster == o:
                continue
            if cluster.adjacent(o, 0.8) or \
                    (volume(intersection_box(cluster.bbox, o.bbox)) > 0
                     and not cluster.contains(o)):
                self.graph[cluster].add(o)
                self.graph[o].add(cluster)

        cid = len(self.clusters)
        self.clusters.append(cluster)
        self.id2c[cid] = cluster
        self.c2id[cluster] = cid
        self.insert_rtree(cid, cluster)

    def remove(self, cluster):
        if cluster not in self.graph:
            return

        try:
            for neigh in self.graph[cluster]:
                if not neigh == cluster:
                    self.graph[neigh].remove(cluster)
        except:
            pdb.set_trace()

        del self.graph[cluster]

        cid = self.c2id[cluster]
        self.remove_rtree(cid, cluster)
        del self.c2id[cluster]
        del self.id2c[cid]
        self.clusters[cid] = None

    def neighbors(self, cluster):
        if not self.partitions_complete:
            return filter(bool, self.clusters)

        if cluster in self.graph:
            return self.graph[cluster]

        ret = set()
        intersects = self.search_rtree(cluster)
        for key in filter(cluster.adjacent, intersects):
            if box_completely_contained(key.bbox, cluster.bbox):
                continue
            ret.update(self.graph[key])
        return ret
def parse_parcels(database: SqliteUtil, residence_file: str,
                  commerce_file: str, parcel_file: str, cooling_file: str,
                  src_epsg: int, prj_epsg: int):
    boundaries = {}
    cooling = {}
    parcels = []
    apns = set()

    transformer = Transformer.from_crs(f'epsg:{src_epsg}',
                                       f'epsg:{prj_epsg}',
                                       always_xy=True,
                                       skip_equivalent=True)
    project = transformer.transform

    log.info('Allocating tables for parcels.')
    create_tables(database)

    log.info('Parsing parcel boundaries from shapefile.')
    parser = shapefile.Reader(parcel_file)
    iter_boundaries = counter(iter(parser), 'Parsing parcel boundary %s.')
    for parcel in iter_boundaries:
        if len(parcel.shape.points):
            apn = parcel.record['APN']
            points = (project(*pt) for pt in parcel.shape.points)
            polygon = Polygon(points)
            boundaries[apn] = polygon
    parser.close()

    log.info('Loading cooling information from csv file.')
    with open(cooling_file, 'r') as open_file:
        lines = csv.reader(open_file, delimiter=',', quotechar='"')
        next(lines)
        for desc, _, cool in lines:
            cooling[desc] = bool(cool)

    log.info('Parsing residential parcels from database file.')
    parser = shapefile.Reader(residence_file)
    iter_parcels = counter(parser.iterRecords(),
                           'Parsing residential parcel %s.')
    for record in iter_parcels:
        apn = record['APN']
        if apn in boundaries and apn not in apns:
            cool = True
            polygon = boundaries[apn]
            parcel = Parcel(apn, 'residential', cool, polygon)
            parcels.append(parcel)
            apns.add(apn)
    parser.close()

    log.info('Parsing commercial parcels from database file.')
    parser = shapefile.Reader(commerce_file)
    iter_parcels = counter(parser.iterRecords(),
                           'Parsing commercial parcel %s.')
    for record in iter_parcels:
        apn = record['APN']
        if apn in boundaries and apn not in apns:
            desc = record['DESCRIPT']
            cool = cooling[desc]
            polygon = boundaries[apn]
            parcel = Parcel(apn, 'commercial', cool, polygon)
            parcels.append(parcel)
            apns.add(apn)
    parser.close()

    log.info('Parsing extraneous parcels from shapefile.')
    other = set(boundaries.keys()) - apns
    other = counter(other, 'Parsing extraneous parcel %s.')
    for apn in other:
        polygon = boundaries[apn]
        parcel = Parcel(apn, 'other', True, polygon)
        parcels.append(parcel)

    def load():
        for idx, parcel in enumerate(parcels):
            pt = parcel.polygon.centroid
            yield (idx, (pt.x, pt.y, pt.x, pt.y), None)

    log.info('Building spatial index from parcel data.')
    index = Index(load())

    log.info('Loading network region data.')
    regions = load_regions(database)

    log.info('Scanning regions and mapping mazs to parcels.')
    iter_regions = counter(regions, 'Scanning region %s.')
    for region in iter_regions:
        apn = f'maz-{region.maz}'
        parcel = Parcel(apn, 'default', True, region.polygon)
        parcel.maz = region.maz
        parcels.append(parcel)
        result = index.intersection(region.polygon.bounds)
        for idx in result:
            parcel = parcels[idx]
            if region.polygon.contains(parcel.polygon.centroid):
                if parcel.maz is not None:
                    warning = 'Parcel %s is in both region %s and %s' \
                        '; the latter region will be kept.'
                    log.warning(warning % (parcel.apn, parcel.maz, region.maz))
                parcel.maz = region.maz
    del regions

    def dump():
        for parcel in parcels:
            yield (parcel.apn, parcel.maz, parcel.kind,
                   int(parcel.cooling), None, None,
                   dumps(parcel.polygon.centroid),
                   dumps(parcel.polygon))

    log.info('Writing parsed parcels to database.')
    database.insert_values('parcels', dump(), 8)
    database.connection.commit()

    log.info('Creating indexes on new tables.')
    create_indexes(database)
class SpatialIndex():
    """
    A spatial index is a type of extended index that allows you to index a
    spatial column. A spatial column is a table column that contains data of
    a spatial data type. Spatial indexes help to improve spatial query
    performance on a dataframe. Identifying a feature, selecting features,
    and joining data all have better performance when using spatial indexing.

    ====================    ==================================================
    Argument                Description
    --------------------    --------------------------------------------------
    stype                   Required String. This sets the type of spatial
                            index being used by the user. The current types
                            of spatial indexes are: custom, rtree and
                            quadtree.
    --------------------    --------------------------------------------------
    bbox                    Optional Tuple. The extent of the spatial data as:
                            (xmin, ymin, xmax, ymax). This parameter is
                            required if a QuadTree Spatial Index is being
                            used.

                            Example: bbox=(-100, -50, 100, 50)
    --------------------    --------------------------------------------------
    filename                Optional String. The name of the spatial index
                            file. This is only supported by rtree spatial
                            indexes. For large datasets an rtree index can be
                            saved to disk and used at a later time. If this
                            is not provided the r-tree index will be
                            in-memory.
    --------------------    --------------------------------------------------
    custom_index            Optional Object. Sometimes QuadTree and Rtree
                            indexing is not enough. A custom spatial index
                            class can be given to the SpatialIndex class and
                            used via encapsulation. The custom index must
                            have two methods: `intersect`, which accepts a
                            tuple, and `insert`, which must accept an oid and
                            a bounding box. This object is required when an
                            `stype` of 'custom' is specified.
    ====================    ==================================================

    """
    _stype = None
    _bbox = None
    _index = None
    _df = None

    #----------------------------------------------------------------------
    def __init__(self, stype, bbox=None, **kwargs):
        """initializer"""
        ci = kwargs.pop('custom_index', None)
        self._filename = kwargs.pop('filename', None)
        self._bbox = bbox
        self._stype = stype.lower()
        self._df = None
        if ci and stype.lower() == 'custom':
            self._index = ci
        elif stype.lower() == 'quadtree' and bbox:
            self._index = QIndex(bbox=bbox)
        elif RIndex and stype.lower() == 'rtree':
            self._index = RIndex(self._filename)
        else:
            raise ValueError("Could not create the spatial index.")

    #----------------------------------------------------------------------
    def intersect(self, bbox):
        """
        Returns the spatial features that intersect the bbox

        :bbox: tuple - (xmin,ymin,xmax,ymax)
        :returns: list
        """
        if self._stype.lower() in ['rtree']:
            return list(self._index.intersection(bbox))
        elif self._stype.lower() in ['quadtree']:
            return list(self._index.intersect(bbox=bbox))
        else:
            return list(self._index.intersect(bbox))

    #----------------------------------------------------------------------
    def insert(self, oid, bbox):
        """
        Inserts the entry into the spatial index

        :oid: unique id
        :bbox: tuple - (xmin,ymin,xmax,ymax)
        """
        if self._index is None:
            raise Exception(("Could not insert into a spatial index because "
                             "it does not exist."))
        if self._stype == 'rtree' and \
           HASRTREE and \
           isinstance(self._index, RIndex):
            r = self._index.insert(id=oid, coordinates=bbox, obj=None)
            self.flush()
            return r
        elif self._stype.lower() == 'quadtree':
            return self._index.insert(item=oid, bbox=bbox)
        elif self._stype.lower() == 'custom':
            # delegate to the custom index's insert (was a call to
            # intersect, which contradicts the docstring's contract)
            r = self._index.insert(oid, bbox)
            self.flush()
            return r
    #----------------------------------------------------------------------
    def flush(self):
        """
        Saves the index to disk if a filename is given for an R-Tree Spatial
        Index.

        **This applies only to the R-Tree implementation of the spatial
        index.**

        :returns: Boolean
        """
        if hasattr(self._index, 'flush'):
            getattr(self._index, 'flush')()
        elif self._stype == 'rtree' and \
             self._filename:
            # close to write pages out, then reopen so the index object
            # remains usable
            self._index.close()
            self._index = RIndex(self._filename)
        else:
            return False
        return True
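
# Minimal usage sketch, assuming rtree is importable so stype='rtree'
# resolves to an in-memory RIndex (no filename given).
si = SpatialIndex('rtree')
si.insert(oid=1, bbox=(0, 0, 10, 10))
si.insert(oid=2, bbox=(20, 20, 30, 30))
print(si.intersect((5, 5, 25, 25)))  # ids 1 and 2, in no guaranteed order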
class RTreeTest(unittest.TestCase):

    def Xtest_insertion(self):
        repeat = 10
        basen = 100
        boxes = [Box(i) for i in range(basen)]
        t = timeit.Timer(lambda: self.insert_boxes(boxes),
                         setup=lambda: self.create_index_data([]))
        print(t.timeit(number=repeat) / repeat)

        n = 100000
        prior_boxes = [Box(i) for i in range(n)]
        boxes = [Box(i) for i in range(n, n + basen)]
        t = timeit.Timer(
            lambda: self.insert_boxes(boxes),
            setup=lambda: self.create_index_data(prior_boxes),
        )
        print(t.timeit(number=repeat) / repeat)

    def Xtest_creation(self):
        repeat = 10
        basen = 100
        boxes = [Box(i) for i in range(basen)]
        t = timeit.Timer(lambda: self.create_index_data(boxes))
        t0 = t.timeit(number=repeat) / repeat
        print(basen, t0)

        for i in range(6):
            m = 2**(i + 1)
            n = m * basen
            boxes = [Box(i) for i in range(n)]
            t = timeit.Timer(lambda: self.create_index_data(boxes))
            t1 = t.timeit(number=repeat) / repeat
            print(n, m, t1, t1 / t0)

    def Xtest_stream(self):
        repeat = 10
        n = 10000
        boxes = []
        for i in range(n):
            boxes.append(Box(i))

        def box_generator():
            for b in boxes:
                yield (b.index, b.box, b.index)

        t = timeit.Timer(lambda: self.create_index_data(boxes))
        print(t.timeit(number=repeat) / repeat)

        t = timeit.Timer(lambda: self.create_index_stream(box_generator()))
        print(t.timeit(number=repeat) / repeat)

    def Xtest_query(self):
        repeat = 10
        boxes = [Box(i) for i in range(100)]
        self.create_index_data(boxes)
        test_boxes = random.sample(boxes, 10)
        t = timeit.Timer(lambda: self.query_index(test_boxes))
        print(t.timeit(number=repeat) / repeat)

        boxes = [Box(i) for i in range(100000)]
        self.create_index_data(boxes)
        test_boxes = random.sample(boxes, 10)
        t = timeit.Timer(lambda: self.query_index(test_boxes))
        print(t.timeit(number=repeat) / repeat)

    def insert_boxes(self, boxes):
        for b in boxes:
            self.idx.insert(b.index, b.box)

    def create_index_data(self, data):
        self.idx = Index()
        for d in data:
            self.idx.insert(d.index, d.box, d.index)

    def create_index_stream(self, generator):
        self.idx = Index(generator)

    def query_index(self, boxes):
        for b in boxes:
            overlapping_boxes = self.idx.intersection(b.box)
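
# Standalone sketch of the two construction paths the benchmarks above time:
# incremental insert() versus the stream-loading constructor, which bulk-loads
# an iterable of (id, coords, obj) and is typically much faster for large
# static datasets.
from rtree.index import Index

boxes = [(i, (i, i, i + 1.0, i + 1.0), None) for i in range(1000)]

incremental = Index()
for i, coords, obj in boxes:
    incremental.insert(i, coords, obj)

bulk = Index(boxes)  # stream constructor
assert len(list(bulk.intersection((0, 0, 10, 10)))) == \
       len(list(incremental.intersection((0, 0, 10, 10))))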