def processEntityFilledInToObj(name, forceOverwrite=False):
    """Build an OBJ file from an entity's filled-in XYZ grid.

    Reads the CSV resolved for FileType.XYZ_FILLED_IN, attaches one
    rectangular Polygon per row as a 'geometry' column and passes the
    Z-scaled frame to dataframe_to_obj_2.

    Args:
        name: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(
        name, FileType.XYZ_FILLED_IN, FileType.OBJ, forceOverwrite)
    log("Processing {}".format(filepath_in))
    df = pd.read_csv(filepath_in)
    log("Shape {}".format(df.shape))

    sorted_xs = np.sort(df['X'].unique())
    sorted_ys = np.sort(df['Y'].unique())
    next_bound_x = get_next_bounds(sorted_xs)
    next_bound_y = get_next_bounds(sorted_ys)

    log('Setting geometry column')
    cells = []
    for _, row in tqdm(df.iterrows()):
        x, y = row['X'], row['Y']
        # The "next" bounds are looked up with the unscaled coordinates.
        x_next, y_next = next_bound_x[x], next_bound_y[y]
        if WITH_NAIVE_Y_SCALE:
            # Only the Y extent is rescaled, by UK_HEIGHT / UK_WIDTH.
            y = y * UK_HEIGHT / UK_WIDTH
            y_next = y_next * UK_HEIGHT / UK_WIDTH
        cells.append(Polygon(get_rect_verts(x, x_next, y, y_next)))
    df['geometry'] = cells

    log('Dataframe to obj')
    dataframe_to_obj_2(scale_z(df, Z_SCALE), filepath_out,
                       material_name=MATERIAL_NAME)
def processMSOAEntityCommonToGeoDF(entityName, forceOverwrite=False):
    """Join an entity's MSOA rows with MSOA boundaries and population.

    The entity CSV is left-joined to the MSOA boundary shapefile
    (re-projected to EPSG:4326) on 'MSOA code' == 'MSOA11CD', then
    left-joined to the population table on 'MSOA code'. The columns
    'Z', 'geometry' and 'population' are written to the output CSV.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(
        entityName, FileType.MSOA_COMMON_FORM, FileType.MSOA_GEO_DF,
        forceOverwrite)

    # Both reference files are re-read on every call; they could be lazy
    # loaded instead.
    print('Read 1')
    msoa_shapes = gpd.read_file(
        "data/MSOAFiles/MSOA_2011_EW_BFC_shp/MSOA_2011_EW_BFC.shp")
    reprojected = msoa_shapes['geometry'].to_crs('EPSG:4326')
    msoa_shapes['geometry'] = reprojected

    print('Read 2')
    msoa_population = pd.read_csv('data/MSOAFiles/popPerMSOA.csv')
    # 'All Ages' is text with thousands separators; strip them before parsing.
    all_ages_text = msoa_population['All Ages'].str.strip().str.replace(',', '')
    msoa_population['population'] = pd.to_numeric(all_ages_text)

    print('Read 3')
    entity_rows = pd.read_csv(filepath_in)

    print('Merging')
    with_shapes = entity_rows.merge(
        msoa_shapes, left_on='MSOA code', right_on='MSOA11CD', how='left')
    merged = with_shapes.merge(msoa_population, on='MSOA code', how='left')

    print('saving')
    output_columns = ['Z', 'geometry', 'population']
    gpd.GeoDataFrame(merged)[output_columns].to_csv(filepath_out, index=False)
def processMSOAEntityToHeatMap(entityName, forceOverwrite=False):
    """Plot a simplified-MSOA entity file and save the figure.

    The 'geometry' column of the input CSV holds WKT text; it is parsed
    into shapely geometries before the frame is handed to plot_uk.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(
        entityName, FileType.MSOA_SIMPLIFIED, FileType.HEATMAP, forceOverwrite)

    frame = pd.read_csv(filepath_in)
    parsed_geometry = frame['geometry'].apply(shapely.wkt.loads)
    frame['geometry'] = parsed_geometry

    log("Plotting")
    plot_uk(frame)
    plt.axis("off")

    log("Saving figure")
    plt.savefig(filepath_out, bbox_inches='tight')
def _writeXyzFile(filepath, rows, convertZ=None):
    """Write (x, y, z) rows to *filepath* as CSV under an "X,Y,Z" header.

    When *convertZ* is given, it is applied to each z before writing.
    """
    out_template = "{},{},{}\n"
    with open(filepath, 'w') as out_fp:
        out_fp.write(out_template.format("X", "Y", "Z"))
        for x, y, z in tqdm(rows):
            out_fp.write(out_template.format(
                x, y, z if convertZ is None else convertZ(z)))


def processEntityPointToDistance(entityName, forceOverwrite=False):
    """Write, for every grid pixel, the distance to the entity's nearest point.

    The points come from a JSON list of objects with 'lng' and 'lat' keys.
    The pixel grid is taken from the X/Y columns of the example XYZ file
    'gbr_pd_2020_1km_UNadj_ASCII_XYZ.csv'. Distances are plain Euclidean
    distances in the grid's coordinate units.

    Two CSV files are produced: the XYZ output path holding the distance,
    and an "inverted_"-prefixed sibling holding (largest distance - distance).
    Each file is only written when it does not exist yet or forceOverwrite
    is true.

    If the JSON list is empty, every distance is math.inf.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths and also used to
            decide whether existing output files are rewritten.
    """
    filepath_in, filepath_out = getFilepaths(
        entityName, FileType.POINT, FileType.XYZ, forceOverwrite)
    folder, filename = os.path.split(filepath_out)
    filepath_out_inverted = os.path.join(folder, "inverted_" + filename)

    # Close the input deterministically instead of leaking the handle.
    with open(filepath_in, 'r') as points_fp:
        points = json.load(points_fp)
    # Going through a set drops duplicate points.
    lng_lat_points = list(
        {(float(point['lng']), float(point['lat'])) for point in points})

    example_filename = os.path.join(FILE_LOCATIONS[FileType.XYZ],
                                    'gbr_pd_2020_1km_UNadj_ASCII_XYZ.csv')
    with open(example_filename, newline='') as example_fp:
        # Materialise the rows so tqdm knows the total; [1:] skips the headers.
        xyz_example = list(csv.reader(example_fp))[1:]

    xyz_result = []
    max_z = 0
    log('calculating distance for each pixel')
    for x, y, _ in tqdm(xyz_example):
        x = float(x)
        y = float(y)
        # Brute-force nearest point: compare squared distances and take a
        # single sqrt per pixel. This could be made much faster, but it is
        # quick enough for now.
        min_sq_distance = math.inf
        for lng, lat in lng_lat_points:
            dx = x - lng
            dy = y - lat
            min_sq_distance = min(min_sq_distance, dx * dx + dy * dy)
        min_distance = math.sqrt(min_sq_distance)
        max_z = max(max_z, min_distance)
        xyz_result.append((x, y, min_distance))

    log('writing result to {} and {}'.format(filepath_out,
                                             filepath_out_inverted))
    if not os.path.exists(filepath_out) or forceOverwrite:
        _writeXyzFile(filepath_out, xyz_result)
    if not os.path.exists(filepath_out_inverted) or forceOverwrite:
        _writeXyzFile(filepath_out_inverted, xyz_result,
                      convertZ=lambda z: max_z - z)
def processEntityObjToZip(entityName, forceOverwrite=False):
    """Gzip the entity's OBJ file into the path resolved for FileType.COMPRESSED.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    source_path, target_path = getFilepaths(
        entityName, FileType.OBJ, FileType.COMPRESSED, forceOverwrite)
    with open(source_path, 'rb') as raw_obj, \
            gzip.open(target_path, 'wb') as gzipped:
        shutil.copyfileobj(raw_obj, gzipped)


# Disabled one-off snippet (kept from the original) that gzips every
# not-yet-compressed file of a directory:
# dir = '/Users/james_hargreaves/WebstormProjects/data-visualisation/src/data/objFiles/distanceTo'
# outDir = '/Users/james_hargreaves/WebstormProjects/data-visualisation/src/data/objFilesCompressed/distanceTo'
# todo = [x for x in os.listdir(dir) if not x.endswith('.gz')]
#
# for filename in tqdm(todo):
#     path = os.path.join(dir, filename)
#     with open(path, 'rb') as f_in:
#         with gzip.open(os.path.join(outDir, filename)+'.gz', 'wb') as f_out:
#             shutil.copyfileobj(f_in, f_out)
def processEntityLowResToFilledIn(entityName, forceOverwrite=False, sourceOfTruth=SOURCE_OF_TRUTH_FILEPATH):
    """Fill the gaps of a low-res XYZ grid with zeros and write the result.

    Reads the X,Y,Z rows of the low-res file. For every Y value present in
    the input, UkLandmass.filter_included_all_y is asked which of the
    input's X values to emit. Each emitted (x, y) gets the Z from the input
    when there is one and 0 otherwise. The number of zero-filled pixels is
    logged.

    Rows are written in ascending numeric Y order, so the output no longer
    depends on set iteration order.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
        sourceOfTruth: passed to the UkLandmass constructor.

    Raises:
        ValueError: if the input has fewer than two distinct X or Y values
            (the grid spacing cannot be derived).
    """
    filepath_in, filepath_out = getFilepaths(
        entityName, FileType.XYZ_LOW_RES, FileType.XYZ_FILLED_IN,
        forceOverwrite)
    ys = set()
    xs = set()
    xy_to_z = {}
    # Close the input deterministically instead of leaking the handle.
    with open(filepath_in, newline='') as in_fp:
        for x, y, z in tqdm(csv.reader(in_fp), position=0, leave=True):
            if x == 'X':  # skip header row
                continue
            xs.add(x)
            ys.add(y)
            # Key X by numeric value: the lookup below starts from a float,
            # and str(float) only reproduces the CSV text when that text is
            # already in repr form ("54" or "1.50" would never match and
            # would be zero-filled).
            xy_to_z[(float(x), y)] = z

    xs_sorted = sorted(map(float, xs))
    ys_sorted = sorted(map(float, ys))
    diffs_y = [upper - lower for lower, upper in zip(ys_sorted, ys_sorted[1:])]
    diffs_x = [upper - lower for lower, upper in zip(xs_sorted, xs_sorted[1:])]
    # fairly certain this threshold currently does nothing TODO check and remove
    y_threshold = min(diffs_y)
    x_threshold = min(diffs_x)

    uk_landmass = UkLandmass(sourceOfTruth, x_threshold)
    count = 0
    out_template = "{},{},{}\n"
    with open(filepath_out, 'w+') as out_fp:
        out_fp.write(out_template.format("X", "Y", "Z"))
        # y stays the original CSV text so it is echoed unchanged.
        for y in tqdm(sorted(ys, key=float)):
            valid_xs = uk_landmass.filter_included_all_y(
                xs_sorted, float(y), y_threshold, x_threshold)
            for x in valid_xs:
                # assumes valid_xs yields numeric values drawn from
                # xs_sorted -- TODO confirm against UkLandmass
                z = xy_to_z.get((float(x), y))
                if z is None:
                    out_fp.write(out_template.format(x, y, 0))
                    count += 1
                else:
                    out_fp.write(out_template.format(x, y, z))
    log("0 value filled in for {} pixels".format(count))
def processMSOASimplifiedEntityInToObj(name, forceOverwrite=False):
    """Build an OBJ file from an entity's simplified-MSOA CSV.

    The 'geometry' column holds WKT text. Each value is parsed with
    shapely and, when WITH_NAIVE_Y_SCALE is set, additionally passed
    through convertPolygonToScaledXYScaledPolygon. The Z-scaled frame is
    then handed to dataframe_to_obj_2.

    Args:
        name: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(
        name, FileType.MSOA_SIMPLIFIED, FileType.OBJ, forceOverwrite)
    log("Processing {}".format(filepath_in))
    df = pd.read_csv(filepath_in)

    def parse_scaled(wkt_text):
        return convertPolygonToScaledXYScaledPolygon(
            shapely.wkt.loads(wkt_text))

    def parse_plain(wkt_text):
        return shapely.wkt.loads(wkt_text)

    to_geometry = parse_scaled if WITH_NAIVE_Y_SCALE else parse_plain
    df['geometry'] = df['geometry'].apply(to_geometry)

    log("Shape {}".format(df.shape))
    log('Dataframe to obj')
    dataframe_to_obj_2(scale_z(df, Z_SCALE), filepath_out,
                       material_name=MATERIAL_NAME)
def processEntityXYZToHeatMap(entityName, forceOverwrite=False):
    """Plot an entity's filled-in XYZ grid and save the figure.

    Every row gets a rectangular Polygon built by get_rect_verts from its
    X/Y and the matching entries of the get_next_bounds lookups. The frame
    is then drawn with plot_uk and saved to the heat-map output path.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(
        entityName, FileType.XYZ_FILLED_IN, FileType.HEATMAP, forceOverwrite)
    # filepath_in = 'data/tmp/low_res.csv'
    df = pd.read_csv(filepath_in)

    sorted_xs = np.sort(df['X'].unique())
    sorted_ys = np.sort(df['Y'].unique())
    next_bound_x = get_next_bounds(sorted_xs)
    next_bound_y = get_next_bounds(sorted_ys)

    def cell_polygon(row):
        corners = get_rect_verts(
            row.X, next_bound_x[row.X], row.Y, next_bound_y[row.Y])
        return Polygon(corners)

    df['geometry'] = df.apply(cell_polygon, axis=1)

    log("Plotting")
    plot_uk(df)
    plt.axis('off')

    log("Saving figure")
    plt.savefig(filepath_out, bbox_inches='tight')
def processPostcodeToMeanXYZLowRes(name, forceOverwrite=False):
    """Aggregate postcode-level price-paid records into a low-res XYZ file.

    Rows without a postcode are dropped (and the share dropped is logged).
    The remaining rows go to postCodeDataToAggregateXYZ with 'pricePaid'
    as the active field and the arithmetic mean as the aggregate.

    Args:
        name: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(
        name, FileType.POSTCODE, FileType.XYZ_LOW_RES, forceOverwrite)
    logInfo('Reading price paid files')
    df = getDataFrameFromDataFile(filepath_in)
    initialNumberOfRecords = df.shape[0]

    # throw away any rows which don't have a post code ~0.40% are thrown away for the 2020 data
    df = df[df['postcode'].notna()]
    removedRecords = initialNumberOfRecords - df.shape[0]
    # An empty input would otherwise divide by zero just to build the log line.
    if initialNumberOfRecords:
        removedPercent = round(100 * removedRecords / initialNumberOfRecords, 2)
    else:
        removedPercent = 0
    logInfo('Removed {} out of {} ({}%)records due to missing postcode'.format(
        removedRecords, initialNumberOfRecords, removedPercent))

    postCodeDataToAggregateXYZ(df, filepath_out, activeFieldName='pricePaid',
                               aggregateMethod=lambda x: sum(x) / len(x))
def skipLowResToFilledIn(entityName, forceOverwrite=False):
    """Copy the low-res XYZ file to the filled-in XYZ path without changing it.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    low_res_path, filled_in_path = getFilepaths(
        entityName, FileType.XYZ_LOW_RES, FileType.XYZ_FILLED_IN,
        forceOverwrite)
    shutil.copyfile(low_res_path, filled_in_path)
# NOTE(review): the statements on the next line are the tail of a definition whose beginning is not visible here; they are left exactly as found.
out_fp.write(out_template.format("X", "Y", "Z")) for y in tqdm(ys): valid_xs = uk_landmass.filter_included_all_y( xs_sorted, float(y), y_threshold, x_threshold) for x in valid_xs: z = xy_to_z.get((str(x), y), 'nothing found') if z == 'nothing found': out_fp.write(out_template.format(x, y, 0)) count += 1 else: out_fp.write(out_template.format(x, y, z)) log("0 value filled in for {} pixels".format(count))


def processEntityLowResToFilledInEnglandAndWales(entityName, forceOverwrite=False):
    """Run processEntityLowResToFilledIn with SOURCE_OF_TRUTH_ENGLAND_WALES.

    Args:
        entityName: forwarded unchanged.
        forceOverwrite: forwarded unchanged.
    """
    processEntityLowResToFilledIn(entityName, forceOverwrite, SOURCE_OF_TRUTH_ENGLAND_WALES)


if __name__ == '__main__':
    # Script entry point: process every file in the XYZ_LOW_RES folder.
    for filename in os.listdir(FILE_LOCATIONS[FileType.XYZ_LOW_RES]):
        # The entity name is the file name without its extension.
        entityName = os.path.splitext(filename)[0]
        _, filepath_out = getFilepaths(entityName, FileType.XYZ_LOW_RES, FileType.XYZ_FILLED_IN)
        # Skip entities whose output already exists unless overwriting is forced.
        if os.path.exists(filepath_out) and not FORCE_OVERWRITE:
            continue
        processEntityLowResToFilledIn(entityName, FORCE_OVERWRITE)
# NOTE(review): the statements on the next line are the tail of a definition whose beginning is not visible here; they are left exactly as found.
max_z = max(max_z, min_distance) xyz_result.append((x, y, min_distance)) log('writing result to {} and {}'.format(filepath_out, filepath_out_inverted)) if not os.path.exists(filepath_out) or forceOverwrite: out_template = "{},{},{}\n" with open(filepath_out, 'w') as out_fp: out_fp.write(out_template.format("X", "Y", "Z")) for x, y, z in tqdm(xyz_result): out_fp.write(out_template.format(x, y, z)) if not os.path.exists(filepath_out_inverted) or forceOverwrite: out_template = "{},{},{}\n" with open(filepath_out_inverted, 'w') as out_fp: out_fp.write(out_template.format("X", "Y", "Z")) for x, y, z in tqdm(xyz_result): out_fp.write(out_template.format(x, y, max_z - z))


if __name__ == '__main__':
    # Script entry point: process every file in the POINT folder.
    for filename in os.listdir(FILE_LOCATIONS[FileType.POINT]):
        # The entity name is the file name without its extension.
        entityName = os.path.splitext(filename)[0]
        # NOTE(review): the fourth argument is a literal True here, not
        # FORCE_OVERWRITE -- presumably so that resolving the paths never
        # objects to an existing output; confirm against getFilepaths.
        _, filepath_out = getFilepaths(entityName, FileType.POINT, FileType.XYZ, True)
        # Skip entities whose output already exists unless overwriting is forced.
        if os.path.exists(filepath_out) and not FORCE_OVERWRITE:
            continue
        processEntityPointToDistance(entityName, FORCE_OVERWRITE)
def processMSOAEntityMergedToSimplified(entityName, forceOverwrite=False):
    """Simplify the polygon outlines of a merged MSOA file.

    Input rows are (Z, WKT geometry) after one header row; multi-part
    geometries are split into their parts. The work happens in two passes:

    1. For every pair of polygons that share vertices, the shared run of
       vertices is simplified once with simplifyLine and the resulting
       segments are recorded against both polygons.
    2. For every polygon, the recorded border points are kept, the
       remaining (non-shared) runs are simplified with simplifyLine, and
       the rebuilt Polygon is written to a "Z,geometry" CSV.

    The module-level flag withPlot decides whether the result is plotted.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(entityName, FileType.MSOA_MERGED, FileType.MSOA_SIMPLIFIED,
                                             forceOverwrite)
    print('reading and loading data structures')
    zs = {}  # polygon key -> Z value (kept as the CSV text)
    polys = {}  # polygon key -> cleaned polygon; not read again in this function
    v_to_key = defaultdict(set)  # vertex -> keys of the polygons that contain it
    key_to_vs = defaultdict(list)  # polygon key -> exterior vertices, in ring order
    key = 0
    with open(filepath_in, 'r') as fp:
        reader = csv.reader(fp)
        next(reader)  # skip headers
        for i, (z, p) in tqdm(enumerate(reader)):
            loaded = shapely.wkt.loads(p)
            # Anything whose WKT does not start with 'POLYGON' is iterated into its parts.
            ps = [loaded] if p.startswith('POLYGON') else list(loaded)
            for poly in ps:
                zs[key] = z
                poly = roundAndRemoveInvalidPoints(poly)
                polys[key] = poly
                for v in poly.exterior.coords:
                    key_to_vs[key].append(v)
                    v_to_key[v].add(key)
                key += 1
    maxKey = key
    key = None  # to get run time errors if it is read again
    print('calculating simplified lines between neighbours')
    poly_to_borders = defaultdict(list)  # polygon key -> [(start vertex, end vertex, mid vertex), ...]
    poly_to_replaced_indecies = defaultdict(list)  # NOTE(review): never used below
    for k in tqdm(range(maxKey)):
        verts = key_to_vs[k]
        # Every polygon that shares at least one vertex with k (k itself included).
        neighbours = set()
        for v in verts:
            neighbours = neighbours.union(v_to_key[v])
        for nKey in neighbours:
            if nKey < k or nKey == k:
                continue  # will be handled during other iteration
            commonVerts = set(verts).intersection(key_to_vs[nKey])
            # need a way to detect if they are multiple lines
            start = None
            end = None
            prev_included = False  # NOTE(review): never read
            start_i, end_i, commonVerts_ids = None, None, []
            # A common vertex whose successor is not common ends the shared run; one whose
            # predecessor is not common starts it. With several runs the last one found wins.
            for i, v in enumerate(verts):
                if v in commonVerts:
                    commonVerts_ids.append(i)
                    if verts[(i + 1) % len(verts)] not in commonVerts:
                        end = v
                        end_i = i
                    if verts[i - 1] not in commonVerts:
                        start = v
                        start_i = i
            if start_i > end_i:
                # The shared run wraps around the end of the vertex list.
                b_indecies = list(range(start_i, len(verts))) + list(range(end_i + 1))
                boundaryPoints = verts[start_i:] + verts[:end_i + 1]
            else:
                b_indecies = list(range(start_i, end_i + 1))
                boundaryPoints = verts[start_i:end_i + 1]
            for i in b_indecies:
                assert (i in commonVerts_ids)
            # This only works if they do not cross the start point, but if they don't then I assume that the
            # "if not closeEnough" statement will fire. If this statement returns true this means that the points
            # between the start and end point are not a continuous sequence of points on the perimeter of the
            # polygon. There are two reasons that this can happen. The first is that there is a slight difference
            # in some non-significant bit between the two neighbouring polygons (for this case we just proceed).
            # The second case is where there are two edges between the two polygons; this case hasn't been
            # encountered yet and so it is not handled.
            if len(commonVerts_ids) != len(b_indecies):
                print(len(commonVerts_ids), len(b_indecies))
                print([x for x in commonVerts_ids if x + 1 not in commonVerts_ids])
                # Check that all points in the range are closer than epsilon to the existing common points, and
                # if they are just take the start and end. This method runs the risk of ruining the corner of the
                # next polygon along; if this happens we should just remove some of the verts so that they won't
                # be common, but I think this will be fine.
                smallest = min(commonVerts_ids)
                largest = max(commonVerts_ids)
                threshold = 0.01  # compared against a squared distance
                for i in range(smallest, largest):
                    pointx, pointy = verts[i]
                    closeEnough = False
                    for commonx, commony in commonVerts:
                        dist = (pointx - commonx) ** 2 + (pointy - commony) ** 2
                        if dist < threshold:
                            closeEnough = True
                            break
                    if not closeEnough:
                        print(i, pointx, pointy)
                        assert False
                print('Happy that close enough')
                start_i = smallest
                end_i = largest
                b_indecies = list(range(start_i, end_i + 1))
                boundaryPoints = verts[start_i:end_i + 1]
            simplifiedBoundary_v, simplifyBoundary_i = simplifyLine(boundaryPoints, b_indecies)
            # Record each simplified segment, with one vertex from inside it, against both polygons.
            for i in range(len(simplifiedBoundary_v) - 1):
                start = simplifiedBoundary_v[i]
                start_i = simplifyBoundary_i[i]
                end = simplifiedBoundary_v[i + 1]
                end_i = simplifyBoundary_i[i + 1]
                mid_i = (start_i + end_i) // 2 if start_i < end_i else 0  # mid point is needed to check if it wraps around the start point
                while mid_i not in commonVerts_ids:  # due to an error in a non significant bit
                    mid_i += 1
                poly_to_borders[k].append((start, end, verts[mid_i]))
                poly_to_borders[nKey].append((start, end, verts[mid_i]))
    print('writing out and simplifying map edges')
    template = "{},\"{}\"\n"
    with open(filepath_out, 'w+') as fp:
        fp.write("Z,geometry\n")
        # From this point on I will assume that the start / end points are contiguous
        for k in tqdm(range(maxKey)):
            neighbourBorderPoints = poly_to_borders[k]
            points = key_to_vs[k]
            # get the indices of the start and end points
            # border number -> [start index, end index, mid index] within points; inf until found
            neighbourBorderPointsIndecies = defaultdict(lambda: [inf, inf, inf])
            for pi, p in enumerate(points):
                for ni, (start, end, mid) in enumerate(neighbourBorderPoints):
                    if p == start:
                        neighbourBorderPointsIndecies[ni][0] = pi
                    if p == end:
                        neighbourBorderPointsIndecies[ni][1] = pi
                    if p == mid:
                        neighbourBorderPointsIndecies[ni][2] = pi
            # NOTE(review): the loop variable is start_i but the first assertion tests `start`, which still
            # holds a value left by an earlier loop -- so the start index is never actually checked here.
            for (start_i, end, mid) in neighbourBorderPointsIndecies.values():
                assert (start is not inf)
                assert (mid is not inf)
                assert (end is not inf)
            # work out if one edge wraps around start point
            doesWrap = False
            for start, end, mid in neighbourBorderPointsIndecies.values():
                smallest = min(start, end)
                largest = max(start, end)
                # A mid index that does not lie between start and end means the border wraps.
                doesWrap |= not (smallest < mid < largest) and mid != inf
            onEdge = doesWrap
            outPoints = []
            # All border start and end indices in ascending order; consumed front to back below.
            remaining_indecies = sorted(
                [x for x, y, z in neighbourBorderPointsIndecies.values()] + [y for x, y, z in
                                                                             neighbourBorderPointsIndecies.values()])
            # print(k, doesWrap, neighbourBorderPointsIndecies.values())
            next_index = remaining_indecies.pop(0) if remaining_indecies else inf
            nonEdgePoints = []  # finished runs of vertices that lie on no recorded border
            current_nonEdge = []  # the run currently being collected
            for pi, p in enumerate(points):
                if pi == next_index:
                    if current_nonEdge:
                        # Close the open run on this border point.
                        current_nonEdge.append(p)
                        nonEdgePoints.append(current_nonEdge)
                        current_nonEdge = []
                    outPoints.append(p)
                    next_index = remaining_indecies.pop(0) if remaining_indecies else inf
                    if pi == next_index:  # ie end of one and start of another
                        next_index = remaining_indecies.pop(0) if remaining_indecies else inf
                    else:
                        onEdge = not onEdge
                        if not onEdge:
                            # Just left a border: this point opens the next non-border run.
                            current_nonEdge.append(p)
                    continue
                elif not onEdge:
                    current_nonEdge.append(p)
            # wrap around non edge
            if not onEdge and current_nonEdge:
                if nonEdgePoints:
                    nonEdgePoints[0] = current_nonEdge + nonEdgePoints[0]
                else:
                    nonEdgePoints.append(current_nonEdge)
            for nonBoundary in nonEdgePoints:
                simplifiedBoundary_v, simplifyBoundary_i = simplifyLine(nonBoundary,
                                                                        [i for i, _ in enumerate(nonBoundary)])
                simplifiedLine = []  # NOTE(review): filled below but never read
                simplifiedPoints = []
                for i in range(len(simplifiedBoundary_v) - 1):
                    start = simplifiedBoundary_v[i]
                    end = simplifiedBoundary_v[i + 1]
                    simplifiedPoints.append(start)
                    simplifiedLine.append((start, end))
                # Splice the simplified run in front of the point it ends on.
                insertIndex = outPoints.index(end) if outPoints else 0
                outPoints = outPoints[:insertIndex] + simplifiedPoints + outPoints[insertIndex:]
            outPolygon = Polygon(outPoints)
            fp.write(template.format(zs[k], outPolygon))
    if withPlot:
        print('Plotting')
        plot(filepath_out)
    print('Finished')
def ProcessMSOAEntityGeoDfToMerged(entityName, forceOverwrite=False):
    """Merge small polygons into a neighbour until none is below minArea.

    Input rows are (Z, WKT geometry, population) after one header row;
    multi-part geometries are split into their parts. While the smallest
    candidate area is below the module-level minArea, the smallest polygon
    is unioned with its smallest neighbour, where a neighbour is a polygon
    sharing at least two vertices with it. The merged polygon gets the
    summed population and the population-weighted mean of the two Z values.

    The surviving polygons are written to a "Z,geometry" CSV. The
    module-level flag withPlot decides whether the result is plotted.

    Args:
        entityName: entity name passed to getFilepaths.
        forceOverwrite: passed through to getFilepaths.
    """
    filepath_in, filepath_out = getFilepaths(entityName, FileType.MSOA_GEO_DF, FileType.MSOA_MERGED, forceOverwrite)
    print('reading and loading data structures')
    zs = {}  # polygon key -> Z value
    key_to_pop = {}  # polygon key -> population
    polys = {}  # polygon key -> polygon (originals and merge results)
    v_to_key = defaultdict(set)  # vertex -> keys of the polygons that contain it
    key_to_vs = defaultdict(set)  # polygon key -> its exterior vertices
    a = []  # (key, area) of every polygon still eligible for merging
    key = 0
    ignore_keys = set()  # keys that are merged away or have no usable neighbour
    with open(filepath_in, 'r') as fp:
        reader = csv.reader(fp)
        next(reader)  # skip headers
        for i, (z, p, pop) in tqdm(enumerate(reader)):
            loaded = shapely.wkt.loads(p)
            # Anything whose WKT does not start with 'POLYGON' is iterated into its parts;
            # every part gets the row's full Z and population.
            ps = [loaded] if p.startswith('POLYGON') else list(loaded)
            for poly in ps:
                zs[key] = float(z)
                key_to_pop[key] = float(pop)
                polys[key] = poly
                a.append((key, poly.area))
                for v in poly.exterior.coords:
                    key_to_vs[key].add(v)
                    v_to_key[v].add(key)
                key += 1
    # Prints the (key, area) pair of the largest polygon.
    print(sorted(a, key=lambda x: x[1])[-1])
    sorted_as = sorted(a, key=lambda x: x[1])
    print('merging')
    i = 0
    # This is expensive and relatively deterministic for any z value so the result could be turned into a map and
    # then dramatically sped up. But once again this is an optimisation.
    # NOTE(review): indexing sorted_as[0] raises IndexError if the list is (or becomes) empty.
    while sorted_as[0][1] < minArea:
        i += 1
        sorted_as = sorted(a, key=lambda x: x[1])
        first = True
        smallestKey = 0
        # Pop candidates in ascending area order until one is not ignored (always pops at least once).
        while smallestKey in ignore_keys or first:
            first = False
            smallestKey, smallestArea = sorted_as.pop(0)
        vs = key_to_vs[smallestKey]
        # Count, per polygon, whether it shares one vertex or at least two with the smallest polygon.
        neighboursSeenOnce = set()
        neighboursSeenAtLeastTwice = set()
        for v in vs:
            for nKey in v_to_key[v]:
                if nKey in neighboursSeenOnce:
                    neighboursSeenAtLeastTwice.add(nKey)
                    neighboursSeenOnce.remove(nKey)
                elif nKey not in neighboursSeenAtLeastTwice:
                    neighboursSeenOnce.add(nKey)
        # The polygon shares all of its own vertices with itself; take it out of the candidates.
        neighboursSeenAtLeastTwice.remove(smallestKey)
        neighboursSeenAtLeastTwice -= ignore_keys
        if len(neighboursSeenAtLeastTwice) == 0:
            # NOTE(review): keys added to ignore_keys are also left out of the output file, so a small
            # polygon with no neighbour sharing two vertices is dropped -- confirm this is intended.
            ignore_keys.add(smallestKey)
            continue
        # Pick the neighbour with the smallest area.
        smallestNeighbourKey = neighboursSeenAtLeastTwice.pop()
        smallestNeighbourArea = polys[smallestNeighbourKey].area
        for nKey in neighboursSeenAtLeastTwice:
            area = polys[nKey].area
            if smallestNeighbourArea > area:
                smallestNeighbourArea = area
                smallestNeighbourKey = nKey
        newPoly = cascaded_union([polys[smallestKey], polys[smallestNeighbourKey]])
        polys[key] = newPoly
        popSmallest = key_to_pop[smallestKey]
        popNeighbour = key_to_pop[smallestNeighbourKey]
        key_to_pop[key] = popSmallest + popNeighbour
        # Population-weighted mean of the two Z values.
        zs[key] = (zs[smallestKey] * popSmallest + zs[smallestNeighbourKey] * popNeighbour) / (
                popSmallest + popNeighbour)
        # would likely be more efficient to also move the data from the larger data structures but that is an
        # optimisation we can do later.
        a = [x for x in a if x[0] not in [smallestKey, smallestNeighbourKey]] + [(key, newPoly.area)]
        for v in newPoly.exterior.coords:
            key_to_vs[key].add(v)
            v_to_key[v].add(key)
        key += 1
        # The two source polygons are superseded by the merged one.
        ignore_keys.add(smallestKey)
        ignore_keys.add(smallestNeighbourKey)
        if i % 100 == 0:
            print(i, smallestArea)
    print('writing')
    template = "{},\"{}\"\n"
    with open(filepath_out, 'w+') as fp:
        fp.write("Z,geometry\n")
        for k in (polys.keys() - ignore_keys):
            fp.write(template.format(zs[k], polys[k]))
    # Can be useful for debugging:
    # with open('./data/tmp/testMergedOrder.csv','w+') as fp:
    #     fp.write("Z,geometry\n")
    #     for i,k in enumerate(polys.keys() - ignore_keys):
    #         fp.write(template.format(i, polys[k]))
    if withPlot:
        print('plotting')
        plot(filepath_out)
    print('finished')