import os
import shutil

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely as shp
import shapely.ops
from PIL import Image

# fm, sm, sc, and si below are this repository's helper modules for file
# management, shape manipulation, shape checks, and shape images. Their
# import statements are assumed to match the repo's layout and are not
# reproduced here.


def dissolve_by_attribute(in_path, dissolve_attribute, out_path=False):
    '''Remove boundaries according to attribute.

    Dissolve boundaries for shapefile(s) according to a given attribute. We
    will also check for contiguity after boundaries have been dissolved.

    Arguments:
        in_path: full path to input shapefile to be dissolved
        out_path: full path to save created shapefile
        dissolve_attribute: attribute to dissolve boundaries by
    '''
    # Generate dissolved shapefile
    df = fm.load_shapefile(in_path)
    df = sm.dissolve(df, dissolve_attribute)

    # Print potential errors
    sc.check_contiguity_and_contained(df, dissolve_attribute)

    # Save shapefile
    if out_path:
        fm.save_shapefile(df, out_path)

    return df
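
# Example usage (a sketch; the census-block path and 'COUNTYFP' attribute
# are hypothetical):
#
#     county_df = dissolve_by_attribute('data/state_blocks.shp', 'COUNTYFP',
#                                       out_path='data/state_counties.shp')
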
def disaggregate_file(shp_path, disaggregate_attr, direc_path, prefix='',
                      suffix=''):
    '''Take a larger shapefile and disaggregate it into smaller shapefiles
    according to an attribute. The directory and shapefile name will be
    prefix + disaggregate_attr value + suffix.

    NOTE: direc_path SHOULD NOT END WITH '/'

    Example: Use to disaggregate a statewide census block file into county
    census block files.

    If available, load shp_path with a pickle file rather than the actual
    shapefile. Loading in statewide census files takes a while.

    Arguments:
        shp_path: path to shapefile to disaggregate
        disaggregate_attr: attribute to disaggregate on
        direc_path: path to directory in which to create a subdirectory of
            smaller shapefiles for each unique value
        prefix: string to put in front of the name of smaller shapefiles
        suffix: string to put behind the name of smaller shapefiles
    '''
    # Load shapefile
    df = fm.load_shapefile(shp_path)

    # Get unique elements of the attribute
    attributes = set(df[disaggregate_attr])

    # For each attribute value create a subdirectory, create the smaller
    # shapefile, and save
    for attr in attributes:
        # Name of subdirectory and new shapefile
        name = prefix + attr + suffix
        subdirec = direc_path + '/' + name
        shp_name = name + '.shp'

        # Create subdirectory, replacing it if it already exists
        if os.path.exists(subdirec):
            shutil.rmtree(subdirec)
        os.mkdir(subdirec)

        # Create shapefile with the correct attribute value
        df_attr = df[df[disaggregate_attr] == attr]
        df_attr = gpd.GeoDataFrame(df_attr, geometry='geometry')
        fm.save_shapefile(df_attr, subdirec + '/' + shp_name)
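
# Example usage (a sketch; paths and attribute name are hypothetical). This
# would write one subdirectory per county, e.g. data/county_blocks/WI_55001_blocks/:
#
#     disaggregate_file('data/state_blocks.shp', 'COUNTYFP',
#                       'data/county_blocks', prefix='WI_', suffix='_blocks')
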
def remove_geometries(path_delete, save_path, path_reference, thresh):
    '''Delete geometries from a shapefile that do not have a fractional area
    of intersection above a given threshold.

    Arguments:
        path_delete: path to shapefile that we are editing (deleting shapes
            without enough intersection)
        save_path: path to save edited shapefile after geometries have been
            removed from the path_delete shapefile. If False, we will not
            save
        path_reference: path to shapefile we will be comparing the
            intersection with. Intersections will be taken with respect to
            the union of all of these geometries
        thresh: fraction threshold required to keep a shape. If thresh is
            0.9, then any shape with an intersection ratio greater than or
            equal to 0.9 will remain and anything below will be deleted

    Output:
        edited dataframe with shapes removed
    '''
    # Load shapefiles
    df_del = fm.load_shapefile(path_delete)
    df_ref = fm.load_shapefile(path_reference)

    # Get full reference polygon
    ref_poly = shp.ops.cascaded_union(list(df_ref['geometry']))

    # Get intersection ratio for each element
    df_del['ratio'] = df_del['geometry'].apply(
        lambda x: x.intersection(ref_poly).area / x.area)

    # Filter out elements below the threshold
    df_del = df_del[df_del.ratio >= thresh]

    # Drop the ratio series
    df_del = df_del.drop(columns=['ratio'])

    # Save and return
    if save_path:
        fm.save_shapefile(df_del, save_path)
    return df_del
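
# Example usage (a sketch with hypothetical paths): keep only shapes lying
# at least 90 percent inside the reference boundary:
#
#     df_kept = remove_geometries('data/precincts.shp',
#                                 'data/precincts_trimmed.shp',
#                                 'data/county_boundary.shp', 0.9)
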
def transform_crs(shp_paths, crs='epsg:4269'):
    '''Update the coordinate reference system for a set of shapefiles.

    Arguments:
        shp_paths: LIST of paths to shapefiles to be edited
        crs: the coordinate reference system to convert to. Default is
            epsg:4269

    Output:
        None, but the original files will be edited and updated
    '''
    # Iterate over all paths
    for path in shp_paths:
        # Load, set CRS, and save
        df = fm.load_shapefile(path)
        df = fm.set_CRS(df, crs)
        fm.save_shapefile(df, path)
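
# Example usage (a sketch with hypothetical paths): convert two shapefiles
# in place to NAD83 (EPSG:4269), the default above:
#
#     transform_crs(['data/blocks.shp', 'data/precincts.shp'])
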
def merge_shapefiles(paths_to_merge, out_path=False, keep_cols='all'):
    '''Combine multiple shapefiles into a single shapefile.

    Arguments:
        paths_to_merge: LIST of path strings of shapefiles to merge
        out_path: path to save new shapefile
        keep_cols: default 'all' means to keep all columns; otherwise this
            input takes a LIST of which columns/attributes to keep
    '''
    # Initialize output DataFrame
    df_final = pd.DataFrame()

    # Loop through paths and merge
    for path in paths_to_merge:
        # Load and append current dataframe
        df_current = fm.load_shapefile(path)
        df_final = df_final.append(df_current, ignore_index=True, sort=True)

    # Determine which columns/attributes to exclude when saving
    if keep_cols == 'all':
        exclude_cols = []
    else:
        exclude_cols = list(set(df_final.columns) - set(keep_cols))

    # Save final shapefile
    df_final = gpd.GeoDataFrame(df_final, geometry='geometry')
    if out_path:
        fm.save_shapefile(df_final, out_path, exclude_cols)

    return df_final
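
# Example usage (a sketch; paths and the 'GEOID' attribute are
# hypothetical): merge two county files, keeping only geometry and GEOID:
#
#     merged = merge_shapefiles(['data/county1.shp', 'data/county2.shp'],
#                               out_path='data/merged.shp',
#                               keep_cols=['GEOID', 'geometry'])
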
def create_bounding_frame(in_path, out_path=False):
    '''Create a bounding box around the extents of a shapefile.

    This will be used to overlay on top of a georeferenced image in GIS to
    allow for automated cropping in the algorithm that converts precinct
    images to shapefiles. We will usually use a census block shapefile to
    generate this bounding frame.

    Arguments:
        in_path: full path to input shapefile to create bounding frame for
        out_path: full path to save bounding frame shapefile
    '''
    # Generate bounding frame and save
    df = fm.load_shapefile(in_path)
    bounding_frame_df = sm.generate_bounding_frame(df)
    if out_path:
        fm.save_shapefile(bounding_frame_df, out_path)
    return bounding_frame_df
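
# Example usage (a sketch with hypothetical paths):
#
#     frame = create_bounding_frame('data/county_blocks.shp',
#                                   out_path='data/bounding_frame.shp')
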
def distribute_label(df_large, large_cols, df_small, small_cols=False,
                     small_path=False):
    '''Take labels from a shapefile with larger boundaries and interpolate
    them to a shapefile with smaller boundaries. By smaller boundaries we
    just mean more fine-grained geographic boundaries (i.e. census blocks
    are smaller than counties).

    We use the greatest-area method. However, when no intersection occurs,
    we simply use the nearest centroid.

    NOTE: By default interpolates a string type because it is a label.

    Arguments:
        df_large: larger shapefile giving the labels
        large_cols: LIST of attributes from the larger shp to interpolate
            to the smaller shp
        df_small: smaller shapefile receiving the labels
        small_cols: LIST of names for the attributes given by large_cols.
            Default is False, which means to use the same attribute names
        small_path: path to save the new dataframe to

    Output:
        edited df_small dataframe
    '''
    # Handle default for small_cols
    if small_cols is False:
        small_cols = large_cols

    # Check that large and small cols have the same number of attributes
    if len(small_cols) != len(large_cols):
        return False
    if not set(large_cols).issubset(set(df_large.columns)):
        return False

    # Let the index be an integer for spatial indexing purposes
    df_large.index = df_large.index.astype(int)

    # Drop small_cols in the small shp if they already exist
    drop_cols = set(small_cols).intersection(set(df_small.columns))
    df_small = df_small.drop(columns=drop_cols)

    # Initialize new series in the small shp
    for col in small_cols:
        df_small[col] = pd.Series(dtype=object)

    # Construct r-tree spatial index
    si = df_large.sindex

    # Get centroid for each geometry in the large shapefile
    df_large['centroid'] = df_large['geometry'].centroid

    # Find the appropriate matching large geometry for each small geometry
    for ix, row in df_small.iterrows():
        # Get potential matches
        small_poly = row['geometry']
        potential_matches = [
            df_large.index[i]
            for i in list(si.intersection(small_poly.bounds))
        ]

        # Only keep matches that have intersections
        matches = [
            m for m in potential_matches
            if df_large.at[m, 'geometry'].intersection(small_poly).area > 0
        ]

        # No intersections. Find the nearest centroid
        if len(matches) == 0:
            small_centroid = small_poly.centroid
            dist_series = df_large['centroid'].apply(
                lambda x: small_centroid.distance(x))
            large_ix = dist_series.idxmin()

        # One intersection. Only one match
        elif len(matches) == 1:
            large_ix = matches[0]

        # Multiple intersections. Compare fractional area of intersection
        else:
            area_df = df_large.loc[matches, :]
            area_series = area_df['geometry'].apply(
                lambda x: x.intersection(small_poly).area / small_poly.area)
            large_ix = area_series.idxmax()

        # Update values for the small geometry
        for j, col in enumerate(large_cols):
            df_small.at[ix, small_cols[j]] = df_large.at[large_ix, col]

    # Save and return the updated small dataframe
    if small_path:
        fm.save_shapefile(df_small, small_path)
    return df_small
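
# Example usage (a sketch; the paths and 'DISTRICT' attribute are
# hypothetical): copy district labels down to census blocks:
#
#     df_districts = fm.load_shapefile('data/districts.shp')
#     df_blocks = fm.load_shapefile('data/blocks.shp')
#     df_blocks = distribute_label(df_districts, ['DISTRICT'], df_blocks,
#                                  small_path='data/blocks_labeled.shp')
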
def distribute_values(df_source, source_cols, df_target, target_cols=False,
                      distribute_type='fractional', distribute_on='area',
                      distribute_round=False, distribute_path=False):
    '''Distribute attribute values of source geometries into the target
    geometries.

    An example would be calculating population in generated precincts. We
    would take census blocks (the source geometries) and sum up their values
    into the precincts (target geometries). This is an example of
    aggregation.

    Another example would be taking racial census tract data and
    disaggregating it to the census block level. This is an example of
    disaggregation.

    There are two types of distribution: fractional or winner take all. For
    disaggregation, we will rarely if ever use winner take all.

    We can distribute values on area or on another attribute such as
    population. However, this distributing attribute must be in the target
    dataframe.

    For source geometries that do not intersect with any target geometries,
    we find the nearest centroid.

    We give an option to round values while retaining column totals. We also
    give an option to save the updated target shapefile if given a path.

    Arguments:
        df_source: source shapefile providing the values to distribute
        source_cols: LIST of names of attributes in df_source to distribute
        df_target: target shapefile receiving values being distributed
        target_cols: LIST of names of attributes to create in df_target.
            Elements in this list correspond to elements in source_cols with
            the same index. Default is just the names in source_cols
        distribute_type: 'fractional' or 'winner take all'. Default is
            'fractional'
        distribute_on: either 'area' or an attribute in df_target to
            distribute values proportionally to. For disaggregation we
            usually do not want to use area as the distributing attribute
        distribute_round: whether to round values. If True, we will round
            values such that we retain column totals. If False, distributed
            values are simply left as floats
        distribute_path: path to save df_target to. Default is not to save

    Output:
        edited df_target dataframe
    '''
    # Handle default for target_cols
    if target_cols is False:
        target_cols = source_cols

    # Check that target_cols and source_cols have same number of attributes
    if len(source_cols) != len(target_cols):
        print('Different number of source_cols and target_cols')
        return False

    # Check that source_cols are actually in the dataframe
    if not set(source_cols).issubset(set(df_source.columns)):
        print('source_cols are not in dataframe')
        return False

    # Check that the type is either fractional or winner take all
    if distribute_type not in ('fractional', 'winner take all'):
        print('incorrect distribution type')
        return False

    # If we are not distributing on area, check that the distributing
    # attribute is in the dataframe
    if distribute_on != 'area' and distribute_on not in df_target.columns:
        print('distributing attribute not in dataframe')
        return False

    # Let the index of the target dataframe be an integer for indexing
    # purposes
    df_target.index = df_target.index.astype(int)

    # Drop target_cols already in the target shp
    drop_cols = set(target_cols).intersection(set(df_target.columns))
    df_target = df_target.drop(columns=drop_cols)

    # Initialize the new series in the target shp
    for col in target_cols:
        df_target[col] = 0.0

    # Ensure that source columns are floats for consistent adding
    for col in source_cols:
        df_source[col] = df_source[col].astype(float)

    # Construct r-tree spatial index
    si = df_target.sindex

    # Get centroid for each geometry in the target shapefile
    df_target['centroid'] = df_target['geometry'].centroid

    # Find the appropriate match between geometries
    for ix, row in df_source.iterrows():
        # Initialize fractional series. This gives the ratio to distribute
        # to each target geometry
        frac_agg = pd.Series(dtype=float)

        # Get potential matches
        source_poly = row['geometry']
        matches = [
            df_target.index[i]
            for i in list(si.intersection(source_poly.bounds))
        ]

        # Only keep matches that have intersections
        matches = [
            m for m in matches
            if df_target.at[m, 'geometry'].intersection(source_poly).area > 0
        ]

        # No intersections. Find the nearest centroid
        if len(matches) == 0:
            source_centroid = source_poly.centroid
            dist_series = df_target['centroid'].apply(
                lambda x: source_centroid.distance(x))
            frac_agg.at[dist_series.idxmin()] = 1

        # Only one intersecting geometry
        elif len(matches) == 1:
            frac_agg.at[matches[0]] = 1

        # More than one intersecting geometry
        else:
            agg_df = df_target.loc[matches, :]

            # Distribute on the proper column
            if distribute_on == 'area':
                frac_agg = agg_df['geometry'].apply(
                    lambda x: x.intersection(source_poly).area /
                    source_poly.area)

                # Add the proportion that does not intersect to the target
                # geometry with the largest intersection
                leftover = 1 - frac_agg.sum()
                frac_agg.at[frac_agg.idxmax()] += leftover
            else:
                agg_df[distribute_on] = agg_df[distribute_on].astype(float)
                agg_col_sum = agg_df[distribute_on].sum()
                frac_agg = agg_df[distribute_on].apply(
                    lambda x: float(x) / agg_col_sum)

        # Update value for the target geometry depending on distribute type
        for j, col in enumerate(target_cols):
            # Winner take all update
            if distribute_type == 'winner take all':
                target_ix = frac_agg.idxmax()
                df_target.loc[target_ix, col] += \
                    df_source.loc[ix, source_cols[j]]

            # Fractional update
            elif distribute_type == 'fractional':
                # Add the correct fraction
                for ix2, val in frac_agg.iteritems():
                    df_target.loc[ix2, col] += \
                        df_source.loc[ix, source_cols[j]] * val

    # Round if necessary, preserving column totals
    if distribute_round:
        for col in target_cols:
            # Find the indexes with the largest decimal values to round up
            round_down = df_target[col].apply(np.floor)
            decimal_val = df_target[col] - round_down
            n = int(np.round(decimal_val.sum()))
            round_up_ix = list(decimal_val.nlargest(n).index)

            # Round everything down and then increment the entries that
            # have the highest decimal values
            df_target[col] = round_down
            for ix3 in round_up_ix:
                df_target.loc[ix3, col] += 1

            # Set column values as integers
            df_target[col] = df_target[col].astype(int)

    # Drop the centroid attribute, then save and return
    df_target = df_target.drop(columns=['centroid'])
    if distribute_path:
        fm.save_shapefile(df_target, distribute_path)
    return df_target
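
# Example usage (a sketch; paths and column names are hypothetical):
# aggregate census-block population into precincts, with rounding so the
# column total is preserved:
#
#     df_blocks = fm.load_shapefile('data/blocks.shp')
#     df_precincts = fm.load_shapefile('data/precincts.shp')
#     df_precincts = distribute_values(df_blocks, ['POP'], df_precincts,
#                                      target_cols=['population'],
#                                      distribute_round=True)
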
def image_classification(shp_path, img_path, num_regions, num_colors=False,
                         out_path=False):
    '''Generate a dissolved boundary file of larger geometries according to
    how the geometry is colored in a corresponding image of the same
    geographic region.

    The image should be georeferenced to the boundary. Also, the image
    should be cropped to the extents of the geometry. It is usually better
    to do this by hand because the autocropping algorithm sometimes stops
    because of a single pixel difference.

    If there exists a noncontiguous larger shape, then the number of regions
    should be one greater because the algorithm will split noncontiguous
    regions.

    We limit the number of samples to 500 per geometry for speed purposes.

    Arguments:
        shp_path: path to the shapefile the algorithm will be performed on
        img_path: path to the image we use for the classification. This
            should already be cropped to the boundaries of the geometry.
            This can be performed with the function cropped_border_image
        num_regions: the number of regions that should remain at the end of
            the algorithm
        num_colors: the number of colors to reduce the image to. Sometimes
            helps if classification regions in the image are different
            shades of the same color within a region. Default is no
            reduction
        out_path: path to save the final dataframe if applicable. Default
            will not save

    Output:
        df_classified: dataframe with geometries classified into regions
        df: the original dataframe with color and region assignments
    '''
    # Load image and shapefile
    img = Image.open(img_path)
    if num_colors:
        img = si.reduce_colors(img, num_colors)
    img_arr = np.asarray(img)
    df = fm.load_shapefile(shp_path)

    # Create a color series and region series in the dataframe
    df['color'] = pd.Series(dtype=object)
    df['region'] = pd.Series(dtype=object)

    # Get the boundaries of the geodataframe
    bounds = shp.ops.cascaded_union(list(df['geometry'])).bounds
    shp_xlen = bounds[2] - bounds[0]
    shp_ylen = bounds[3] - bounds[1]
    shp_xmin = bounds[0]
    shp_ymin = bounds[1]

    # Assign each polygon its most common color
    for ix, row in df.iterrows():
        poly = row['geometry']
        df.at[ix, 'color'] = si.most_common_color(poly, img_arr, shp_xmin,
                                                  shp_xlen, shp_ymin,
                                                  shp_ylen, 500)

    # Assign each polygon with a certain color a region index
    for ix, color in enumerate(df['color'].unique()):
        df.loc[df['color'] == color, 'region'] = ix

    # Get the different region ids
    regions = list(df['region'].unique())

    # Create the classification dataframe
    df_classified = pd.DataFrame(columns=['region', 'geometry'])

    # Create classification geometries for each region
    for ix, region in enumerate(regions):
        df_region = df[df['region'] == region]
        polys = list(df_region['geometry'])
        df_classified.at[ix, 'geometry'] = shp.ops.cascaded_union(polys)
        df_classified.at[ix, 'region'] = region

    # Convert classified dataframe into a geodataframe
    df_classified = gpd.GeoDataFrame(df_classified, geometry='geometry')

    # Split noncontiguous regions and merge fully contained regions
    df_classified = sm.split_noncontiguous(df_classified)
    df_classified = sm.merge_fully_contained(df_classified)

    # Merge regions until we have the correct number
    df_classified = sm.merge_to_right_number(df_classified, num_regions)

    # Save file if necessary
    if out_path:
        fm.save_shapefile(df_classified, out_path)

    return df_classified, df
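
# Example usage (a sketch; paths and region/color counts are hypothetical):
# classify blocks into 15 regions from a georeferenced precinct map,
# reducing the image to 20 colors first:
#
#     df_regions, df_blocks = image_classification(
#         'data/blocks.shp', 'data/precinct_map.png', 15,
#         num_colors=20, out_path='data/regions.shp')
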
def clean_manual_classification(in_path, classification_col, out_path=False):
    '''Generate a dissolved boundary file of larger geometries from a
    geodataframe whose smaller geometries are assigned to a value designated
    by the classification column.

    Will auto-assign unassigned geometries using the greedy shared-perimeter
    method. Will also split noncontiguous geometries and merge fully
    contained geometries.

    Usually used when a user has manually classified census blocks into
    precincts and needs to clean up their work.

    Arguments:
        in_path: path to shapefile containing the smaller geometries
        classification_col: name of the column in df that identifies which
            larger "group" each smaller geometry belongs to
        out_path: path to save the final dataframe if applicable. Default is
            False and will not save
    '''
    df = fm.load_shapefile(in_path)

    # Obtain unique values in the classification column
    class_ids = list(df[classification_col].unique())

    # Determine the number of larger "groups"
    num_classes = len(class_ids)

    # Check if there are any unassigned census blocks
    if df[classification_col].isnull().any():
        # Decrement the number of regions because null is not an actual
        # region
        num_classes -= 1

        # Assign each unassigned block a unique dummy name
        for i, _ in df[df[classification_col].isnull()].iterrows():
            df.at[i, classification_col] = 'foobar' + str(i)

        # Update the classes to include the dummy groups
        class_ids = list(df[classification_col].unique())

    # Dissolve the boundaries given the group assignments for each small
    # geometry
    df = sm.dissolve(df, classification_col)

    # Split noncontiguous geometries after the dissolve
    df = sm.split_noncontiguous(df)

    # Merge geometries fully contained in other geometries
    df = sm.merge_fully_contained(df)

    # Merge the dummy groups away to get the correct number of regions
    df_nan = df[df[classification_col].str.slice(0, 6) == 'foobar']
    ixs_to_merge = df_nan.index.to_list()
    df = sm.merge_geometries(df, ixs_to_merge)

    # Drop the neighbors column and reset the indexes
    df = df.drop(columns=['neighbors'])
    df = df.reset_index(drop=True)

    # Save file if necessary
    if out_path:
        fm.save_shapefile(df, out_path)

    return df
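
# Example usage (a sketch; the path and 'precinct' attribute are
# hypothetical): clean up a manual block-to-precinct assignment:
#
#     df_clean = clean_manual_classification('data/blocks_assigned.shp',
#                                            'precinct',
#                                            out_path='data/precincts.shp')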