def __init__(self):
    """Set up empty feature tables, the San Francisco bounding box and
    grid bins, the shapefile node coordinates, and feature metadata."""
    # Accumulators for raw and smoothed interpolated features.
    self.features = pd.DataFrame()
    self.fsmooth = pd.DataFrame()
    # Bounding box for San Francisco.
    self.latmin, self.latmax = 37.70784, 37.8195
    self.lonmin, self.lonmax = -122.5185, -122.35454
    # 101 edges -> a 100 x 100 grid over the box.
    self.latbins = np.linspace(self.latmin, self.latmax, 101)
    self.lonbins = np.linspace(self.lonmin, self.lonmax, 101)
    # Shapefile clipped to the box; its points are the interpolation nodes.
    self.shapefile = self.window(get_db('usc_shapefile'))
    self.nodelat = self.shapefile.lat
    self.nodelon = self.shapefile.lon
    self.edges = None
    # Available feature columns and the SQL tables they come from.
    self.allfeatures = ('taxable_value grocery restaurant retail ncrimes '
                        'sgnf avg_hh_size population walkscore').split()
    self.alltables = ('assessment business sfpd usc_age_gender '
                      'usc_household usc_pop walkscore').split()
    # Human-readable display name for each feature.
    self.featurenames = dict(
        taxable_value='Property Value', grocery='Grocery',
        restaurant='Restaurants', retail='Retail', ncrimes='Crime',
        sgnf='Female:Male ratio', avg_hh_size='Household Size',
        population='Population', walkscore='Walkscore')
    # Smoothing level applied per feature.
    self.smoothing = dict(
        taxable_value=0.01, grocery=0.1, restaurant=0.01, retail=0.3,
        ncrimes=0.1, sgnf=0.01, avg_hh_size=0.1, population=1,
        walkscore=0)
def __init__(self):
    # Accumulators: raw interpolated features and their smoothed versions.
    self.features = pd.DataFrame()
    self.fsmooth = pd.DataFrame()
    # San Francisco city boundaries
    self.latmin = 37.70784
    self.latmax = 37.8195
    self.lonmin = -122.5185
    self.lonmax = -122.35454
    # 101 bin edges -> a 100 x 100 grid spanning the city window.
    self.latbins = np.linspace(self.latmin, self.latmax, 101)
    self.lonbins = np.linspace(self.lonmin, self.lonmax, 101)
    # Census shapefile restricted to the window above; its lat/lon points
    # are the nodes every feature gets interpolated onto.
    self.shapefile = self.window(get_db('usc_shapefile'))
    self.nodelat = self.shapefile.lat
    self.nodelon = self.shapefile.lon
    self.edges = None  # placeholder; presumably populated later -- not set here
    # list of features available, feature names
    self.allfeatures = ['taxable_value', 'grocery', 'restaurant', 'retail',
                        'ncrimes', 'sgnf', 'avg_hh_size', 'population',
                        'walkscore']
    # list of all sql tables used
    self.alltables = ['assessment', 'business', 'sfpd', 'usc_age_gender',
                      'usc_household', 'usc_pop', 'walkscore']
    # Formatted feature name dict
    self.featurenames = {'taxable_value': 'Property Value',
                         'grocery': 'Grocery',
                         'restaurant': 'Restaurants',
                         'retail': 'Retail',
                         'ncrimes': 'Crime',
                         'sgnf': 'Female:Male ratio',
                         'avg_hh_size': 'Household Size',
                         'population': 'Population',
                         'walkscore': 'Walkscore'}
    # smoothing level for each feature
    self.smoothing = {'taxable_value': 0.01,
                      'grocery': 0.1,
                      'restaurant': 0.01,
                      'retail': 0.3,
                      'ncrimes': 0.1,
                      'sgnf': 0.01,
                      'avg_hh_size': 0.1,
                      'population': 1,
                      'walkscore': 0}
def add_features(self, flist, how='usc', verbose=False):
    """Load each requested table, reduce it to (lat, lon, value) columns,
    and interpolate the values onto the shapefile nodes, appending one
    column per feature to self.features.

    Args:
        flist : table/feature names to load (see self.alltables),
            list of strings
        how : NOTE(review): unused in this body -- possibly a leftover or
            consumed by an override elsewhere; confirm before removing.
        verbose : whether to print load progress to console, boolean

    Returns:
        None
    """
    for f in flist:
        if verbose:
            print 'loading ', f
            sys.stdout.flush()
        # load database table
        df1 = get_db(f)
        # merge in lat/lon for census data from shapefile
        if f in ['usc_age_gender', 'usc_household', 'usc_pop']:
            df1 = df1.merge(self.shapefile, left_on='id2', right_on='geoid')
        df1 = self.window(df1)
        # handling each table type
        if f == 'assessment':
            # mean taxable value per lat/lon grid cell
            df1 = self.binlatlon(df1)
            df1 = df1[['lat_cut', 'lon_cut', 'taxable_value']]
            df1 = df1.groupby(['lat_cut', 'lon_cut']).mean()
            df1 = df1.reset_index().dropna()
            df1.columns = ['lat', 'lon', 'taxable_value']
        elif f == 'business':
            # count businesses per (cell, category), pivot categories into
            # columns, then keep grocery/restaurant/retail counts per cell
            df1 = self.binlatlon(df1)
            df1 = df1[['lat_cut', 'lon_cut', 'category']]
            df1['count'] = 1
            df1 = df1.groupby(['lat_cut', 'lon_cut', 'category']).count()
            df1 = df1.reset_index().dropna()
            df2 = df1.pivot(columns='category', values='count').fillna(0)
            df1 = df1.merge(df2, left_index=True, right_index=True)
            df1.drop('category', axis=1, inplace=True)
            df1 = df1.groupby(['lat_cut', 'lon_cut']).sum().reset_index()
            df1 = df1[['lat_cut', 'lon_cut', 'grocery', 'restaurant',
                       'retail']]
            df1.columns = ['lat', 'lon', 'grocery', 'restaurant', 'retail']
        elif f == 'sfpd':
            # number of crime incidents per grid cell
            df1 = self.binlatlon(df1)
            df1['ncrimes'] = 1
            df1 = df1.groupby(['lat_cut', 'lon_cut']).count()
            df1 = df1.dropna().reset_index()
            df1 = df1[['lat_cut', 'lon_cut', 'ncrimes']]
            df1.columns = ['lat', 'lon', 'ncrimes']
        elif f == 'usc_age_gender':
            # signed female fraction 2*f/total - 1, in [-1, 1]; rows where
            # the ratio is NaN end up at -1 (fillna(0) happens before the -1)
            df1['sgnf'] = (2 * df1.f / df1.total).fillna(0) - 1
            df1 = df1[['lat', 'lon', 'sgnf']]
        elif f == 'usc_household':
            # calc average household size
            total_p = 0
            p_range = range(1, 8)
            for i in p_range:
                col = 'p' + str(i)
                total_p += df1[col] * i
            av_p = total_p / df1.total
            df1['avg_hh_size'] = av_p
            df1.fillna(0, inplace=True)
            df1 = df1[['lat', 'lon', 'avg_hh_size']]
        elif f == 'usc_pop':
            df1 = df1[['lat', 'lon', 'total']]
            df1.columns = ['lat', 'lon', 'population']
        elif f == 'walkscore':
            df1 = df1[['lat', 'lon', 'walkscore']]
        # append results to final data frame
        for col in df1.columns[2:]:
            finterp = bin_interpolate(df1.lon, df1.lat, df1[col],
                                      self.nodelon, self.nodelat)
            finterp = pd.Series(finterp, name=col)
            if self.features.shape == (0, 0):
                # first feature: seed the frame with the node coordinates
                self.features = pd.concat((self.nodelat, self.nodelon,
                                           finterp), axis=1)
            else:
                self.features = pd.concat((self.features, finterp), axis=1)
def add_features(self, flist, verbose=False): """ Load and featurize selected features Args: featurelist : list of names of features to plot. available names in self.allfeatures, list of strings verbose : whether to print status to console, boolean Returns: None """ for f in flist: if verbose: print 'loading ', f sys.stdout.flush() # load database table df1 = get_db(f) # merge in lat/lon for census data from shapefile if f in ['usc_age_gender', 'usc_household', 'usc_pop']: df1 = df1.merge(self.shapefile, left_on='id2', right_on='geoid') df1 = self.window(df1) # handling each table type if f == 'assessment': df1 = self.binlatlon(df1) df1 = df1[['lat_cut', 'lon_cut', 'taxable_value']] df1 = df1.groupby(['lat_cut', 'lon_cut']).mean() df1 = df1.reset_index().dropna() df1.columns = ['lat', 'lon', 'taxable_value'] elif f == 'business': df1 = self.binlatlon(df1) df1 = df1[['lat_cut', 'lon_cut', 'category']] df1['count'] = 1 df1 = df1.groupby(['lat_cut', 'lon_cut', 'category']).count() df1 = df1.reset_index().dropna() df2 = df1.pivot(columns='category', values='count').fillna(0) df1 = df1.merge(df2, left_index=True, right_index=True) df1.drop('category', axis=1, inplace=True) df1 = df1.groupby(['lat_cut', 'lon_cut']).sum().reset_index() df1 = df1[['lat_cut', 'lon_cut', 'grocery', 'restaurant', 'retail']] df1.columns = ['lat', 'lon', 'grocery', 'restaurant', 'retail'] elif f == 'sfpd': df1 = self.binlatlon(df1) df1['ncrimes'] = 1 df1 = df1.groupby(['lat_cut', 'lon_cut']).count() df1 = df1.dropna().reset_index() df1 = df1[['lat_cut', 'lon_cut', 'ncrimes']] df1.columns = ['lat', 'lon', 'ncrimes'] elif f == 'usc_age_gender': df1['sgnf'] = (2 * df1.f / df1.total).fillna(0) - 1 df1 = df1[['lat', 'lon', 'sgnf']] elif f == 'usc_household': # calc average household size total_p = 0 p_range = range(1, 8) for i in p_range: col = 'p' + str(i) total_p += df1[col] * i av_p = total_p / df1.total df1['avg_hh_size'] = av_p df1.fillna(0, inplace=True) df1 = df1[['lat', 'lon', 
'avg_hh_size']] elif f == 'usc_pop': df1 = df1[['lat', 'lon', 'total']] df1.columns = ['lat', 'lon', 'population'] elif f == 'walkscore': df1 = df1[['lat', 'lon', 'walkscore']] # append results to final data frame for col in df1.columns[2:]: finterp = bin_interpolate(df1.lon, df1.lat, df1[col], self.nodelon, self.nodelat) finterp = pd.Series(finterp, name=col) if self.features.shape == (0, 0): self.features = pd.concat((self.nodelat, self.nodelon, finterp), axis=1) else: self.features = pd.concat((self.features, finterp), axis=1)