Beispiel #1
0
    def __init__(self):
        """
        Create empty feature frames, the lat/lon window and bin edges,
        the windowed shapefile node coordinates, and the per-feature
        lookup tables.
        """
        # filled in later; both start out as empty frames
        self.features = pd.DataFrame()
        self.fsmooth = pd.DataFrame()

        # lat/lon bounds; assigned ahead of the self.window() call below
        # (presumably window() reads them -- confirm against its body)
        self.latmin, self.latmax = 37.70784, 37.8195
        self.lonmin, self.lonmax = -122.5185, -122.35454

        # 101 evenly spaced values spanning each axis of the window
        n_edges = 101
        self.latbins = np.linspace(self.latmin, self.latmax, n_edges)
        self.lonbins = np.linspace(self.lonmin, self.lonmax, n_edges)

        # node coordinates come from the windowed shapefile table
        raw_shapefile = get_db('usc_shapefile')
        self.shapefile = self.window(raw_shapefile)
        self.nodelat = self.shapefile.lat
        self.nodelon = self.shapefile.lon

        self.edges = None

        # one row per feature: (key, display label, smoothing level)
        feature_rows = [
            ('taxable_value', 'Property Value', 0.01),
            ('grocery', 'Grocery', 0.1),
            ('restaurant', 'Restaurants', 0.01),
            ('retail', 'Retail', 0.3),
            ('ncrimes', 'Crime', 0.1),
            ('sgnf', 'Female:Male ratio', 0.01),
            ('avg_hh_size', 'Household Size', 0.1),
            ('population', 'Population', 1),
            ('walkscore', 'Walkscore', 0),
        ]

        self.allfeatures = [row[0] for row in feature_rows]

        self.alltables = [
            'assessment',
            'business',
            'sfpd',
            'usc_age_gender',
            'usc_household',
            'usc_pop',
            'walkscore',
        ]

        self.featurenames = dict(
            (key, label) for key, label, _ in feature_rows)

        self.smoothing = dict(
            (key, level) for key, _, level in feature_rows)
Beispiel #2
0
    def __init__(self):
        """
        Initialise empty feature frames, the city window and its bin
        edges, the shapefile node coordinates, and the feature metadata.
        """
        self.features, self.fsmooth = pd.DataFrame(), pd.DataFrame()

        # San Francisco city boundaries (south/north, then west/east)
        self.latmin, self.latmax = 37.70784, 37.8195
        self.lonmin, self.lonmax = -122.5185, -122.35454

        # 101 evenly spaced values across each side of the window
        self.latbins, self.lonbins = (
            np.linspace(self.latmin, self.latmax, 101),
            np.linspace(self.lonmin, self.lonmax, 101),
        )

        # windowed shapefile; its lat/lon columns are the node coordinates
        windowed = self.window(get_db('usc_shapefile'))
        self.shapefile = windowed
        self.nodelat = windowed.lat
        self.nodelon = windowed.lon

        self.edges = None

        # keys of every available feature
        self.allfeatures = [
            'taxable_value',
            'grocery',
            'restaurant',
            'retail',
            'ncrimes',
            'sgnf',
            'avg_hh_size',
            'population',
            'walkscore',
        ]

        # names of every sql table used
        self.alltables = [
            'assessment',
            'business',
            'sfpd',
            'usc_age_gender',
            'usc_household',
            'usc_pop',
            'walkscore',
        ]

        # display label per feature, listed in self.allfeatures order
        labels = (
            'Property Value',
            'Grocery',
            'Restaurants',
            'Retail',
            'Crime',
            'Female:Male ratio',
            'Household Size',
            'Population',
            'Walkscore',
        )
        self.featurenames = dict(zip(self.allfeatures, labels))

        # smoothing level per feature, listed in self.allfeatures order
        levels = (0.01, 0.1, 0.01, 0.3, 0.1, 0.01, 0.1, 1, 0)
        self.smoothing = dict(zip(self.allfeatures, levels))
Beispiel #3
0
    def add_features(self, flist, how='usc', verbose=False):
        """
        Load the requested tables and append their features, interpolated
        onto the shapefile nodes, to ``self.features``.

        Args:
            flist : iterable of table names passed to get_db. The names
                with dedicated handling are 'assessment', 'business',
                'sfpd', 'usc_age_gender', 'usc_household', 'usc_pop'
                and 'walkscore'.
            how : not used by this method; kept so that callers which
                pass it keep working.
            verbose : when true, write a 'loading  <name>' line to stdout
                before each table is read.
        Returns:
            None
        """
        census_tables = ('usc_age_gender', 'usc_household', 'usc_pop')

        for f in flist:
            if verbose:
                # Same text the Python 2 statement "print 'loading ', f"
                # produced, but also valid syntax on Python 3.
                sys.stdout.write('loading  {0}\n'.format(f))
                sys.stdout.flush()

            # load database table
            df1 = get_db(f)

            # merge in lat/lon for census data from shapefile
            if f in census_tables:
                df1 = df1.merge(self.shapefile,
                                left_on='id2',
                                right_on='geoid')

            df1 = self.window(df1)

            # Each branch reduces df1 to [lat, lon, <value columns...>].
            # A name matched by no branch leaves df1 untouched, so every
            # column after its first two is interpolated further down.
            if f == 'assessment':
                # mean taxable value per (lat bin, lon bin)
                df1 = self.binlatlon(df1)
                df1 = df1[['lat_cut', 'lon_cut', 'taxable_value']]
                df1 = df1.groupby(['lat_cut', 'lon_cut']).mean()
                df1 = df1.reset_index().dropna()
                df1.columns = ['lat', 'lon', 'taxable_value']

            elif f == 'business':
                # count rows per (lat bin, lon bin, category), spread the
                # categories into columns, then total them per bin
                df1 = self.binlatlon(df1)
                df1 = df1[['lat_cut', 'lon_cut', 'category']]
                df1['count'] = 1
                df1 = df1.groupby(['lat_cut', 'lon_cut', 'category']).count()
                df1 = df1.reset_index().dropna()
                df2 = df1.pivot(columns='category', values='count').fillna(0)
                df1 = df1.merge(df2, left_index=True, right_index=True)
                df1.drop('category', axis=1, inplace=True)
                df1 = df1.groupby(['lat_cut', 'lon_cut']).sum().reset_index()
                df1 = df1[[
                    'lat_cut', 'lon_cut', 'grocery', 'restaurant', 'retail'
                ]]
                df1.columns = ['lat', 'lon', 'grocery', 'restaurant', 'retail']

            elif f == 'sfpd':
                # number of rows per (lat bin, lon bin)
                df1 = self.binlatlon(df1)
                df1['ncrimes'] = 1
                df1 = df1.groupby(['lat_cut', 'lon_cut']).count()
                df1 = df1.dropna().reset_index()
                df1 = df1[['lat_cut', 'lon_cut', 'ncrimes']]
                df1.columns = ['lat', 'lon', 'ncrimes']

            elif f == 'usc_age_gender':
                # 2*f/total - 1; a NaN ratio is treated as 0 and so ends
                # up as -1
                df1['sgnf'] = (2 * df1.f / df1.total).fillna(0) - 1
                df1 = df1[['lat', 'lon', 'sgnf']]

            elif f == 'usc_household':
                # average household size: columns p1..p7 weighted by
                # their size index, divided by the total column
                weighted = sum(df1['p' + str(i)] * i for i in range(1, 8))
                df1['avg_hh_size'] = weighted / df1.total
                df1.fillna(0, inplace=True)

                df1 = df1[['lat', 'lon', 'avg_hh_size']]

            elif f == 'usc_pop':
                df1 = df1[['lat', 'lon', 'total']]
                df1.columns = ['lat', 'lon', 'population']

            elif f == 'walkscore':
                df1 = df1[['lat', 'lon', 'walkscore']]

            # append results to final data frame, one column per feature
            for col in df1.columns[2:]:
                finterp = bin_interpolate(df1.lon, df1.lat, df1[col],
                                          self.nodelon, self.nodelat)
                finterp = pd.Series(finterp, name=col)

                # NOTE(review): concat(axis=1) aligns rows on index. If
                # bin_interpolate returns a plain array, finterp gets a
                # fresh 0..n-1 index while nodelat/nodelon keep the
                # shapefile's index -- confirm self.window() yields a
                # default index, otherwise rows will not line up.
                if self.features.shape == (0, 0):
                    # first feature: seed the frame with the node coords
                    self.features = pd.concat(
                        (self.nodelat, self.nodelon, finterp), axis=1)
                else:
                    self.features = pd.concat((self.features, finterp), axis=1)
Beispiel #4
0
    def add_features(self, flist, verbose=False):
        """
        Load and featurize the selected tables, appending the results
        (interpolated onto the shapefile nodes) to ``self.features``.

        Args:
            flist : list of table names to load via get_db, list of
                strings. The names with dedicated handling are
                'assessment', 'business', 'sfpd', 'usc_age_gender',
                'usc_household', 'usc_pop' and 'walkscore'.
            verbose : whether to print status to console, boolean
        Returns:
            None
        """
        census_tables = ('usc_age_gender', 'usc_household', 'usc_pop')

        for f in flist:
            if verbose:
                # Same text the Python 2 statement "print 'loading ', f"
                # produced, but also valid syntax on Python 3.
                sys.stdout.write('loading  {0}\n'.format(f))
                sys.stdout.flush()

            # load database table
            df1 = get_db(f)

            # merge in lat/lon for census data from shapefile
            if f in census_tables:
                df1 = df1.merge(self.shapefile, left_on='id2',
                                right_on='geoid')
            df1 = self.window(df1)

            # Each branch reduces df1 to [lat, lon, <value columns...>].
            # A name matched by no branch leaves df1 untouched, so every
            # column after its first two is interpolated further down.
            if f == 'assessment':
                # mean taxable value per (lat bin, lon bin)
                df1 = self.binlatlon(df1)
                df1 = df1[['lat_cut', 'lon_cut', 'taxable_value']]
                df1 = df1.groupby(['lat_cut', 'lon_cut']).mean()
                df1 = df1.reset_index().dropna()
                df1.columns = ['lat', 'lon', 'taxable_value']

            elif f == 'business':
                # count rows per (lat bin, lon bin, category), spread the
                # categories into columns, then total them per bin
                df1 = self.binlatlon(df1)
                df1 = df1[['lat_cut', 'lon_cut', 'category']]
                df1['count'] = 1
                df1 = df1.groupby(['lat_cut', 'lon_cut', 'category']).count()
                df1 = df1.reset_index().dropna()
                df2 = df1.pivot(columns='category',
                                values='count').fillna(0)
                df1 = df1.merge(df2, left_index=True, right_index=True)
                df1.drop('category', axis=1, inplace=True)
                df1 = df1.groupby(['lat_cut', 'lon_cut']).sum().reset_index()
                df1 = df1[['lat_cut', 'lon_cut', 'grocery',
                           'restaurant', 'retail']]
                df1.columns = ['lat', 'lon', 'grocery',
                               'restaurant', 'retail']

            elif f == 'sfpd':
                # number of rows per (lat bin, lon bin)
                df1 = self.binlatlon(df1)
                df1['ncrimes'] = 1
                df1 = df1.groupby(['lat_cut', 'lon_cut']).count()
                df1 = df1.dropna().reset_index()
                df1 = df1[['lat_cut', 'lon_cut', 'ncrimes']]
                df1.columns = ['lat', 'lon', 'ncrimes']

            elif f == 'usc_age_gender':
                # 2*f/total - 1; a NaN ratio is treated as 0 and so ends
                # up as -1
                df1['sgnf'] = (2 * df1.f / df1.total).fillna(0) - 1
                df1 = df1[['lat', 'lon', 'sgnf']]

            elif f == 'usc_household':
                # average household size: columns p1..p7 weighted by
                # their size index, divided by the total column
                weighted = sum(df1['p' + str(i)] * i for i in range(1, 8))
                df1['avg_hh_size'] = weighted / df1.total
                df1.fillna(0, inplace=True)

                df1 = df1[['lat', 'lon', 'avg_hh_size']]

            elif f == 'usc_pop':
                df1 = df1[['lat', 'lon', 'total']]
                df1.columns = ['lat', 'lon', 'population']

            elif f == 'walkscore':
                df1 = df1[['lat', 'lon', 'walkscore']]

            # append results to final data frame, one column per feature
            for col in df1.columns[2:]:
                finterp = bin_interpolate(df1.lon, df1.lat, df1[col],
                                          self.nodelon, self.nodelat)
                finterp = pd.Series(finterp, name=col)

                # NOTE(review): concat(axis=1) aligns rows on index. If
                # bin_interpolate returns a plain array, finterp gets a
                # fresh 0..n-1 index while nodelat/nodelon keep the
                # shapefile's index -- confirm self.window() yields a
                # default index, otherwise rows will not line up.
                if self.features.shape == (0, 0):
                    # first feature: seed the frame with the node coords
                    self.features = pd.concat((self.nodelat, self.nodelon,
                                               finterp), axis=1)
                else:
                    self.features = pd.concat((self.features, finterp),
                                              axis=1)