class Crime(object):
    """In-sample OLS baselines over the ``congressional_districts_data`` table.

    Two models are offered: ``run_ols`` regresses ``gid_theft`` on
    ``gid_not_theft`` alone, and ``run_spatial_ols`` additionally uses
    ``neigh_not_theft``.  Both print the error returned by ``mse`` for the
    same rows the model was fitted on.

    NOTE(review): ``Postgres``, ``pd``, ``plt``, ``linear_model`` and ``mse``
    come from file-level imports that are not visible here; presumably they
    are the project DB helper, pandas, matplotlib.pyplot and scikit-learn.
    """

    # Labels applied, in order, to the columns returned by ``SELECT *``.
    # NOTE(review): this relies on the table's physical column order -- confirm.
    _COLUMNS = (
        'gid',
        'gid_theft',
        'gid_not_theft',
        'neigh_theft',
        'neigh_not_theft',
    )

    def __init__(self):
        # The meaning of the ``False`` flag is defined by ``Postgres`` and is
        # not visible from this class.
        self.p = Postgres(False)

    def get_data(self):
        """Return every row of ``congressional_districts_data`` as a DataFrame.

        Returns:
            A ``pd.DataFrame`` whose columns are named after ``_COLUMNS``.
        """
        query = """
            SELECT *
            FROM congressional_districts_data
        """
        rows = self.p.query_list(query)
        return pd.DataFrame(rows, columns=list(self._COLUMNS))

    def plot(self, x, y, y_hat):
        """Show a black scatter of ``y`` over ``x`` with ``y_hat`` as a blue line.

        Args:
            x: Values for the horizontal axis.
            y: Observed values.
            y_hat: Fitted values drawn as a line over the same ``x``.
        """
        plt.scatter(x, y, color='black')
        plt.plot(x, y_hat, color='blue', linewidth=3)
        # Empty tick sequences: the axes are drawn without tick marks.
        plt.xticks(())
        plt.yticks(())
        plt.show()

    @staticmethod
    def _column_vector(frame, name):
        """Return column ``name`` of ``frame`` shaped ``(n, 1)``.

        ``reshape`` is used rather than the in-place ``ndarray.resize`` so the
        array handed out by ``.values`` (which may share memory with the
        DataFrame) never has its own shape modified.
        """
        values = frame[name].values
        return values.reshape(len(values), 1)

    def _fit_and_report(self, label, x, y):
        """Fit a linear regression of ``y`` on ``x`` and print its in-sample error.

        The line is formatted explicitly so the output is identical under
        Python 2 and Python 3 (``label``, one space, then the score).
        """
        regr = linear_model.LinearRegression()
        regr.fit(x, y)
        score = mse(y, regr.predict(x))
        print("%s %s" % (label, score))

    def run_spatial_ols(self):
        """Regress ``gid_theft`` on ``gid_not_theft`` and ``neigh_not_theft``.

        Prints ``spatial ols <score>``; returns ``None``.
        """
        data = self.get_data()
        y = self._column_vector(data, 'gid_theft')
        x = data[['gid_not_theft', 'neigh_not_theft']]
        # Figure recorded next to this call in the original source: 9888290.8158
        self._fit_and_report("spatial ols", x, y)

    def run_ols(self):
        """Regress ``gid_theft`` on ``gid_not_theft`` only.

        Prints ``ols <score>``; returns ``None``.
        """
        data = self.get_data()
        y = self._column_vector(data, 'gid_theft')
        x = self._column_vector(data, 'gid_not_theft')
        # Figure recorded next to this call in the original source: 10987133.1401
        self._fit_and_report("ols", x, y)

    def run(self):
        """Placeholder entry point; intentionally does nothing."""
        pass
class Crime(object):
    """OLS of daily theft-type crime counts on all other daily crime counts.

    Data is aggregated per calendar day from the ``crimes`` table and fitted
    with ``ps.spreg.OLS``.

    NOTE(review): ``Postgres``, ``pd``, ``plt``, ``ps`` and ``mse`` come from
    file-level imports that are not visible here; presumably the project DB
    helper, pandas, matplotlib.pyplot, PySAL and a mean-squared-error function.
    """

    # ``primary_type`` values counted as "theft"; every other non-NULL
    # ``primary_type`` is counted as "not theft".
    _THEFT_TYPES = ('THEFT', 'BURGLARY', 'ROBBERY', 'MOTOR VEHICLE THEFT')

    def __init__(self):
        self.name = 'Ben'
        self.p = Postgres()

    def get_data(self):
        """Return one row per day with theft and non-theft crime counts.

        Returns:
            A ``pd.DataFrame`` with columns ``date``, ``theft``, ``not_theft``.
        """
        # The type list is a fixed class constant (not caller input), so it is
        # rendered straight into the SQL once and reused for IN / NOT IN.
        theft_types = ", ".join("'%s'" % name for name in self._THEFT_TYPES)
        query = """
            SELECT
                CAST(date AS DATE),
                COUNT(CASE WHEN primary_type IN (%(types)s) THEN 1 END),
                COUNT(CASE WHEN primary_type NOT IN (%(types)s) THEN 1 END)
            FROM crimes
            GROUP BY 1
        """ % {'types': theft_types}
        rows = self.p.query_list(query)
        return pd.DataFrame(rows, columns=['date', 'theft', 'not_theft'])

    def plot(self, x, y, y_hat):
        """Show a black scatter of ``y`` over ``x`` with ``y_hat`` as a blue line.

        Args:
            x: Values for the horizontal axis.
            y: Observed values.
            y_hat: Fitted values drawn as a line over the same ``x``.
        """
        plt.scatter(x, y, color='black')
        plt.plot(x, y_hat, color='blue', linewidth=3)
        # Empty tick sequences: the axes are drawn without tick marks.
        plt.xticks(())
        plt.yticks(())
        plt.show()

    @staticmethod
    def _column_vector(frame, name):
        """Return column ``name`` of ``frame`` shaped ``(n, 1)``.

        ``reshape`` is used rather than the in-place ``ndarray.resize`` so the
        array handed out by ``.values`` (which may share memory with the
        DataFrame) never has its own shape modified.
        """
        values = frame[name].values
        return values.reshape(len(values), 1)

    def run_ols(self):
        """Fit ``theft ~ not_theft`` and print the model summary and its error.

        Prints ``m1.summary`` followed by ``mse`` of the observed values
        against the flattened fitted values; returns ``None``.
        """
        data = self.get_data()
        # Both inputs are passed to the regression as (n, 1) columns.
        y = self._column_vector(data, 'theft')
        x = self._column_vector(data, 'not_theft')

        m1 = ps.spreg.OLS(y, x, name_x=['not_theft'], name_y='theft')
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(m1.summary)
        print(mse(y, m1.predy.flatten()))

    def run(self):
        """Placeholder entry point; intentionally does nothing."""
        pass
class Crime(object):
    """Train/test OLS models for ``gid_crime_theft`` from lagged features.

    Rows are read from ``final_data.<shape>``.  Twelve measures are used with
    a one-row lag per ``gid`` (ordered by ``dt``) and four measures are used
    unlagged, each for the ``gid`` itself and for its ``neigh`` counterpart.
    ``run_ols`` uses only the ``gid`` features; ``run_spatial_ols`` uses the
    ``gid`` and ``neigh`` features together.

    NOTE(review): ``Postgres``, ``pd``, ``plt``, ``linear_model``, ``mse``,
    ``randint``, ``datetime`` and ``timedelta`` come from file-level imports
    that are not visible here.
    """

    _TARGET = 'gid_crime_theft'

    # Column-name prefixes: the unit itself, then its neighbours.
    _SCOPES = ('gid', 'neigh')

    # Measures read through LAG(..., 1); exposed as ``prev_<scope>_<measure>``.
    _LAGGED_MEASURES = (
        'lights_ally_homeless_false',
        'lights_ally_homeless_true',
        'building_violations',
        'crime_non_theft',
        'crime_theft',
        'food_fail',
        'food_pass',
        'food_pass_w_conditions',
        'graffitti',
        'sanitation_requests',
        'vacant_gang_false',
        'vacant_gang_true',
    )

    # Measures read without a lag; exposed as ``<scope>_<measure>``.
    _UNLAGGED_MEASURES = (
        'tweets_bad',
        'tweets_good',
        'liquor_licenses',
        'red_light_tickets',
    )

    # Fixed folds: (train start, train end, test start, test end), fold 1 first.
    _CROSS_VAL_WINDOWS = (
        ('2016-10-01', '2017-03-31', '2017-04-01', '2017-04-30'),
        ('2016-09-01', '2017-02-28', '2017-03-01', '2017-03-31'),
        ('2016-08-01', '2017-01-31', '2017-02-01', '2017-02-28'),
        ('2016-07-01', '2016-12-31', '2017-01-01', '2017-01-31'),
        ('2016-06-01', '2016-11-30', '2016-12-01', '2016-12-31'),
        ('2016-05-01', '2016-10-31', '2016-11-01', '2016-11-30'),
        ('2016-04-01', '2016-09-30', '2016-10-01', '2016-10-31'),
        ('2016-03-01', '2016-08-31', '2016-09-01', '2016-09-30'),
        ('2016-02-01', '2016-07-31', '2016-08-01', '2016-08-31'),
        ('2016-01-01', '2016-06-30', '2016-07-01', '2016-07-31'),
    )

    def __init__(self):
        # The meaning of the ``False`` flag is defined by ``Postgres`` and is
        # not visible from this class.
        self.p = Postgres(False)
        # Folds keyed 1..10, each ``{'train': {...}, 'test': {...}}`` with
        # inclusive ``start``/``end`` date strings.
        self.cross_val = self._build_cross_val()

    @classmethod
    def _build_cross_val(cls):
        """Expand ``_CROSS_VAL_WINDOWS`` into the nested fold dictionary."""
        return {
            fold: {
                'train': {'start': train_start, 'end': train_end},
                'test': {'start': test_start, 'end': test_end},
            }
            for fold, (train_start, train_end, test_start, test_end)
            in enumerate(cls._CROSS_VAL_WINDOWS, 1)
        }

    @classmethod
    def _feature_columns(cls, scopes):
        """Return DataFrame feature names for ``scopes``.

        All lagged columns come first (scope by scope), followed by all
        unlagged columns (scope by scope).
        """
        lagged = [
            'prev_%s_%s' % (scope, measure)
            for scope in scopes
            for measure in cls._LAGGED_MEASURES
        ]
        unlagged = [
            '%s_%s' % (scope, measure)
            for scope in scopes
            for measure in cls._UNLAGGED_MEASURES
        ]
        return lagged + unlagged

    @classmethod
    def _build_query(cls, shape, start, end):
        """Build the SQL selecting target and features for one date window.

        LAG is evaluated in the inner query, before the date filter of the
        outer query is applied, so the first day of a window can still see
        the preceding row.  Rows with no preceding row are dropped.

        NOTE(review): ``shape``, ``start`` and ``end`` are interpolated
        directly into the SQL text; they must come from trusted code.
        """
        lag_template = (
            'LAG(%(col)s, 1) OVER (partition by gid ORDER BY dt ASC) '
            'prev_%(col)s'
        )
        select_list = ['dt', 'gid', cls._TARGET]
        for scope in cls._SCOPES:
            select_list.extend(
                lag_template % {'col': '%s_%s' % (scope, measure)}
                for measure in cls._LAGGED_MEASURES
            )
        for scope in cls._SCOPES:
            select_list.extend(
                '%s_%s' % (scope, measure)
                for measure in cls._UNLAGGED_MEASURES
            )
        return """
            SELECT *
            FROM (
                SELECT
                    %s
                FROM final_data.%s
            ) C
            WHERE prev_gid_crime_theft IS NOT NULL
              AND dt >= '%s'
              AND dt <= '%s'
        """ % (',\n                    '.join(select_list), shape, start, end)

    def get_cv(self, future_days, training_days, iterations=0):
        """Generate random train/test windows.

        For each fold a random anchor day is drawn from 2016-03-02 through
        2017-03-16 (2016-03-01 plus 1..380 days).  Training covers the
        ``training_days`` days ending the day before the anchor; the test
        window is the single day ``future_days - 1`` days after the anchor.

        Args:
            future_days: Forecast horizon in days (1 means the anchor day).
            training_days: Length of the training window in days.
            iterations: Number of folds to generate.  Defaults to 0, which
                matches the previously hard-coded behaviour of returning an
                empty list.

        Returns:
            A list of ``{'train': {'start', 'end'}, 'test': {'start', 'end'}}``
            dictionaries with ``YYYY-MM-DD`` strings.
        """
        horizon = timedelta(future_days - 1)
        folds = []
        for _ in range(iterations):
            anchor = datetime(2016, 3, 1) + timedelta(randint(1, 380))
            test_day = (anchor + horizon).strftime('%Y-%m-%d')
            folds.append({
                'train': {
                    'start': (anchor - timedelta(training_days)).strftime('%Y-%m-%d'),
                    'end': (anchor - timedelta(1)).strftime('%Y-%m-%d'),
                },
                'test': {'start': test_day, 'end': test_day},
            })
        return folds

    def get_data(self, dates, shape):
        """Load the train and test DataFrames for one fold.

        Args:
            dates: ``{'train': {'start', 'end'}, 'test': {'start', 'end'}}``
                with inclusive date strings.
            shape: Name of the table inside the ``final_data`` schema.

        Returns:
            ``{'train': DataFrame, 'test': DataFrame}`` with columns ``dt``,
            ``gid``, ``gid_crime_theft`` followed by every feature column.
        """
        columns = ['dt', 'gid', self._TARGET] + self._feature_columns(self._SCOPES)
        frames = {'train': None, 'test': None}
        for data_set in ('train', 'test'):
            window = dates[data_set]
            query = self._build_query(shape, window['start'], window['end'])
            rows = self.p.query_list(query)
            frames[data_set] = pd.DataFrame(rows, columns=columns)
        return frames

    def plot(self, x, y, y_hat):
        """Show a black scatter of ``y`` over ``x`` with ``y_hat`` as a blue line.

        Args:
            x: Values for the horizontal axis.
            y: Observed values.
            y_hat: Fitted values drawn as a line over the same ``x``.
        """
        plt.scatter(x, y, color='black')
        plt.plot(x, y_hat, color='blue', linewidth=3)
        # Empty tick sequences: the axes are drawn without tick marks.
        plt.xticks(())
        plt.yticks(())
        plt.show()

    @staticmethod
    def _column_vector(frame, name):
        """Return column ``name`` of ``frame`` shaped ``(n, 1)``.

        ``reshape`` is used rather than the in-place ``ndarray.resize`` so the
        array handed out by ``.values`` (which may share memory with the
        DataFrame) never has its own shape modified.
        """
        values = frame[name].values
        return values.reshape(len(values), 1)

    def _fit_and_score(self, dates, shape, features, show_predictions=False):
        """Fit on the train window and return ``mse`` on the test window.

        Observed values are passed as ``(n, 1)`` columns for both windows so
        they always line up with the ``(n, 1)`` predictions.
        """
        data = self.get_data(dates, shape)
        train, test = data['train'], data['test']

        regr = linear_model.LinearRegression()
        regr.fit(train[features], self._column_vector(train, self._TARGET))

        predictions = regr.predict(test[features])
        if show_predictions:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(predictions)
        return mse(self._column_vector(test, self._TARGET), predictions)

    def run_spatial_ols(self, dates, shape):
        """Fit with ``gid`` and ``neigh`` features; print test predictions.

        Args:
            dates: Fold dictionary as accepted by ``get_data``.
            shape: Name of the table inside the ``final_data`` schema.

        Returns:
            The value of ``mse`` on the test window.
        """
        features = self._feature_columns(self._SCOPES)
        # Figure recorded next to this return in the original source: 9888290.8158
        return self._fit_and_score(dates, shape, features, show_predictions=True)

    def run_ols(self, dates, shape):
        """Fit with ``gid`` features only (no neighbour columns).

        Args:
            dates: Fold dictionary as accepted by ``get_data``.
            shape: Name of the table inside the ``final_data`` schema.

        Returns:
            The value of ``mse`` on the test window.
        """
        features = self._feature_columns(('gid',))
        # Figure recorded next to this return in the original source: 10987133.1401
        return self._fit_and_score(dates, shape, features)

    def run(self):
        """Placeholder entry point; intentionally does nothing."""
        pass