def test_parse_list_values(self):
    pg = Postgresql()
    insert = pg.parse_values_list(
        [{'gsId': 100, 'name': 'Max', 'zip_code': 123, 'state': 'CA', 'gsRating': 3.5},
         {'gsId': 101, 'name': 'Tez', 'zip_code': 123, 'state': 'CA', 'gsRating': 8.5}],
        {'gsId': 'INT', 'zip_code': 'INT', 'state': 'TEXT', 'name': 'TEXT', 'gsRating': 'FLOAT'},
        field_list=['gsId', 'name', 'gsRating'])
    assert insert == "(100, 'Max', 3.5),(101, 'Tez', 8.5)"
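# For reference, a minimal sketch of a parse_values_list implementation that
# satisfies the assertion above. This is a hypothetical reimplementation, not
# the actual Postgresql method; in particular, quoting only TEXT-typed fields
# is an assumption inferred from the expected output.
def parse_values_list(records, fields_types, field_list=None):
    if field_list is None:
        field_list = list(fields_types)
    rows = []
    for record in records:
        values = []
        for field in field_list:
            # Quote string-typed values; leave numeric types bare.
            if fields_types[field] == 'TEXT':
                values.append("'{}'".format(record[field]))
            else:
                values.append(str(record[field]))
        rows.append('(' + ', '.join(values) + ')')
    return ','.join(rows)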
def test_make_schema_string(self):
    pg = Postgresql()
    fields = {'gsid': 'INT', 'zip_code': 'INT', 'state': 'TEXT',
              'name': 'TEXT', 'gsrating': 'FLOAT'}
    primary_key = 'gsid'
    not_null_fields = ['gsid', 'zip_code', 'state', 'name']
    schema_string = pg.make_schema_string(fields, primary_key=primary_key,
                                          not_null_fields=not_null_fields)
    assert schema_string == ('(gsid INT PRIMARY KEY NOT NULL,zip_code INT NOT NULL,'
                             'state TEXT NOT NULL,name TEXT NOT NULL,gsrating FLOAT)')
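# For reference, a minimal sketch of a make_schema_string implementation that
# satisfies the assertion above. Again a hypothetical reimplementation, not the
# actual Postgresql method; note the expected string relies on the fields dict
# preserving insertion order (guaranteed in Python 3.7+).
def make_schema_string(fields, primary_key=None, not_null_fields=()):
    columns = []
    for name, sql_type in fields.items():
        column = '{} {}'.format(name, sql_type)
        if name == primary_key:
            column += ' PRIMARY KEY'
        if name in not_null_fields:
            column += ' NOT NULL'
        columns.append(column)
    return '(' + ','.join(columns) + ')'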
def push_major_cities():
    all_cities = requests.get(
        'http://api.census.gov/data/2013/acs3?get=NAME,B01001_001E&for=place:*')
    major_cities, _ = _parse_city_population(all_cities.json())
    # myan: a major city should have a population of at least 100,000
    major_cities = major_cities.loc[major_cities.population >= 100000]
    major_cities = _get_city_info(major_cities)
    postgres = Postgresql(user_name='postgres', password='******',
                          host='localhost', port='5432', db='TestProject')
    field_types = {'place_id': 'TEXT', 'city': 'TEXT', 'state': 'TEXT',
                   'population': 'INT', 'lat': 'FLOAT', 'lng': 'FLOAT'}
    postgres.initialize_table('TestMajorCities',
                              fields_types=field_types,
                              primary_key='place_id',
                              not_null_fields=['place_id', 'city', 'state'],
                              recreate=False)
    postgres.put_dataframe(major_cities, field_types)
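# _parse_city_population is not shown in this section. Below is a minimal
# sketch of what it plausibly does, assuming the standard Census API JSON
# layout (a list of rows whose first row is the header, e.g.
# ['NAME', 'B01001_001E', 'state', 'place']) and assuming the discarded second
# return value collects rows that failed to parse.
import pandas as pd

def _parse_city_population(raw_json):
    header, rows = raw_json[0], raw_json[1:]
    df = pd.DataFrame(rows, columns=header)
    # 'NAME' looks like 'Abilene city, Texas'; split it into city and state.
    df['city'] = df['NAME'].str.split(',').str[0]
    df['state'] = df['NAME'].str.split(',').str[1].str.strip()
    # B01001_001E is the ACS total-population estimate.
    df['population'] = pd.to_numeric(df['B01001_001E'], errors='coerce')
    failed = df[df['population'].isnull()]
    return df.dropna(subset=['population']), failed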
from UCB_MIDS_W205.Project.postgresql_handler import Postgresql
# Assumed import path for Datamodel, which this module uses but did not import:
from UCB_MIDS_W205.Project.datamodel import Datamodel

DEFAULT_PATH = '/home/myan/Downloads/Zillow_Data'


class ZillowDataHandler:
    def __init__(self, table=None, recreate=False):
        datamodel = Datamodel()
        if table is None:
            table, table_config = datamodel.zipcode_timeseries()
        else:
            _, table_config = datamodel.zipcode_timeseries()
        self.time_series_postgres = self._initialize_postgres(
            (table, table_config), recreate=recreate)

    def _initialize_postgres(self, table_and_config, recreate=False):
        # Tuple parameters in function signatures are Python 2 only syntax;
        # unpack inside the body so the code also runs on Python 3.
        table, config = table_and_config
        postgres = Postgresql(user_name='postgres', password='******',
                              host='localhost', port='5432', db='TestProject')
        postgres.initialize_table(table, recreate=recreate, **config)
        return postgres

    def get_data_from_zillow(self):
        # TODO: complete downloading data from web part
        pass

    def _drop_columns(self, data,
                      all_columns=('RegionID', 'City', 'Metro', 'SizeRank')):
        to_drop = []
        for column in all_columns:
            if column in data.columns:
                to_drop.append(column)
        data = data.drop(to_drop, axis=1)
        return data
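# Hypothetical usage sketch; the CSV file name below is illustrative, not a
# file guaranteed to exist under DEFAULT_PATH.
import pandas as pd

handler = ZillowDataHandler(recreate=False)
raw = pd.read_csv(DEFAULT_PATH + '/Zip_MedianValuePerSqft_AllHomes.csv')
# Drop Zillow's metadata columns (RegionID, City, Metro, SizeRank) if present,
# leaving the zip code and the monthly value columns.
cleaned = handler._drop_columns(raw)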
class Population:
    """
    This object retrieves geographic information for a list of addresses from the
    Google geocoding API and attaches the population of the closest major city
    (looked up from the TestMajorCities table) to each result.
    """

    def __init__(self, recreate=False):
        datamodel = Datamodel()
        self.table, self.table_config = datamodel.population()
        self.postgres = Postgresql(user_name='postgres', password='******',
                                   host='localhost', port='5432', db='TestProject')
        self.postgres.initialize_table(self.table, recreate=False,
                                       **self.table_config)
        self.googlegeo = GoogleGeo()
        # myan: only get major cities data once per request
        self.major_cities_postgres = Postgresql(user_name='postgres',
                                                password='******',
                                                host='localhost', port='5432',
                                                db='TestProject')
        self.major_cities = self.major_cities_postgres.get(
            "select * from TestMajorCities")
        self.all_states = self.major_cities['state'].values
        self.all_cities = self.major_cities['city'].values
        self.all_lats = self.major_cities['lat'].values
        self.all_lngs = self.major_cities['lng'].values
        self.all_population = self.major_cities['population'].values

    def run(self, **kwargs):
        # myan: deleting elements from a list while iterating over it is
        # error-prone in Python, so gather all API results first and then
        # decide what to include.
        results_df = pd.DataFrame(
            self._closest_city_population(self._geo_info(**kwargs)))
        existing_keys = self.postgres.get(
            "select place_id from {table};".format(table=self.table))
        addition_results = results_df.loc[np.logical_not(
            results_df['place_id'].isin(existing_keys['place_id'].values))]
        # avoid inplace drop_duplicates on a .loc slice
        addition_results = addition_results.drop_duplicates(subset='place_id')
        if len(addition_results) > 0:
            self.postgres.put_dataframe(addition_results,
                                        self.table_config['fields_types'],
                                        table=self.table)
        return results_df

    def _geo_info(self, addresses=None,
                  fields_to_get=('place_id', 'state', 'city', 'county', 'lat', 'lng')):
        """
        Get geo info from the Google API.

        Args:
            addresses: list of addresses

        Returns:
            list of dict, [{field1: value1, field2: value2, ...}, {...}]

        Examples:
            p = Population()
            results = p._geo_info(addresses=['Houston, TX', 'Dallas, TX'])
        """
        if fields_to_get is None:
            raise ValueError('Argument fields_to_get must not be None.')
        results = []
        for entry in addresses:
            output = self.googlegeo.get(entry, fields_to_get=fields_to_get)
            output.update(address=str(entry))
            if isinstance(entry, int):
                output.update(zip_code=entry)
            results.append(output)
        return results

    def _closest_city_population(self, tmp_results):
        for entry in tmp_results:
            is_curr_state = self.all_states == entry['state']
            same_state_cities = self.all_cities[is_curr_state]
            # myan: squared distance between two points is enough to find the minimum
            euc_distance = (self.all_lats[is_curr_state] - entry['lat']) ** 2 + \
                           (self.all_lngs[is_curr_state] - entry['lng']) ** 2
            closest_idx = euc_distance.argmin()
            # myan: once the closest major city is located, compute the actual
            # haversine distance
            haversine_distance = haversine(
                (self.all_lats[is_curr_state][closest_idx],
                 self.all_lngs[is_curr_state][closest_idx]),
                (entry['lat'], entry['lng']))
            if haversine_distance <= MAX_DISTANCE_KM:
                entry.update(closest_city=same_state_cities[closest_idx])
                entry.update(closest_city_population=self.all_population[is_curr_state][closest_idx])
            else:
                entry.update(closest_city='NULL')
                entry.update(closest_city_population='NULL')
        return tmp_results
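# The squared-degree distance above is only used to rank candidate cities; the
# final cutoff uses the true great-circle distance. The haversine(...) call most
# likely comes from the `haversine` PyPI package (which takes (lat, lng) tuples
# and returns kilometres by default); a self-contained equivalent for reference:
from math import asin, cos, radians, sin, sqrt

EARTH_RADIUS_KM = 6371.0

def haversine_km(point_a, point_b):
    # point_a and point_b are (lat, lng) pairs in decimal degrees.
    lat1, lng1 = map(radians, point_a)
    lat2, lng2 = map(radians, point_b)
    dlat, dlng = lat2 - lat1, lng2 - lng1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlng / 2) ** 2
    return 2 * EARTH_RADIUS_KM * asin(sqrt(a))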
class GreatSchools:
    """
    This object connects to the GreatSchools.org API and retrieves information
    about schools and GS ratings. See more information on:
    http://www.greatschools.org/api/docs/main.page
    """

    def __init__(self, key=None):
        if key is None:
            self.api_key = _get_great_schools_api_key()
        else:
            self.api_key = key
        # myan: initialize postgresql
        datamodel = Datamodel()
        self.table, self.table_config = datamodel.great_schools()
        self.postgres = Postgresql(user_name='postgres', password='******',
                                   host='localhost', port='5432', db='TestProject')
        self.postgres.initialize_table(self.table, recreate=False,
                                       **self.table_config)

    def set_api_key(self, key=None):
        self.api_key = key

    def run(self, **kwargs):
        # myan: deleting elements from a list while iterating over it is
        # error-prone in Python, so hold all API results in tmp_results first
        # and then decide what to include.
        tmp_results = self._nearby_schools(**kwargs)
        results = []
        existing_keys = self.postgres.get(
            "select gsid from {table};".format(table=self.table))
        for entry in tmp_results:
            if len(existing_keys) < 1 or entry['gsid'] not in existing_keys['gsid'].values:
                results.append(entry)
        self._push(results)
        return results

    def _push(self, data, batch_size=500):
        fields_list = list(self.table_config['fields_types'].keys())
        fields_to_push = self.postgres.construct_db_field_string(fields_list)
        start_idx = 0
        while start_idx < len(data):
            end_idx = min(len(data), start_idx + batch_size)
            values_to_insert = self.postgres.parse_values_list(
                data[start_idx:end_idx], self.table_config['fields_types'],
                fields_list)
            start_idx = end_idx
            self.postgres.put(self.table, fields=fields_to_push,
                              values=values_to_insert)

    def _nearby_schools(self, state=None, zip_code=None, radius=5, limit=10):
        """
        Gets a list of schools for a specified physical location (i.e. state +
        zip_code) within a certain radius.

        Args:
            state: string, two-letter state abbreviation
            zip_code: int, five-digit ZIP code
            radius: int, search radius
            limit: int, maximum number of schools to return

        Returns:
            list, [dict(gsId=int, name=string, gsRating=float), dict(...), ...]

        Examples:
            gs = GreatSchools(key='Your GS Key')
            results = gs._nearby_schools(state='TX', zip_code=75228, limit=2)
            # [{'gsId': '1769', 'gsRating': '3', 'name': 'Bryan Adams High School'},
            #  {'gsId': '7566', 'name': 'White Rock Montessori School'}]
        """
        self._check_key()
        url = ("http://api.greatschools.org/schools/nearby?key={key}&state={state}"
               "&radius={radius}&zip={zip_code}&limit={limit}").format(
                   key=self.api_key, state=state, zip_code=zip_code,
                   radius=radius, limit=limit)
        results = self._run(url, key_string='school',
                            result_fields=[(int, 'gsId'), (None, 'name'),
                                           (float, 'gsRating')],
                            zip_code=zip_code, state=state)
        return results

    def _run(self, url, key_string="school", result_fields=None, zip_code=None,
             state=None):
        """
        Generic method to extract data from API calls.

        Args:
            url: string, the API call url to retrieve data
            key_string: string, the parent field in the XML file
            result_fields: list, [(func, field), ...] where func can be int, float etc.

        Returns:
            list, [dict(field_1=value_1, field_2=value_2, ...), dict(...)]
        """
        nearby = requests.get(url)
        results = []
        for school in ElementTree.fromstring(nearby.content).findall(key_string):
            curr_result = dict(zip_code=zip_code, state=state)
            try:
                for (func, field) in result_fields:
                    if func is None:
                        curr_result[field.lower()] = school.find(field).text
                    else:
                        curr_result[field.lower()] = func(school.find(field).text)
            except (AttributeError, TypeError, ValueError):
                # myan: skip fields that are missing or fail to parse
                pass
            if curr_result:
                results.append(curr_result)
        return results

    def _check_key(self):
        if self.api_key is None:
            raise ValueError(
                "Use .set_api_key() method to set Great School API Keys first.")
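# Hypothetical usage sketch; requires a real GreatSchools API key.
gs = GreatSchools(key='YOUR_GS_API_KEY')
schools = gs.run(state='TX', zip_code=75228, radius=5, limit=20)
for school in schools:
    # _run lower-cases the field names, so keys are gsid/name/gsrating.
    print(school['gsid'], school.get('name'), school.get('gsrating'))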
class Plotter:
    def __init__(self, min_price=150000, max_price=300000, top_percentage=0.25,
                 top_max_num_entries=30):
        self.MIN_PRICE = min_price
        self.MAX_PRICE = max_price
        self.TOP_PERCENTAGE = top_percentage
        self.TOP_MAX_NUM_ENTRIES = top_max_num_entries
        datamodel = Datamodel()
        self.time_series_postgres = self._initialize_postgres(
            datamodel.zipcode_timeseries())
        self.population_postgres = self._initialize_postgres(datamodel.population())
        self.great_schools_postgres = self._initialize_postgres(
            datamodel.great_schools())
        self.zipcode_timeseries = None
        self.top_zipcodes_timeseries = None
        self.rest_zipcodes_timeseries = None
        self.top_zipcodes_school_data = None
        self.top_zipcodes_population_data = None
        self.all_months_timeseries = None

    def reset(self):
        self.zipcode_timeseries = None
        self.top_zipcodes_timeseries = None
        self.rest_zipcodes_timeseries = None
        self.top_zipcodes_school_data = None
        self.top_zipcodes_population_data = None
        self.all_months_timeseries = None

    def _get_time_series_data(self, year_month=201510):
        suitable_states = self.time_series_postgres.get(
            "select * from {table} where year_month={year_month} "
            "and median_price < {max_price} and median_price > {min_price}".format(
                table=self.time_series_postgres.table, year_month=year_month,
                max_price=self.MAX_PRICE, min_price=self.MIN_PRICE))
        suitable_states['gross_yield_pct'] = (
            suitable_states.median_rent * 12 / suitable_states.median_price * 100)
        # sort_values replaces the long-deprecated DataFrame.sort
        suitable_states = suitable_states.sort_values('gross_yield_pct',
                                                      ascending=False)
        top_index = min(int(len(suitable_states) * self.TOP_PERCENTAGE),
                        self.TOP_MAX_NUM_ENTRIES)
        self.zipcode_timeseries = suitable_states
        self.top_zipcodes_timeseries = suitable_states[:top_index]
        self.rest_zipcodes_timeseries = suitable_states[top_index:]

    def _get_population_data(self):
        if self.top_zipcodes_timeseries is None:
            self._get_time_series_data()
        p = Population(recreate=False)
        self.top_zipcodes_population_data = p.run(
            addresses=self.top_zipcodes_timeseries.zip_code.values)

    def _get_top_zipcodes_school_data(self):
        all_states = self.top_zipcodes_timeseries.state.values
        all_zipcodes = self.top_zipcodes_timeseries.zip_code.values
        unique_states = self.top_zipcodes_timeseries.state.unique()
        user_requests = []  # renamed from `requests` to avoid shadowing the module
        for state in unique_states:
            unique_zipcodes = np.unique(all_zipcodes[all_states == state])
            for zip_code in unique_zipcodes:
                user_requests.append(dict(
                    db_configs=dict(
                        postgres=self.great_schools_postgres,
                        query=("select * from {table} where state='{state}' "
                               "and zip_code={zip_code};").format(
                                   table=self.great_schools_postgres.table,
                                   state=state, zip_code=zip_code)),
                    api_configs=dict(api=GreatSchools, api_key=None,
                                     api_args=dict(state=state,
                                                   zip_code=zip_code,
                                                   limit=20))))
        mission_control = MissionControl()
        gs_data = mission_control.request_data(user_requests=user_requests)
        # DataFrame.append was removed in pandas 2.0; concatenate instead.
        gs_data_df = pd.DataFrame()
        for entry in gs_data:
            gs_data_df = pd.concat([gs_data_df, pd.DataFrame(entry)],
                                   ignore_index=True)
        self.top_zipcodes_school_data = pd.DataFrame(
            gs_data_df.groupby('zip_code')['gsrating'].mean())
        self.top_zipcodes_school_data['zip_code'] = self.top_zipcodes_school_data.index

    def _initialize_postgres(self, table_and_config):
        # Tuple parameters in function signatures are Python 2 only syntax;
        # unpack inside the body so the code also runs on Python 3.
        table, config = table_and_config
        postgres = Postgresql(user_name='postgres', password='******',
                              host='localhost', port='5432', db='TestProject')
        postgres.initialize_table(table, recreate=False, **config)
        return postgres
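# Hypothetical usage sketch of the Plotter pipeline, chaining the private
# helpers defined above in their intended order:
plotter = Plotter(min_price=150000, max_price=300000)
plotter._get_time_series_data(year_month=201510)  # rank zip codes by gross yield
plotter._get_population_data()                    # closest-major-city population
plotter._get_top_zipcodes_school_data()           # mean GreatSchools rating per zip
print(plotter.top_zipcodes_timeseries[['zip_code', 'gross_yield_pct']].head())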