Esempio n. 1
0
 def test_make_schema_string(self):
     """Verify that make_schema_string renders the primary-key and NOT NULL constraints."""
     handler = Postgresql()
     field_types = {'gsid': 'INT', 'zip_code': 'INT', 'state': 'TEXT', 'name': 'TEXT', 'gsrating': 'FLOAT'}
     result = handler.make_schema_string(field_types, primary_key='gsid', not_null_fields=['gsid', 'zip_code', 'state', 'name'])
     expected = '(gsid INT PRIMARY KEY NOT NULL,zip_code INT NOT NULL,state TEXT NOT NULL,name TEXT NOT NULL,gsrating FLOAT)'
     assert result == expected
Esempio n. 2
0
    def __init__(self, recreate=False):
        """Connect to Postgres, initialize the population table and cache major-city data.

        Args:
            recreate: if True, drop and rebuild the population table on startup.
        """
        datamodel = Datamodel()
        self.table, self.table_config = datamodel.population()
        self.postgres = Postgresql(user_name='postgres',
                                   password='******',
                                   host='localhost',
                                   port='5432',
                                   db='TestProject')
        # Bug fix: the recreate argument was previously ignored (hard-coded False).
        self.postgres.initialize_table(self.table,
                                       recreate=recreate,
                                       **self.table_config)
        self.googlegeo = GoogleGeo()

        # myan: only get major cities data once per request
        self.major_cities_postgres = Postgresql(user_name='postgres',
                                                password='******',
                                                host='localhost',
                                                port='5432',
                                                db='TestProject')
        self.major_cities = self.major_cities_postgres.get(
            "select * from TestMajorCities")
        self.all_states = self.major_cities['state'].values
        self.all_cities = self.major_cities['city'].values
        self.all_lats = self.major_cities['lat'].values
        self.all_lngs = self.major_cities['lng'].values
        self.all_population = self.major_cities['population'].values
Esempio n. 3
0
    def test_parse_list_values(self):
        """parse_values_list should render only the requested fields, in order."""
        pg = Postgresql()
        records = [
            {'gsId': 100, 'name': 'Max', 'zip_code': 123, 'state': 'CA', 'gsRating': 3.5},
            {'gsId': 101, 'name': 'Tez', 'zip_code': 123, 'state': 'CA', 'gsRating': 8.5},
        ]
        field_types = {
            'gsId': 'INT',
            'zip_code': 'INT',
            'state': 'TEXT',
            'name': 'TEXT',
            'gsRating': 'FLOAT'
        }
        insert = pg.parse_values_list(records,
                                      field_types,
                                      field_list=['gsId', 'name', 'gsRating'])
        assert insert == "(100, 'Max', 3.5),(101, 'Tez', 8.5)"
Esempio n. 4
0
    def test_parse_list_values(self):
        """Two input rows restricted to a field subset become one VALUES string."""
        pg = Postgresql()
        types = dict(gsId='INT', zip_code='INT', state='TEXT', name='TEXT', gsRating='FLOAT')
        rows = [dict(gsId=100, name='Max', zip_code=123, state='CA', gsRating=3.5),
                dict(gsId=101, name='Tez', zip_code=123, state='CA', gsRating=8.5)]
        result = pg.parse_values_list(rows, types, field_list=['gsId', 'name', 'gsRating'])

        assert result == "(100, 'Max', 3.5),(101, 'Tez', 8.5)"
Esempio n. 5
0
 def __init__(self, key=None):
     """Store the GreatSchools API key and prepare the backing Postgres table.

     Args:
         key: API key; when None, the locally configured key is loaded.
     """
     self.api_key = key if key is not None else _get_great_schools_api_key()
     # myan: initialize postgresql
     datamodel = Datamodel()
     self.table, self.table_config = datamodel.great_schools()
     self.postgres = Postgresql(user_name='postgres',
                                password='******',
                                host='localhost',
                                port='5432',
                                db='TestProject')
     self.postgres.initialize_table(self.table, recreate=False, **self.table_config)
Esempio n. 6
0
 def test_make_schema_string(self):
     """make_schema_string should emit the full column spec with constraints."""
     pg = Postgresql()
     field_types = {
         'gsid': 'INT',
         'zip_code': 'INT',
         'state': 'TEXT',
         'name': 'TEXT',
         'gsrating': 'FLOAT'
     }
     result = pg.make_schema_string(field_types,
                                    primary_key='gsid',
                                    not_null_fields=['gsid', 'zip_code', 'state', 'name'])
     expected = ('(gsid INT PRIMARY KEY NOT NULL,'
                 'zip_code INT NOT NULL,'
                 'state TEXT NOT NULL,'
                 'name TEXT NOT NULL,'
                 'gsrating FLOAT)')
     assert result == expected
def push_major_cities():
    """Download city populations from the Census API and store major cities in Postgres.

    A "major" city is one with a population of at least 100,000. Each such city
    is enriched with geo info before being written to the TestMajorCities table.
    """
    response = requests.get('http://api.census.gov/data/2013/acs3?get=NAME,B01001_001E&for=place:*')
    # Fail fast on HTTP errors instead of surfacing a confusing JSON decode error.
    response.raise_for_status()
    major_cities, _ = _parse_city_population(response.json())
    major_cities = major_cities.loc[major_cities.population >= 100000]  # myan: major city should have more than 100000
    major_cities = _get_city_info(major_cities)

    postgres = Postgresql(user_name='postgres', password='******', host='localhost', port='5432', db='TestProject')
    field_types = {'place_id': 'TEXT', 'city': 'TEXT', 'state': 'TEXT', 'population': 'INT',
                   'lat': 'FLOAT', 'lng': 'FLOAT'}
    postgres.initialize_table('TestMajorCities',
                              fields_types=field_types,
                              primary_key='place_id',
                              not_null_fields=['place_id', 'city', 'state'],
                              recreate=False)
    postgres.put_dataframe(major_cities, field_types)
def push_major_cities():
    """Load major US cities (population >= 100,000) from the Census API into Postgres."""
    census_response = requests.get(
        'http://api.census.gov/data/2013/acs3?get=NAME,B01001_001E&for=place:*'
    )
    cities, _ = _parse_city_population(census_response.json())
    # myan: major city should have more than 100000
    cities = cities.loc[cities.population >= 100000]
    cities = _get_city_info(cities)

    db = Postgresql(user_name='postgres',
                    password='******',
                    host='localhost',
                    port='5432',
                    db='TestProject')
    field_types = dict(place_id='TEXT',
                       city='TEXT',
                       state='TEXT',
                       population='INT',
                       lat='FLOAT',
                       lng='FLOAT')
    db.initialize_table('TestMajorCities',
                        fields_types=field_types,
                        primary_key='place_id',
                        not_null_fields=['place_id', 'city', 'state'],
                        recreate=False)
    db.put_dataframe(cities, field_types)
Esempio n. 9
0
    def __init__(self, recreate=False):
        """Connect to Postgres, initialize the population table and cache major-city data.

        Args:
            recreate: if True, drop and rebuild the population table.
        """
        datamodel = Datamodel()
        self.table, self.table_config = datamodel.population()
        self.postgres = Postgresql(user_name='postgres',
                                   password='******',
                                   host='localhost',
                                   port='5432',
                                   db='TestProject')
        # Bug fix: pass the caller's flag through instead of hard-coding False.
        self.postgres.initialize_table(self.table, recreate=recreate, **self.table_config)
        self.googlegeo = GoogleGeo()

        # myan: only get major cities data once per request
        self.major_cities_postgres = Postgresql(user_name='postgres',
                                                password='******',
                                                host='localhost',
                                                port='5432',
                                                db='TestProject')
        self.major_cities = self.major_cities_postgres.get("select * from TestMajorCities")
        self.all_states = self.major_cities['state'].values
        self.all_cities = self.major_cities['city'].values
        self.all_lats = self.major_cities['lat'].values
        self.all_lngs = self.major_cities['lng'].values
        self.all_population = self.major_cities['population'].values
Esempio n. 10
0

class ZillowDataHandler:
    def __init__(self, table=None, recreate=False):
        """Resolve the target table and its config, then open the Postgres handle.

        Args:
            table: table name to use; defaults to the datamodel's zip-code
                time-series table.
            recreate: if True, drop and recreate the table.
        """
        datamodel = Datamodel()
        default_table, table_config = datamodel.zipcode_timeseries()
        if table is None:
            table = default_table
        self.time_series_postgres = self._initialize_postgres(
            (table, table_config), recreate=recreate)

    def _initialize_postgres(self, table_config, recreate=False):
        """Create a Postgres handler and make sure the given table exists.

        Args:
            table_config: (table_name, config_dict) pair; the config is forwarded
                to initialize_table as keyword arguments.
            recreate: if True, drop and recreate the table.

        Returns:
            The connected Postgresql handler.
        """
        # Python 3 removed tuple parameter unpacking (PEP 3113); unpack explicitly.
        table, config = table_config
        postgres = Postgresql(user_name='postgres',
                              password='******',
                              host='localhost',
                              port='5432',
                              db='TestProject')
        postgres.initialize_table(table, recreate=recreate, **config)
        return postgres

    def get_data_from_zillow(self):
        """Placeholder for downloading the raw Zillow dataset from the web."""
        # TODO: complete downloading data from web part
        pass

    def _drop_columns(self,
                      data,
                      all_columns=('RegionID', 'City', 'Metro', 'SizeRank')):
        to_drop = []
        for column in all_columns:
            if column in data.columns:
Esempio n. 11
0
from UCB_MIDS_W205.Project.postgresql_handler import Postgresql

DEFAULT_PATH = '/home/myan/Downloads/Zillow_Data'


class ZillowDataHandler:
    """Loads Zillow zip-code time-series data into the project's Postgres tables."""

    def __init__(self, table=None, recreate=False):
        """Resolve the time-series table/config and connect to Postgres.

        Args:
            table: target table name; defaults to the datamodel's zip-code
                time-series table.
            recreate: if True, drop and recreate the table.
        """
        datamodel = Datamodel()
        if table is None:
            table, table_config = datamodel.zipcode_timeseries()
        else:
            _, table_config = datamodel.zipcode_timeseries()
        self.time_series_postgres = self._initialize_postgres((table, table_config), recreate=recreate)

    def _initialize_postgres(self, table_config, recreate=False):
        """Open a Postgres connection and initialize the table described by table_config.

        Bug fix: the signature previously used Python 2 tuple parameter
        unpacking `(table, config)`, which is a SyntaxError in Python 3
        (PEP 3113). Callers still pass a single (table, config) tuple.
        """
        table, config = table_config
        postgres = Postgresql(user_name='postgres', password='******', host='localhost', port='5432',
                              db='TestProject')
        postgres.initialize_table(table, recreate=recreate, **config)
        return postgres

    def get_data_from_zillow(self):
        """Placeholder for downloading the raw Zillow dataset from the web."""
        # TODO: complete downloading data from web part
        pass

    def _drop_columns(self, data, all_columns=('RegionID', 'City', 'Metro', 'SizeRank')):
        """Drop the Zillow metadata columns from data when present.

        Args:
            data: pandas DataFrame of Zillow time-series data.
            all_columns: candidate column names to remove.

        Returns:
            The DataFrame without the listed columns.
        """
        to_drop = [column for column in all_columns if column in data.columns]
        data = data.drop(to_drop, axis=1)
        return data
Esempio n. 12
0
class Population:
    """
    Enriches addresses / zip codes with Google geo data and the population of
    the closest major city, persisting newly seen rows to Postgres.

    NOTE(review): the previous docstring described the GreatSchools.org API; it
    was copy-pasted from the GreatSchools class and did not apply here.
    """

    def __init__(self, recreate=False):
        """Connect to Postgres, initialize the population table and cache major-city data.

        Args:
            recreate: if True, drop and rebuild the population table.
        """
        datamodel = Datamodel()
        self.table, self.table_config = datamodel.population()
        self.postgres = Postgresql(user_name='postgres',
                                   password='******',
                                   host='localhost',
                                   port='5432',
                                   db='TestProject')
        # Bug fix: honor the recreate argument (it was hard-coded to False).
        self.postgres.initialize_table(self.table, recreate=recreate, **self.table_config)
        self.googlegeo = GoogleGeo()

        # myan: only get major cities data once per request
        self.major_cities_postgres = Postgresql(user_name='postgres',
                                                password='******',
                                                host='localhost',
                                                port='5432',
                                                db='TestProject')
        self.major_cities = self.major_cities_postgres.get("select * from TestMajorCities")
        self.all_states = self.major_cities['state'].values
        self.all_cities = self.major_cities['city'].values
        self.all_lats = self.major_cities['lat'].values
        self.all_lngs = self.major_cities['lng'].values
        self.all_population = self.major_cities['population'].values

    def run(self, **kwargs):
        """Geo-code addresses, attach closest-city population, and store unseen rows.

        Keyword args are forwarded to _geo_info (addresses, fields_to_get).

        Returns:
            pandas DataFrame with one row per input address.
        """
        # myan: seems python has a strange way of handling memory pointers when deleting elements from lists in a loop
        # therefore create a separate list tmp_results to hold all the results from API calls first and decide what to
        # include.
        results_df = pd.DataFrame(self._closest_city_population(self._geo_info(**kwargs)))
        existing_keys = self.postgres.get("select place_id from {table};".format(table=self.table))
        # Only push rows whose place_id is not already stored.
        addition_results = results_df.loc[np.logical_not(results_df['place_id'].isin(existing_keys['place_id'].values))]
        addition_results.drop_duplicates(subset='place_id', inplace=True)
        if len(addition_results) > 0:
            self.postgres.put_dataframe(addition_results, self.table_config['fields_types'], table=self.table)
        return results_df

    def _geo_info(self, addresses=None, fields_to_get=('place_id', 'state', 'city', 'county', 'lat', 'lng')):
        """
        Get geo info from Google API
        Args:
            addresses: list of addresses (strings, or integer zip codes)
            fields_to_get: fields to request from the Google geo API

        Returns:
            list of dict, [{field1:value1, field2:value2, ...}, {...}]

        Examples:
            p = Population()
            results = p._geo_info(addresses=['Houston,TX', 'Dallas, TX'])
        """

        if fields_to_get is None:
            raise ValueError('Argument fields_to_get must not be None.')
        results = []
        for entry in addresses:
            output = self.googlegeo.get(entry, fields_to_get=fields_to_get)
            output.update(address=str(entry))
            # Integer entries are zip codes; record them explicitly.
            if isinstance(entry, int):
                output.update(zip_code=entry)
            results.append(output)
        return results

    def _closest_city_population(self, tmp_results):
        """Annotate each geo result with the closest major city within MAX_DISTANCE_KM.

        Mutates and returns tmp_results; entries with no major city close
        enough get 'NULL' placeholders.
        """
        for entry in tmp_results:
            is_curr_state = self.all_states == entry['state']
            same_state_cities = self.all_cities[is_curr_state]
            # myan: use a simple squared distance between two points to simply get the minimum
            euc_distance = (self.all_lats[is_curr_state] - entry['lat']) ** 2 + (self.all_lngs[is_curr_state] - entry['lng']) ** 2
            closest_idx = euc_distance.argmin()
            # myan: once we locate the closest major city, we can then calculate the actual haversine distance
            haversine_distance = haversine((self.all_lats[is_curr_state][closest_idx], self.all_lngs[is_curr_state][closest_idx]),
                                           (entry['lat'], entry['lng']))
            if haversine_distance <= MAX_DISTANCE_KM:
                entry.update(closest_city=same_state_cities[closest_idx])
                entry.update(closest_city_population=self.all_population[is_curr_state][closest_idx])
            else:
                entry.update(closest_city='NULL')
                entry.update(closest_city_population='NULL')
        return tmp_results
Esempio n. 13
0
class Population:
    """
    Enriches addresses / zip codes with Google geo data and the population of
    the closest major city, persisting newly seen rows to Postgres.

    NOTE(review): the previous docstring described the GreatSchools.org API; it
    was copy-pasted from the GreatSchools class and did not apply here.
    """
    def __init__(self, recreate=False):
        """Connect to Postgres, initialize the population table and cache major-city data.

        Args:
            recreate: if True, drop and rebuild the population table.
        """
        datamodel = Datamodel()
        self.table, self.table_config = datamodel.population()
        self.postgres = Postgresql(user_name='postgres',
                                   password='******',
                                   host='localhost',
                                   port='5432',
                                   db='TestProject')
        # Bug fix: forward the recreate flag (previously hard-coded to False).
        self.postgres.initialize_table(self.table,
                                       recreate=recreate,
                                       **self.table_config)
        self.googlegeo = GoogleGeo()

        # myan: only get major cities data once per request
        self.major_cities_postgres = Postgresql(user_name='postgres',
                                                password='******',
                                                host='localhost',
                                                port='5432',
                                                db='TestProject')
        self.major_cities = self.major_cities_postgres.get(
            "select * from TestMajorCities")
        self.all_states = self.major_cities['state'].values
        self.all_cities = self.major_cities['city'].values
        self.all_lats = self.major_cities['lat'].values
        self.all_lngs = self.major_cities['lng'].values
        self.all_population = self.major_cities['population'].values

    def run(self, **kwargs):
        """Geo-code addresses, attach closest-city population, and store unseen rows.

        Keyword args are forwarded to _geo_info (addresses, fields_to_get).

        Returns:
            pandas DataFrame with one row per input address.
        """
        # myan: seems python has a strange way of handling memory pointers when deleting elements from lists in a loop
        # therefore create a separate list tmp_results to hold all the results from API calls first and decide what to
        # include.
        results_df = pd.DataFrame(
            self._closest_city_population(self._geo_info(**kwargs)))
        existing_keys = self.postgres.get(
            "select place_id from {table};".format(table=self.table))
        # Keep only rows whose place_id is not already in the table.
        addition_results = results_df.loc[np.logical_not(
            results_df['place_id'].isin(existing_keys['place_id'].values))]
        addition_results.drop_duplicates(subset='place_id', inplace=True)
        if len(addition_results) > 0:
            self.postgres.put_dataframe(addition_results,
                                        self.table_config['fields_types'],
                                        table=self.table)
        return results_df

    def _geo_info(self,
                  addresses=None,
                  fields_to_get=('place_id', 'state', 'city', 'county', 'lat',
                                 'lng')):
        """
        Get geo info from Google API
        Args:
            addresses: list of addresses (strings, or integer zip codes)
            fields_to_get: fields to request from the Google geo API

        Returns:
            list of dict, [{field1:value1, field2:value2, ...}, {...}]

        Examples:
            p = Population()
            results = p._geo_info(addresses=['Houston,TX', 'Dallas, TX'])
        """

        if fields_to_get is None:
            raise ValueError('Argument fields_to_get must not be None.')
        results = []
        for entry in addresses:
            output = self.googlegeo.get(entry, fields_to_get=fields_to_get)
            output.update(address=str(entry))
            # Integer entries are zip codes; record them explicitly.
            if isinstance(entry, int):
                output.update(zip_code=entry)
            results.append(output)
        return results

    def _closest_city_population(self, tmp_results):
        """Annotate each geo result with the closest major city within MAX_DISTANCE_KM.

        Mutates and returns tmp_results; entries with no major city close
        enough get 'NULL' placeholders.
        """
        for entry in tmp_results:
            is_curr_state = self.all_states == entry['state']
            same_state_cities = self.all_cities[is_curr_state]
            # myan: use a simple squared distance between two points to simply get the minimum
            euc_distance = (self.all_lats[is_curr_state] - entry['lat'])**2 + (
                self.all_lngs[is_curr_state] - entry['lng'])**2
            closest_idx = euc_distance.argmin()
            # myan: once we locate the closest major city, we can then calculate the actual haversine distance
            haversine_distance = haversine(
                (self.all_lats[is_curr_state][closest_idx],
                 self.all_lngs[is_curr_state][closest_idx]),
                (entry['lat'], entry['lng']))
            if haversine_distance <= MAX_DISTANCE_KM:
                entry.update(closest_city=same_state_cities[closest_idx])
                entry.update(closest_city_population=self.
                             all_population[is_curr_state][closest_idx])
            else:
                entry.update(closest_city='NULL')
                entry.update(closest_city_population='NULL')
        return tmp_results
Esempio n. 14
0
class GreatSchools:
    """
    This object connects to the GreatSchools.org API and retrieves information about schools and GS ratings.
    See more information on: http://www.greatschools.org/api/docs/main.page
    """

    def __init__(self, key=None):
        """Store the API key (loading the local one when omitted) and prepare Postgres.

        Args:
            key: GreatSchools API key; when None the locally configured key is used.
        """
        if key is None:
            self.api_key = _get_great_schools_api_key()
        else:
            self.api_key = key
        # myan: initialize postgresql
        datamodel = Datamodel()
        self.table, self.table_config = datamodel.great_schools()
        self.postgres = Postgresql(user_name='postgres',
                                   password='******',
                                   host='localhost',
                                   port='5432',
                                   db='TestProject')
        self.postgres.initialize_table(self.table, recreate=False, **self.table_config)

    def set_api_key(self, key=None):
        """Replace the stored GreatSchools API key."""
        self.api_key = key

    def run(self, **kwargs):
        """Fetch nearby schools, persist the ones not yet stored, and return them.

        Keyword args are forwarded to _nearby_schools (state, zip_code, radius, limit).

        Returns:
            list of dicts for the schools that were not already in the table.
        """
        # myan: seems python has a strange way of handling memory pointers when deleting elements from lists in a loop
        # therefore create a separate list tmp_results to hold all the results from API calls first and decide what to
        # include.
        tmp_results = self._nearby_schools(**kwargs)
        results = []
        existing_keys = self.postgres.get("select gsid from {table};".format(table=self.table))
        for entry in tmp_results:
            if len(existing_keys) < 1 or entry['gsid'] not in existing_keys['gsid'].values:
                results.append(entry)
        self._push(results)
        return results

    def _push(self, data, batch_size=500):
        """Insert data into the table in batches of at most batch_size rows."""
        fields_list = list(self.table_config['fields_types'].keys())
        fields_to_push = self.postgres.construct_db_field_string(fields_list)
        start_idx = 0
        while start_idx < len(data):
            end_idx = min(len(data), start_idx + batch_size)
            values_to_insert = self.postgres.parse_values_list(data[start_idx:end_idx],
                                                               self.table_config['fields_types'],
                                                               fields_list)
            start_idx = end_idx
            self.postgres.put(self.table, fields=fields_to_push, values=values_to_insert)

    def _nearby_schools(self, state=None, zip_code=None, radius=5, limit=10):
        """
        Gets a list of schools for a specified physical location (i.e. state + zip_code), within a certain radius
        Args:
            state:
            zip_code:
            radius:
            limit:

        Returns:
            list, [dict(gsId=int, name=string, gsRating=float), dict(...), ...]

        Examples:
            gs = GreatSchools(key='Your GS Key')
            results = gs._nearby_schools(state='TX', zip_code=75228, limit=2)
            # [{'gsId': '1769', 'gsRating': '3', 'name': 'Bryan Adams High School'}, {'gsId': '7566', 'name': 'White Rock Montessori School'}]
        """
        self._check_key()
        url = "http://api.greatschools.org/schools/nearby?key={key}&state={state}&radius={radius}&zip={zip_code}&limit={limit}".format(
            key=self.api_key,
            state=state,
            zip_code=zip_code,
            radius=radius,
            limit=limit)

        results = self._run(url,
                            key_string='school',
                            result_fields=[(int, 'gsId'), (None, 'name'), (float, 'gsRating')],
                            zip_code=zip_code,
                            state=state)
        return results

    def _run(self, url, key_string="school", result_fields=None, zip_code=None, state=None):
        """
        Generic method to extract data from API calls
        Args:
            url: string, the API call url to retrieve data
            key_string: string, the parent field in the XML file
            result_fields: list, [(func, field), ...] where func can be int, float etc.

        Returns:
            list, [dict(field_1=value_1, field_2=value2, ...), dict(...)]
        """
        nearby = requests.get(url)
        results = []
        for school in ElementTree.fromstring(nearby.content).findall(key_string):
            curr_result = dict(zip_code=zip_code, state=state)
            try:
                for (func, field) in result_fields:
                    if func is None:
                        curr_result[field.lower()] = school.find(field).text
                    else:
                        curr_result[field.lower()] = func(school.find(field).text)
            except (AttributeError, TypeError, ValueError):
                # Bug fix: this was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt. A missing tag makes find()
                # return None (AttributeError/TypeError on .text/conversion)
                # and malformed numbers raise ValueError; keep the partial
                # record, as before.
                pass
            if curr_result:
                results.append(curr_result)
        return results

    def _check_key(self):
        """Raise ValueError when no API key has been configured."""
        if self.api_key is None:
            raise ValueError("Use .set_api_key() method to set Great School API Keys first.")
Esempio n. 15
0
class Plotter:
    def __init__(self,
                 min_price=150000,
                 max_price=300000,
                 top_percentage=0.25,
                 top_max_num_entries=30):
        """Configure price filters and open the Postgres handles used for plotting.

        Args:
            min_price: lower bound on median price for candidate zip codes.
            max_price: upper bound on median price for candidate zip codes.
            top_percentage: fraction of candidates treated as "top" zip codes.
            top_max_num_entries: cap on the number of top zip codes.
        """
        self.MIN_PRICE = min_price
        self.MAX_PRICE = max_price
        self.TOP_PERCENTAGE = top_percentage
        self.TOP_MAX_NUM_ENTRIES = top_max_num_entries

        datamodel = Datamodel()
        self.time_series_postgres = self._initialize_postgres(
            datamodel.zipcode_timeseries())
        self.population_postgres = self._initialize_postgres(
            datamodel.population())
        self.great_schools_postgres = self._initialize_postgres(
            datamodel.great_schools())

        # Cached query results start empty; delegate to reset() so the
        # attribute list is not duplicated here (it was copy-pasted before).
        self.reset()

    def reset(self):
        """Clear every cached query result so the next plot re-queries Postgres."""
        cached_attributes = ('zipcode_timeseries',
                             'top_zipcodes_timeseries',
                             'rest_zipcodes_timeseries',
                             'top_zipcodes_school_data',
                             'top_zipcodes_population_data',
                             'all_months_timeseries')
        for attribute in cached_attributes:
            setattr(self, attribute, None)

    def _get_time_series_data(self, year_month=201510):
        """Query one month of zip-code time series within the price band and split top/rest.

        Args:
            year_month: YYYYMM integer identifying the month to query.
        """
        suitable_states = self.time_series_postgres.get(
            "select * from {table} where year_month={year_month} and median_price < {max_price} and median_price > {min_price}"
            .format(table=self.time_series_postgres.table,
                    year_month=year_month,
                    max_price=self.MAX_PRICE,
                    min_price=self.MIN_PRICE))

        # Gross yield: annual rent as a percentage of the purchase price.
        suitable_states[
            'gross_yield_pct'] = suitable_states.median_rent * 12 / suitable_states.median_price * 100
        # Bug fix: DataFrame.sort() was removed in pandas 0.20; sort_values is
        # the supported equivalent.
        suitable_states = suitable_states.sort_values('gross_yield_pct',
                                                      ascending=False)
        top_index = min(int(len(suitable_states) * self.TOP_PERCENTAGE),
                        self.TOP_MAX_NUM_ENTRIES)

        self.zipcode_timeseries = suitable_states
        self.top_zipcodes_timeseries = suitable_states[:top_index]
        self.rest_zipcodes_timeseries = suitable_states[top_index:]

    def _get_population_data(self):
        """Fetch population info for the top zip codes, loading the time series first if needed."""
        if self.top_zipcodes_timeseries is None:
            self._get_time_series_data()
        population_source = Population(recreate=False)
        top_zip_codes = self.top_zipcodes_timeseries.zip_code.values
        self.top_zipcodes_population_data = population_source.run(addresses=top_zip_codes)

    def _get_top_zipcodes_school_data(self):
        """Collect GreatSchools ratings for the top zip codes and average them per zip.

        Builds one request per (state, zip_code) pair, dispatches them through
        MissionControl, then stores the mean gsrating per zip code.
        """
        all_states = self.top_zipcodes_timeseries.state.values
        all_zipcodes = self.top_zipcodes_timeseries.zip_code.values
        unique_states = self.top_zipcodes_timeseries.state.unique()

        # Renamed from `requests` to avoid shadowing the requests HTTP library.
        school_requests = []
        for state in unique_states:
            unique_zipcodes = np.unique(all_zipcodes[all_states == state])
            for zip_code in unique_zipcodes:
                school_requests.append(
                    dict(db_configs=dict(
                        postgres=self.great_schools_postgres,
                        query=
                        "select * from {table} where state='{state}' and zip_code={zip_code};"
                        .format(table=self.great_schools_postgres.table,
                                state=state,
                                zip_code=zip_code)),
                         api_configs=dict(api=GreatSchools,
                                          api_key=None,
                                          api_args=dict(state=state,
                                                        zip_code=zip_code,
                                                        limit=20))))
        mission_control = MissionControl()
        gs_data = mission_control.request_data(user_requests=school_requests)
        gs_data_df = pd.DataFrame()
        # NOTE(review): DataFrame.append is removed in pandas 2.x; migrating to
        # pd.concat needs knowledge of the entry type returned by
        # MissionControl — confirm before changing.
        for entry in gs_data:
            gs_data_df = gs_data_df.append(entry)
        self.top_zipcodes_school_data = pd.DataFrame(
            gs_data_df.groupby('zip_code')['gsrating'].mean())
        self.top_zipcodes_school_data[
            'zip_code'] = self.top_zipcodes_school_data.index

    def _initialize_postgres(self, table_config):
        """Connect to Postgres and initialize the table described by table_config.

        Args:
            table_config: (table_name, config_dict) pair, e.g. the return value
                of the Datamodel factory methods.

        Returns:
            The connected Postgresql handler.
        """
        # Python 3 removed tuple parameter unpacking (PEP 3113); unpack manually.
        # Callers still pass a single (table, config) tuple.
        table, config = table_config
        postgres = Postgresql(user_name='postgres',
                              password='******',
                              host='localhost',
                              port='5432',
                              db='TestProject')
        postgres.initialize_table(table, recreate=False, **config)
        return postgres