def geocode(self):
    """Update latitude, longitude, rating and ZIP in Locations table."""
    print('\nGeocoding...')
    for location_row in self.get_rows_with_null_rating():
        full_address = "{0} {1}, New Orleans, LA".format(
            location_row.street_number, location_row.address)
        geocode_results = self.gmaps.geocode(full_address)
        if not geocode_results:
            log.info('No geocoding results for: {}'.format(full_address))
            # TODO: Need to also note failure so future geocoding scripts
            # don't keep trying and failing on the same addresses.
            # Possibly update Location's `rating` and/or Cleaned's
            # `location_publish` fields.
            continue
        details = self.process_google_results(geocode_results)
        try:
            with SESSION.begin_nested():
                stmt = update(Location).values(details).where(
                    Location.document_id == location_row.document_id)
                SESSION.execute(stmt)
                SESSION.flush()
        except Exception as error:  # TODO: Handle specific errors.
            log.exception(error, exc_info=True)
            SESSION.rollback()
        SESSION.commit()
def login(self):
    """Load homepage, find login, enter credentials."""
    self.load_homepage()
    self.find_login_link()
    # Pause between form interactions so the page can settle before
    # the next element lookup.
    for message, pause, action in (
            ('Sleep 1.0 second', 1.0, self.enter_username),
            ('Sleep 1.0 second', 1.0, self.enter_password)):
        log.info(message)
        time.sleep(pause)
        action()
    log.info('Sleep 5.0 seconds')
    time.sleep(5.0)
    try:
        # The logout link only renders for an authenticated session,
        # so finding it confirms the login succeeded.
        self.driver.find_element_by_id("Header1_lnkLogout")
        log.info("Login successful")
    except Exception as error:
        log.info("Login failed")
        log.exception(error)
        raise
def parse_sale(self, j, rows, year, month, day):
    """Parse single sale page and save HTML.

    :param j: Index into `rows` for the sale to fetch.
    :param rows: Parsed elements whose `.string` holds the document ID.
    :param year: Year string used in the output directory path.
    :param month: Month string used in the output directory path.
    :param day: Day string used in the output directory path.
    """
    document_id = rows[j].string
    url = ('http://onlinerecords.orleanscivilclerk.com/RealEstate/' +
           'SearchResults.aspx?global_id={}&type=dtl').format(document_id)
    try:
        log.info('Load sale URL {}'.format(url))
        self.driver.get(url)
    except Exception:  # TODO
        # NOTE(review): execution continues after a failed page load, so
        # `page_source` below may capture the previous page — confirm this
        # best-effort behavior is intended.
        log.exception('Error loading sale URL {}'.format(url))
    html = self.driver.page_source
    html_out = "{0}/data/raw/{1}-{2}-{3}/form-html/{4}.html".format(
        PROJECT_DIR, year, month, day, document_id)
    log.info('Save {}'.format(html_out))
    with open(html_out, "wb") as f_out:
        f_out.write(html.encode('utf-8'))
    try:
        assert not self.is_error_page(html_out)  # TODO: Read from memory
    except Exception:  # TODO
        # Error pages are deleted so later stages only see real sale HTML.
        log.exception('Received error page')
        log.info('Deleting error page {}'.format(html_out))
        os.remove(html_out)
def other_stuff_location_info(rows):
    """Run checks for location_info."""
    # District ordinals are reduced to their digit ("1st" -> "1", etc.).
    ordinal_map = (
        ('1st', '1'), ('2nd', '2'), ('3rd', '3'), ('4th', '4'),
        ('5th', '5'), ('6th', '6'), ('7th', '7'))
    for row in rows:
        text = row['location_info']
        for ordinal, digit in ordinal_map:
            text = text.replace(ordinal, digit)
        locations = []
        # Semi-colons separate distinct locations; commas separate the
        # fields of one location.
        for segment in text.split(';'):
            kept_tokens = []
            for token in segment.split(','):
                try:
                    # Tokens ending in ':' are bare labels with no value
                    # and are dropped.
                    if token.strip()[-1] != ':':
                        kept_tokens.append(token.strip())
                except Exception as error:
                    # Empty tokens raise IndexError on the [-1] lookup.
                    log.exception(error, exc_info=True)
                    continue
            joined = ', '.join(kept_tokens)
            if joined != '':
                locations.append(joined.strip())
        row['location_info'] = '; '.join(locations)
    return rows
def check_geocoder_good_rating(self):
    """Check if PostGIS Geocoder rating scored 3 or lower: good."""
    # ROOFTOP and RANGE_INTERPOLATED ratings are precise enough
    # to publish.
    good_rating = (
        (Location.rating == 'RANGE_INTERPOLATED') |
        (Location.rating == 'ROOFTOP'))
    SESSION.query(
        Location.rating,
        Location.location_publish
    ).filter(good_rating).update({"location_publish": True})
    try:
        with SESSION.begin_nested():
            SESSION.flush()
    except Exception as error:
        log.exception(error, exc_info=True)
        SESSION.rollback()
    SESSION.commit()
def check_geocoder_bad_rating(self):
    """Check if PostGIS Geocoder rating scored higher than 3: bad."""
    # GEOMETRIC_CENTER, APPROXIMATE or missing ratings are too
    # imprecise to publish.
    bad_rating = (
        (Location.rating == 'GEOMETRIC_CENTER') |
        (Location.rating == 'APPROXIMATE') |
        (Location.rating.is_(None)))
    SESSION.query(
        Location.rating,
        Location.location_publish
    ).filter(bad_rating).update({"location_publish": False})
    try:
        with SESSION.begin_nested():
            SESSION.flush()
    except Exception as error:
        log.exception(error, exc_info=True)
        SESSION.rollback()
    SESSION.commit()
def __init__(self):
    """Create database.

    Connects to the configured database; if the connection fails because
    the database does not exist, creates it. Then creates tables, imports
    neighborhoods, builds the spatial index and closes the connection.
    """
    try:
        self._database_connection()
    except OperationalError as error:
        print(error)
        log.exception(error, exc_info=True)
        # NOTE(review): matching the driver's exact error text is fragile —
        # the message format may change across psycopg2/SQLAlchemy
        # versions; confirm before upgrading either dependency.
        db_error = ('(psycopg2.OperationalError) FATAL: database "{}" ' +
                    'does not exist').format(DATABASE_NAME)
        if str(error).strip() == db_error:
            self.create_db()
    self._create_tables()
    self._import_neighorhoods()
    self._spatial_index_on_cleaned_geom()
    self.conn.close()
def other_stuff_addresses(rows):
    """Run checks for addresses."""
    for row in rows:
        merged_addresses = []
        # Semi-colons separate distinct addresses; commas separate the
        # parts of one address.
        for segment in row['address'].split(';'):
            pieces = [part.strip() for part in segment.split(',')]
            # Leading empty tokens are dropped; interior empties are
            # preserved in the join.
            while pieces and pieces[0] == '':
                pieces.pop(0)
            individual = ', '.join(pieces)
            if individual != '':
                merged_addresses.append(individual.strip())
        row['address'] = '; '.join(merged_addresses)
    return rows
def check_north_of_new_orleans(self): """Check if geocoded coords are within north border of New Orleans.""" # Lat less than 29.864543 is north of New Orleans: SESSION.query( Location.latitude, Location.location_publish ).filter( Location.latitude > 30.181719 ).update({ "location_publish": False }) try: with SESSION.begin_nested(): SESSION.flush() except Exception as error: log.exception(error, exc_info=True) SESSION.rollback() SESSION.commit()
def check_west_of_new_orleans(self):
    """Check if geocoded coords are within west border of New Orleans."""
    # Longitude less than -90.140388 lies west of New Orleans.
    too_far_west = Location.longitude < -90.140388
    SESSION.query(
        Location.longitude,
        Location.location_publish
    ).filter(too_far_west).update({"location_publish": False})
    try:
        with SESSION.begin_nested():
            SESSION.flush()
    except Exception as error:
        log.exception(error, exc_info=True)
        SESSION.rollback()
    SESSION.commit()
def check_high_amount(self):
    """Check if sale amount is unreasonably high (>= $20,000,000)."""
    # Sales over $20 million are possible but rare enough to hold back.
    suspiciously_high = Detail.amount >= 20000000
    SESSION.query(
        Detail.amount,
        Detail.detail_publish
    ).filter(suspiciously_high).update({"detail_publish": False})
    try:
        with SESSION.begin_nested():
            SESSION.flush()
    except Exception as error:
        log.exception(error, exc_info=True)
        SESSION.rollback()
    SESSION.commit()
def check_low_amount(self):
    """Check if sale amount is unreasonably low (<= $0)."""
    # Zero/negative amounts are questionable, so hold them all back.
    suspiciously_low = Detail.amount <= 0
    SESSION.query(
        Detail.amount,
        Detail.detail_publish
    ).filter(suspiciously_low).update({"detail_publish": False})
    try:
        with SESSION.begin_nested():
            SESSION.flush()
    except Exception as error:
        log.exception(error, exc_info=True)
        SESSION.rollback()
    SESSION.commit()
def commit_rows(self, rows):
    """Commit JOIN-ed rows to the cleaned table."""
    log.debug('Committing %d rows', len(rows))
    for count, cleaned_row in enumerate(rows):
        log.debug("Row %d", count)
        try:
            # A savepoint isolates each row so one bad insert does not
            # poison the rest of the batch.
            with SESSION.begin_nested():
                SESSION.execute(insert(Cleaned).values(cleaned_row))
                SESSION.flush()
        except Exception as error:
            log.debug('count: %s', count)
            log.exception(error, exc_info=True)
            SESSION.rollback()
        SESSION.commit()
    log.debug('%d rows committed', len(rows))
def main(self):
    """The main scrape method.

    Logs in, cycles through the configured date range, then always logs
    out and shuts down the browser.
    """
    try:
        self.login()
    except Exception:  # TODO
        log.exception("Problem during login")
        # A login failure usually means the account password expired;
        # alert the team before giving up.
        slack.chat.post_message("#realestate", "Expired password @channel")
        self.driver.close()
        self.driver.quit()
        raise
    try:
        self.cycle_through_dates()
    except Exception:  # TODO
        log.exception("")
    finally:
        # Always release the browser session, even if scraping failed.
        self.logout()
        self.driver.close()
        self.driver.quit()
def __init__(self):
    """Create database.

    Connects to the configured database, creating it first if the
    connection fails because it does not exist, then creates tables,
    imports neighborhoods and builds the spatial index.
    """
    try:
        self._database_connection()
    except OperationalError as error:
        print(error)
        log.exception(error, exc_info=True)
        # NOTE(review): comparing against the driver's exact error text is
        # brittle — the message format can change between psycopg2 /
        # SQLAlchemy releases; verify on dependency upgrades.
        db_error = (
            '(psycopg2.OperationalError) FATAL: database "{}" ' +
            'does not exist').format(DATABASE_NAME)
        if str(error).strip() == db_error:
            self.create_db()
    self._create_tables()
    self._import_neighorhoods()
    self._spatial_index_on_cleaned_geom()
    self.conn.close()
def check_if_no_date(self):
    """Check if sale has a date.

    Sets `detail_publish` to False for any Detail row whose
    `document_date` or `document_recorded` is NULL.
    """
    SESSION.query(
        Detail.document_date,
        Detail.document_recorded,
        Detail.detail_publish
    ).filter(
        # Bug fix: `Detail.document_date is None` was a Python identity
        # test on the column object (always False), so the filter never
        # matched any rows. SQLAlchemy needs `.is_(None)` to emit an
        # `IS NULL` comparison (same idiom as check_geocoder_bad_rating).
        (Detail.document_date.is_(None)) |
        (Detail.document_recorded.is_(None))
    ).update(
        {"detail_publish": False}
    )
    try:
        with SESSION.begin_nested():
            SESSION.flush()
    except Exception as error:
        log.exception(error, exc_info=True)
        SESSION.rollback()
    SESSION.commit()
def make_all_locations_publishable(self):
    """
    Assume all sales are publishable. Set location_publish = 1.

    Then set to 0 if questionable data is found.
    """
    # Start optimistic: publish everything; later checks flip
    # questionable rows back to False.
    SESSION.query(Location.location_publish).update(
        {"location_publish": True})
    try:
        with SESSION.begin_nested():
            SESSION.flush()
    except Exception as error:
        log.exception(error, exc_info=True)
        SESSION.rollback()
    SESSION.commit()
def other_stuff_addresses(rows):
    """Run checks for addresses."""
    for row in rows:
        collected = []
        # Semi-colons delimit whole addresses; commas delimit the
        # components within one address.
        for chunk in row['address'].split(';'):
            tokens = [t.strip() for t in chunk.split(',')]
            # Drop leading empty tokens; interior empties stay in the
            # join so spacing matches the accumulated-string behavior.
            while tokens and not tokens[0]:
                del tokens[0]
            rebuilt = ', '.join(tokens)
            if rebuilt != '':
                collected.append(rebuilt.strip())
        row['address'] = '; '.join(collected)
    return rows
def check_relative_date(self): """Check if sale date is >6 months prior to the recorded date.""" # Convert date strings to date format new_initial_date = datetime.strptime( self.initial_date, '%Y-%m-%d').date() new_until_date = datetime.strptime( self.until_date, '%Y-%m-%d').date() current_date = new_initial_date # Evaluate "30 days ago" based on that particular day while current_date != new_until_date: # Update date range old_date = current_date - timedelta(days=180) previous_date = current_date - timedelta(days=1) # Copy datetime objects to date strings old_date_string = old_date.strftime('%Y-%m-%d') previous_date_string = previous_date.strftime('%Y-%m-%d') current_date_string = current_date.strftime('%Y-%m-%d') # For sales recorded on a given day, check if the document # date is unbelievable (too old or in the future) try: with SESSION.begin_nested(): SESSION.query( Detail.document_recorded, Detail.document_date, Detail.detail_publish ).filter( Detail.document_recorded == current_date_string ).filter( Detail.document_date < old_date_string ).update({"detail_publish": False}) SESSION.flush() except Exception as error: log.exception(error, exc_info=True) SESSION.rollback() try: with SESSION.begin_nested(): SESSION.query( Detail.document_recorded, Detail.document_date, Detail.detail_publish ).filter( Detail.document_recorded == current_date_string ).filter( Detail.document_date > previous_date_string ).update({ "detail_publish": False }) SESSION.flush() except Exception as error: log.exception(error, exc_info=True) SESSION.rollback() SESSION.commit() current_date = current_date + timedelta(days=1)