Example #1
0
    def process_google_results(self, result):
        """
        Get values from the geocoding results.

        https://developers.google.com/maps/documentation/geocoding/
            intro#GeocodingResponses

        :param result: Results from Google Geocoding API ("results" list only).
        :type result: list
        :returns: This location's rating, latitude, longitude and ZIP code.
        :rtype: dict
        """
        # TODO: Handle more than one returned location in result.
        #   Could compare accuracies and use that to decide which to store.
        loc = result[0]

        values = {
            'latitude': loc['geometry']['location']['lat'],
            'longitude': loc['geometry']['location']['lng'],
            'rating': loc['geometry']['location_type']}

        # Look the ZIP code up by component type. The previous hard-coded
        # index (address_components[7]) broke whenever Google returned a
        # different number or order of components, which the API does not
        # guarantee.
        for component in loc.get('address_components', []):
            if 'postal_code' in component.get('types', []):
                values['zip_code'] = component['short_name']
                break
        else:
            log.info("No zip code.")
            values['zip_code'] = "None"  # TODO: Leave blank instead?

        return values
Example #2
0
    def login(self):
        """Open the homepage, navigate to login and submit credentials."""
        self.load_homepage()

        self.find_login_link()

        log.info('Sleep 1.0 second')
        time.sleep(1.0)

        self.enter_username()

        log.info('Sleep 1.0 second')
        time.sleep(1.0)

        self.enter_password()

        log.info('Sleep 5.0 seconds')
        time.sleep(5.0)

        # The logout link only exists once we are authenticated, so its
        # presence confirms the login worked.
        try:
            self.driver.find_element_by_id("Header1_lnkLogout")
        except Exception as error:
            log.info("Login failed")
            log.exception(error)
            raise
        else:
            log.info("Login successful")
Example #3
0
    def geocode(self):
        """
        Update latitude, longitude, rating and ZIP in Locations table.

        Geocodes every row that still has a NULL rating and writes the
        results back to the Location row matching its document_id.
        """
        print('\nGeocoding...')

        null_rating_rows = self.get_rows_with_null_rating()

        for row in null_rating_rows:
            full_address = "{0} {1}, New Orleans, LA".format(
                row.street_number, row.address)

            result = self.gmaps.geocode(full_address)

            if not result:  # Empty list: Google found nothing.
                log.info('No geocoding results for: {}'.format(full_address))

                # TODO: Need to also note failure so future geocoding scripts
                #   don't keep trying and failing on the same addresses.
                #   Possibly update Location's `rating` and/or Cleaned's
                #   `location_publish` fields.
                continue

            details = self.process_google_results(result)

            try:
                with SESSION.begin_nested():
                    u = update(Location)
                    u = u.values(details)
                    u = u.where(Location.document_id == row.document_id)
                    SESSION.execute(u)
                    SESSION.flush()
            except Exception as error:  # TODO: Handle specific errors.
                log.exception(error, exc_info=True)
                SESSION.rollback()
            else:
                # Commit only when the nested update succeeded. Previously
                # the commit ran unconditionally, even right after a
                # rollback.
                SESSION.commit()
Example #4
0
def cli(arguments):
    """
    Parse command-line arguments and delete records for the date range.

    :param arguments: docopt-style dict of parsed command-line arguments.
    :raises BadDateRangeError: If the start date is after the end date.
    """
    # Catch any missed errors.
    if cli_has_errors(arguments):
        return

    if arguments['<single_date>']:  # Single date
        early_date = arguments['<single_date>']
        late_date = arguments['<single_date>']

        log.info('Initializing single date: {}.'.format(early_date))
    elif arguments['<early_date>'] and arguments['<late_date>']:  # Date range
        early_date = arguments['<early_date>']
        late_date = arguments['<late_date>']

        log.info('Initializing date range: {0} to {1}.'.format(
            early_date, late_date))
    else:
        # Previously this fell through with `early_date`/`late_date`
        # unbound, raising NameError below. Bail out explicitly instead.
        log.info('No dates given; nothing to delete.')
        return

    # Check for errors
    early_datetime = datetime.strptime(early_date, "%Y-%m-%d")
    late_datetime = datetime.strptime(late_date, "%Y-%m-%d")

    if early_datetime > late_datetime:
        raise BadDateRangeError("The date range does not make sense.")

    DeleteDates(initial_date=early_date, until_date=late_date).main()
Example #6
0
def home():
    """Handle a GET request for the homepage (/) and return its view."""
    data = Models().get_home()
    log.info(data)

    rendered = Views().get_home(data)
    log.info(rendered)

    return rendered
Example #7
0
    def rebuild_days(early_date, late_date):
        """Scrapes and initializes dates.

        Placeholder: currently only prints the range and logs a marker.

        :param early_date: Start date string for the rebuild.
        :param late_date: End date string for the rebuild.
        """
        # NOTE(review): defined at method indentation but takes no `self` —
        # presumably a nested helper function; confirm against the full file.

        print(early_date, late_date)

        # Build those newly scraped records.
        # This will set perm_flag = True in
        # checkPermanentStatusOfNewSales().
        log.info('doitall')
Example #8
0
    def delete_permanent_date_range_file():
        """Remove stale most-recent-permanent-date-range/*.html files."""
        # Clear out the previous snapshot before writing a new one.
        log.info('Delete old most-recent-permanent-date-range/*.html file')

        pattern = "{}/data/most-recent-permanent-date-range/*.html".format(
            PROJECT_DIR)

        for stale_path in glob.glob(pattern):
            os.remove(stale_path)
Example #9
0
    def click_advanced_tab(self):
        """Switch the search form to its advanced tab."""
        element_id = 'x:2130005445.2:mkr:ti1'
        log.info('Find advanced tab at HTML ID {}'.format(element_id))

        tab = self.driver.find_element_by_id(element_id)

        log.info('Click on advanced tab')
        tab.click()
Example #10
0
    def click_search_button(self):
        """Submit the search form via its search button."""
        element_id = 'cphNoMargin_SearchButtons2_btnSearch__1'
        log.info('Find search button at HTML ID {}'.format(element_id))

        button = self.driver.find_element_by_id(element_id)

        log.info('Click search button')
        button.click()
Example #11
0
    def enter_username(self):
        """Fill in the username field from the environment."""
        element_id = 'Header1_txtLogonName'
        log.info('Find username field at HTML ID {}'.format(element_id))

        field = self.driver.find_element_by_id(element_id)

        log.info('Enter username from environment variable')
        field.send_keys(os.environ.get('REAL_ESTATE_LRD_USERNAME'))
Example #12
0
    def find_login_link(self):
        """Locate the login link and click it."""
        element_id = 'Header1_lnkLogin'
        log.info('Find login link at HTML ID {}'.format(element_id))

        link = self.driver.find_element_by_id(element_id)

        log.info('Click login link')
        link.click()
Example #13
0
    def delete_permanent_date_range_when_scraped_file(year, month, day):
        """Remove stale permanent-date-range-when-scraped*.html files."""
        log.info('Delete old permanent-date-range-when-scraped*.html')

        # Implicit literal concatenation yields the same glob pattern as
        # the original's explicit `+`.
        pattern = (
            "{0}/data/raw/{1}-{2}-{3}/"
            "permanent-date-range-when-scraped*.html").format(
                PROJECT_DIR, year, month, day)

        for stale_path in glob.glob(pattern):
            os.remove(stale_path)
Example #14
0
def page_not_found(error):
    """
    Render the error page for a failed request.

    :param error: The error passed in by the web framework.
    :returns: The error-page view.
    """
    log.info(error)

    return Views().get_error_page()
Example #15
0
 def save_permanent_date_range_when_scraped_file(year, month, day,
                                                 date_range_html,
                                                 first_date, second_date):
     """Save new permanent-date-range-when-scraped*.html.

     :param year: Four-digit year string used in the raw-data path.
     :param month: Two-digit month string.
     :param day: Two-digit day string.
     :param date_range_html: Page HTML (unicode) to persist.
     :param first_date: Start of the permanent range (used in filename).
     :param second_date: End of the permanent range (used in filename).
     """
     # Save permanent date range for this individual sale.
     log.info('Save new permanent-date-range-when-scraped*.html file')

     html_path = (
         "{0}/data/raw/{1}-{2}-{3}/" +
         "permanent-date-range-when-scraped_{4}-{5}.html").format(
             PROJECT_DIR, year, month, day, first_date, second_date)

     # `with` guarantees the file is closed even if the write raises;
     # the original bare open() leaked the handle on error.
     with open(html_path, "wb") as individual_html_out:
         individual_html_out.write(date_range_html.encode('utf-8'))
Example #16
0
    def find_permanent_date_range(self):
        """Parse the search page for the permanent index date range."""
        html_id = 'cphNoMargin_lblSearchSummary'
        log.info('Find permanent date range at HTML ID {}'.format(html_id))

        summary = self.driver.find_element_by_id(html_id)

        match = re.match(r"Permanent Index From ([0-9/]*) to ([0-9/]*)",
                         summary.text)

        # Dates arrive like 02/18/2014; return them with slashes stripped.
        start, end = (d.replace('/', '') for d in match.groups())

        return start, end
Example #17
0
    def save_permanent_date_range_file(date_range_html, first_date,
                                       second_date):
        """Save new most-recent-permanent-date-range/*.html.

        :param date_range_html: Page HTML (unicode) to persist.
        :param first_date: Start of the permanent range (used in filename).
        :param second_date: End of the permanent range (used in filename).
        """
        log.info('Save new most-recent-permanent-date-range/*.html file')

        fn = "{0}/data/most-recent-permanent-date-range/{1}-{2}.html".format(
            PROJECT_DIR, first_date, second_date)

        if not os.path.exists(os.path.dirname(fn)):
            os.makedirs(os.path.dirname(fn))

        # `with` guarantees the file is closed even if the write raises;
        # the original bare open() leaked the handle on error.
        with open(fn, "wb") as overall_html_out:
            overall_html_out.write(date_range_html.encode('utf-8'))
Example #18
0
    def parse_results(self, year, month, day):
        """Parse initial result page for total number of sales.

        Reads the results pager dropdown to learn how many pages exist,
        then parses each page in turn. If the dropdown is missing
        (no sales that day), saves the page HTML for inspection and
        returns without parsing.

        :param year: Four-digit year string, e.g. "2014".
        :param month: Two-digit month string, e.g. "09".
        :param day: Two-digit day string, e.g. "09".
        """
        html_id = 'cphNoMargin_cphNoMargin_OptionsBar1_ItemList'

        try:
            log.info('Find results list at HTML ID {}'.format(html_id))
            item_list_elem = self.driver.find_element_by_id(html_id)

            # log.info('Find option')
            options = item_list_elem.find_elements_by_tag_name("option")
        except Exception as error:
            # Pager element absent — treated as "no sales for this date".
            log.info('No sales for this day')
            log.error(error, exc_info=True)

            # Keep the raw page so the empty result can be verified later.
            html_out = '{}/data/raw/{}-{}-{}/page-html/page1.html'.format(
                PROJECT_DIR, year, month, day)

            with open(html_out, 'wb') as f_out:
                f_out.write((self.driver.page_source).encode('utf-8'))

            return

        # The last <option> in the pager carries the highest page number.
        total_pages = int(options[-1].get_attribute('value'))
        log.info('{0} pages of records for {1}-{2}-{3}'.format(
            total_pages, year, month, day))

        # Parse pages 1..total_pages, pausing between requests to be
        # gentle on the remote site.
        for i in range(1, total_pages + 1):
            self.parse_page(i, year, month, day)

            log.info('Sleep 5.0 seconds')
            time.sleep(5.0)
Example #19
0
    def search_parameters(self, search_date):
        """Enter search parameters and submit the search.

        Fills the advanced search form for a single day and clicks search.
        The sleeps give the site's JavaScript time to react between steps.

        :param search_date: Date string typed into both date fields
            (MMDDYYYY, as built by cycle_through_dates).
        """
        self.click_advanced_tab()
        time.sleep(2.0)

        # Same date in both fields: the search covers exactly one day.
        self.enter_date_filed_from(search_date)
        self.enter_date_filed_to(search_date)

        self.select_document_type()
        time.sleep(1.0)

        self.click_search_button()

        log.info('Sleep 5.0 seconds')
        time.sleep(5.0)
Example #20
0
    def scrape_days(early_date, late_date):
        """Scrape sale records for each day in the given range.

        :param early_date: Start date string, 'YYYY-MM-DD'.
        :param late_date: End date string, 'YYYY-MM-DD'.
        """
        early_datetime = datetime.strptime(early_date, '%Y-%m-%d')
        log.debug(early_datetime)
        late_datetime = datetime.strptime(late_date, '%Y-%m-%d')
        # Fixed copy-paste bug: this previously logged early_datetime twice.
        log.debug(late_datetime)

        # Scrape those days over again
        log.info('scrape')
        try:
            Scrape(initial_date=early_datetime,
                   until_date=late_datetime).main()
        except Exception as error:
            log.error(error, exc_info=True)
Example #21
0
    def select_document_type(self):
        """Tick the SALE document type in the search form."""
        html_id = 'cphNoMargin_f_dclDocType_297'  # SALE

        # TODO: Assert text is SALE
        log.info('Find document type SALE at HTML ID {}'.format(html_id))
        checkbox = self.driver.find_element_by_id(html_id)

        short_type = checkbox.get_attribute('value')

        # The human-readable label lives on the checkbox's parent element.
        label = checkbox.find_element_by_xpath(
            '..').find_element_by_tag_name('label')
        long_type = label.text

        log.info('Document type is {} ({})'.format(long_type, short_type))
        checkbox.click()
Example #22
0
    def get_home(self):
        """
        Assemble the data needed by the homepage (/realestate/).

        :returns: dict holding the app's last-updated date and the list of
            neighborhoods for the dropdown.
        """
        update_date = self.get_last_updated_date()
        log.info(update_date)

        neighborhoods = self.get_neighborhoods()

        data = {
            'update_date': update_date,
            'neighborhoods': neighborhoods,
        }
        log.info(data)

        return data
Example #23
0
    def scrape_days(early_date, late_date):
        """Scrape sale records for each day in the given range.

        :param early_date: Start date string, 'YYYY-MM-DD'.
        :param late_date: End date string, 'YYYY-MM-DD'.
        """
        early_datetime = datetime.strptime(early_date, '%Y-%m-%d')
        log.debug(early_datetime)
        late_datetime = datetime.strptime(late_date, '%Y-%m-%d')
        # Fixed copy-paste bug: this previously logged early_datetime twice.
        log.debug(late_datetime)

        # Scrape those days over again
        log.info('scrape')
        try:
            Scrape(
                initial_date=early_datetime,
                until_date=late_datetime
            ).main()
        except Exception as error:
            log.error(error, exc_info=True)
Example #24
0
    def get_last_updated_date(self):
        """Return the latest published document_recorded date.

        The date is run through ymd_to_full_date(no_day=True) for display;
        '' is returned when no published rows exist.
        """
        query = SESSION.query(Cleaned).filter(
            Cleaned.detail_publish.is_(True)).order_by(
                desc(Cleaned.document_recorded)).limit(1).all()

        log.info(query)

        updated_date = ''

        # limit(1) means the list holds at most a single row.
        if query:
            recorded = query[0].document_recorded
            updated_date = ymd_to_full_date(
                recorded.strftime('%Y-%m-%d'), no_day=True)

        log.info(updated_date)

        SESSION.close()

        return updated_date
Example #25
0
    def parse_page(self, i, year, month, day):
        """Parse results page for sale document IDs.

        Saves the page's HTML, extracts the document-ID table cells, parses
        each sale, then returns to the results list and clicks through to
        the next page.

        :param i: 1-based page number within this day's results.
        :param year: Four-digit year string.
        :param month: Two-digit month string.
        :param day: Two-digit day string.
        """
        # Save table page
        log.info('Parse page {0} for {1}-{2}-{3}'.format(
            i,
            year,
            month,
            day,
        ))

        html_out = '{}/data/raw/{}-{}-{}/page-html/page{}.html'.format(
            PROJECT_DIR, year, month, day, i)

        with open(html_out, 'wb') as f_out:
            f_out.write((self.driver.page_source).encode('utf-8'))

        # TODO: Read from memory instead of new output file
        # `with` fixes the original's leaked handle from a bare open();
        # BeautifulSoup consumes the file at construction time, so closing
        # immediately afterwards is safe.
        with open(html_out) as f_in:
            soup = BeautifulSoup(f_in, "html.parser")

        # For this one page
        rows = soup.find_all('td', class_="igede12b9e")  # List of Object IDs

        # First table row is empty
        log.info('{} records to scrape for this page'.format(len(rows) - 1))

        for j in range(1, len(rows)):
            self.parse_sale(j, rows, year, month, day)

        url = 'http://onlinerecords.orleanscivilclerk.com/RealEstate/' + \
              'SearchResults.aspx'
        log.info('Load URL {}'.format(url))
        self.driver.get(url)

        html_id = 'OptionsBar1_imgNext'
        log.info('Find next page button at HTML ID {}'.format(html_id))
        next_button_elem = self.driver.find_element_by_id(html_id)

        log.info('Click next page button')
        next_button_elem.click()
Example #26
0
    def main(self):
        """Drive the full cleaning pass: join, prep, clean, commit."""
        log.info('Clean')
        print('Cleaning...')

        log.debug('get_rows_from_query')
        joiner = Join(initial_date=self.initial_date,
                      until_date=self.until_date)
        rows = joiner.get_rows_from_query()

        log.debug('add_location_fields_temp_hack')
        joiner = Join(initial_date=self.initial_date,
                      until_date=self.until_date)
        rows = joiner.add_location_fields_temp_hack(rows)

        log.debug('len(rows): %d', len(rows))

        self.commit_rows(self.clean_rows(self.prep_rows(rows)))
Example #27
0
def cli(arguments):
    """Dispatch initialization based on the parsed command-line arguments."""
    # Catch any missed errors
    if cli_has_errors(arguments):
        return

    single = arguments['<single_date>']

    if single:  # Single date
        early_date = single
        late_date = single
        log.info('Initializing single date: {}.'.format(early_date))
    elif arguments['<early_date>'] and arguments['<late_date>']:  # Date range
        early_date = arguments['<early_date>']
        late_date = arguments['<late_date>']
        log.info('Initializing date range: {0} to {1}.'.format(
            early_date, late_date))
    else:  # No dates provided
        log.info('Initializing all dates that need it.')
        initialize()  # Default: initialize all in need.
        return

    # Reject reversed ranges before doing any work.
    if (datetime.strptime(early_date, "%Y-%m-%d") >
            datetime.strptime(late_date, "%Y-%m-%d")):
        raise BadDateRangeError("The date range does not make sense.")

    initialize(initial_date=early_date, until_date=late_date)
Example #28
0
def cli(arguments):
    """Initialize one date, a date range, or everything that needs it."""
    # Catch any missed errors
    if cli_has_errors(arguments):
        return

    single = arguments['<single_date>']
    range_start = arguments['<early_date>']
    range_end = arguments['<late_date>']

    if single:  # Single date
        early_date, late_date = single, single
        log.info('Initializing single date: {}.'.format(early_date))
    elif range_start and range_end:  # Date range
        early_date, late_date = range_start, range_end
        log.info('Initializing date range: {0} to {1}.'.format(
            early_date, late_date))
    else:  # No dates provided
        log.info('Initializing all dates that need it.')
        initialize()  # Default: initialize all in need.
        return

    # Check for errors
    early_datetime = datetime.strptime(early_date, "%Y-%m-%d")
    late_datetime = datetime.strptime(late_date, "%Y-%m-%d")

    if early_datetime > late_datetime:
        raise BadDateRangeError("The date range does not make sense.")

    initialize(initial_date=early_date, until_date=late_date)
Example #29
0
    def parse_sale(self, j, rows, year, month, day):
        """Parse single sale page and save HTML.

        Loads the sale's detail page, writes its HTML to the raw-data
        directory, and deletes the file again if it turns out to be an
        error page.

        :param j: Index into `rows` (row 0 is an empty header row).
        :param rows: Table cells holding the sale document IDs.
        :param year: Four-digit year string.
        :param month: Two-digit month string.
        :param day: Two-digit day string.
        """
        document_id = rows[j].string

        url = ('http://onlinerecords.orleanscivilclerk.com/RealEstate/' +
               'SearchResults.aspx?global_id={}&type=dtl').format(document_id)

        try:
            log.info('Load sale URL {}'.format(url))
            self.driver.get(url)
        except Exception:  # TODO
            log.exception('Error loading sale URL {}'.format(url))

        html = self.driver.page_source
        html_out = "{0}/data/raw/{1}-{2}-{3}/form-html/{4}.html".format(
            PROJECT_DIR, year, month, day, document_id)

        log.info('Save {}'.format(html_out))

        with open(html_out, "wb") as f_out:
            f_out.write(html.encode('utf-8'))

        try:
            # Was `assert not self.is_error_page(...)`: asserts are stripped
            # under `python -O`, so raise explicitly instead. The except
            # keeps the original best-effort cleanup semantics.
            if self.is_error_page(html_out):  # TODO: Read from memory
                raise ValueError('Scrape returned an error page')
        except Exception:  # TODO
            log.exception('Received error page')

            log.info('Deleting error page {}'.format(html_out))
            os.remove(html_out)
Example #30
0
def cli(arguments):
    """Scrape one date, a date range, or (by default) yesterday."""
    if cli_has_errors(arguments):
        return

    if arguments['<single_date>']:
        early_date = late_date = arguments['<single_date>']
        log.info('Scraping single date: {}'.format(early_date))
    elif arguments['<early_date>'] and arguments['<late_date>']:
        early_date = arguments['<early_date>']
        late_date = arguments['<late_date>']
        log.info('Scraping date range: {0} to {1}'.format(
            early_date, late_date))
    else:  # No dates provided. Default is to scrape previous day.
        log.info('Scraping yesterday')
        Scrape().main()
        return

    # Check for errors
    if (datetime.strptime(early_date, '%Y-%m-%d') >
            datetime.strptime(late_date, '%Y-%m-%d')):
        raise BadDateRangeError('Bad date range')

    Scrape(initial_date=early_date, until_date=late_date).main()
Example #31
0
    def main(self):
        """Run the join-then-clean pipeline and commit the results."""
        log.info('Clean')
        print('Cleaning...')

        # Both Join() instances cover the same window.
        window = dict(initial_date=self.initial_date,
                      until_date=self.until_date)

        log.debug('get_rows_from_query')
        rows = Join(**window).get_rows_from_query()

        log.debug('add_location_fields_temp_hack')
        rows = Join(**window).add_location_fields_temp_hack(rows)

        log.debug('len(rows): %d', len(rows))

        prepped = self.prep_rows(rows)
        cleaned = self.clean_rows(prepped)

        self.commit_rows(cleaned)
Example #32
0
    def enter_date_filed_from(self, search_date):
        """Click the "date filed from" field and type the search date."""
        element_id = 'x:1221134975.0:mkr:3'
        log.info('Find "date filed from" field at HTML ID {}'.format(
            element_id))

        field = self.driver.find_element_by_id(element_id)

        log.info('Click on "date filed from" field')
        field.click()

        log.info('Enter {} into "date filed from" field'.format(search_date))
        field.send_keys(search_date)
Example #33
0
    def enter_date_filed_to(self, search_date):
        """Click the "date filed to" field and type the search date."""
        element_id = 'x:96043147.0:mkr:3'
        log.info('Find "date filed to" field at HTML ID {}'.format(
            element_id))

        field = self.driver.find_element_by_id(element_id)

        log.info('Click on "date filed to" field')
        field.click()

        log.info('Enter {} into "date filed to" field'.format(search_date))
        field.send_keys(search_date)
Example #34
0
    def enter_password(self):
        """Fill the password field from the environment and submit."""
        element_id = 'Header1_txtPassword'
        log.info('Find password field at HTML ID {}'.format(element_id))

        field = self.driver.find_element_by_id(element_id)

        log.info('Enter password from environment variable')
        field.send_keys(os.environ.get('REAL_ESTATE_LRD_PASSWORD'))

        # Trigger search function. Don't use RETURN because PhantomJS fails.
        log.info('Press enter to submit credentials and log in')
        field.send_keys(Keys.ENTER)
Example #35
0
    def logout(self):
        """Log out of the site."""
        # No matter which page you're on, you can go back here and log out.
        url = ('http://onlinerecords.orleanscivilclerk.com/RealEstate/'
               'SearchEntry.aspx')
        log.info('Load {}'.format(url))
        self.driver.get(url)

        element_id = 'Header1_lnkLogout'
        log.info('Find logout button at HTML ID {}'.format(element_id))

        button = self.driver.find_element_by_id(element_id)

        log.info('Click logout button')
        button.click()
Example #36
0
    def cycle_through_dates(self):
        """For each date in range, search, parse results and save HTML.

        Searches one day at a time because the site caps results at 300
        per search; a single day shouldn't reach that ceiling.

        TODO: Make this asynchronous.
        """
        current_date = self.initial_date

        # `<=` instead of the original `!= until_date + 1 day`: the old
        # inequality looped forever whenever until_date preceded
        # initial_date.
        while current_date <= self.until_date:
            year = current_date.strftime('%Y')  # "2014"
            month = current_date.strftime('%m')  # "09"
            day = current_date.strftime('%d')  # "09"

            log.info('Search records for {}-{}-{}'.format(year, month, day))

            # Check if folder for this day exists. If not, then make one.
            pagedir = "{0}/data/raw/{1}-{2}-{3}/page-html".format(
                PROJECT_DIR, year, month, day)

            formdir = "{0}/data/raw/{1}-{2}-{3}/form-html".format(
                PROJECT_DIR, year, month, day)

            if not os.path.exists(pagedir):
                log.info('Create directory {}'.format(pagedir))
                os.makedirs(pagedir)
            if not os.path.exists(formdir):
                log.info('Create directory {}'.format(formdir))
                os.makedirs(formdir)

            search_date = '{}{}{}'.format(month, day, year)  # MMDDYYYY

            # The meat of this loop
            self.navigate_search_page(year, month, day)

            self.search_parameters(search_date)
            self.parse_results(year, month, day)

            current_date += timedelta(days=1)