Ejemplo n.º 1
0
    def geocode(self):
        """Geocode every location returned by ``get_rows_with_null_rating``.

        Each row is looked up through ``self.gmaps`` as
        "<street_number> <address>, New Orleans, LA". The response is turned
        into column values by ``process_google_results`` (per the original
        docstring: latitude, longitude, rating and ZIP) and written to the
        ``Location`` row with the same ``document_id``. Addresses without
        any geocoding result are logged and skipped. Every processed row is
        committed on its own.
        """
        print('\nGeocoding...')

        for location in self.get_rows_with_null_rating():
            query_address = "{0} {1}, New Orleans, LA".format(
                location.street_number, location.address)

            matches = self.gmaps.geocode(query_address)

            if len(matches) == 0:
                log.info('No geocoding results for: {}'.format(query_address))

                # TODO: Need to also note failure so future geocoding scripts
                #   don't keep trying and failing on the same addresses.
                #   Possibly update Location's `rating` and/or Cleaned's
                #   `location_publish` fields.
                continue

            values = self.process_google_results(matches)

            try:
                # The statement is built inside the savepoint so that any
                # error while building it is logged like an execution error.
                with SESSION.begin_nested():
                    SESSION.execute(
                        update(Location)
                        .values(values)
                        .where(Location.document_id == location.document_id))
                    SESSION.flush()
            except Exception as error:  # TODO: Handle specific errors.
                log.exception(error, exc_info=True)
                SESSION.rollback()

            SESSION.commit()
Ejemplo n.º 2
0
    def login(self):
        """Load the homepage, open the login form and submit credentials.

        Pauses after the login link, the username and the password steps
        (1, 1 and 5 seconds), then looks up the element with id
        "Header1_lnkLogout" to confirm that the login worked.

        Raises:
            Exception: Whatever the confirmation lookup raised, re-raised
                after "Login failed" and the error have been logged.
        """
        def pause(seconds, message):
            # Announce the wait in the log, then block for that long.
            log.info(message)
            time.sleep(seconds)

        self.load_homepage()
        self.find_login_link()
        pause(1.0, 'Sleep 1.0 second')

        self.enter_username()
        pause(1.0, 'Sleep 1.0 second')

        self.enter_password()
        pause(5.0, 'Sleep 5.0 seconds')

        try:
            # The lookup raising is what signals a failed login here.
            self.driver.find_element_by_id("Header1_lnkLogout")
            log.info("Login successful")
        except Exception as error:
            log.info("Login failed")
            log.exception(error)
            raise
Ejemplo n.º 3
0
    def parse_sale(self, j, rows, year, month, day):
        """Load one sale's detail page, save its HTML and drop error pages.

        Args:
            j: Index into ``rows`` of the sale to load.
            rows: Sequence of elements whose ``.string`` is the document ID.
            year: Year part of the output directory name.
            month: Month part of the output directory name.
            day: Day part of the output directory name.

        The page source is written to
        ``<PROJECT_DIR>/data/raw/<year>-<month>-<day>/form-html/<id>.html``.
        If ``is_error_page`` flags the saved file, or fails while checking
        it, the file is deleted again. A failure to load the URL is logged
        and whatever page the driver currently holds is still saved.
        """
        document_id = rows[j].string

        url = ('http://onlinerecords.orleanscivilclerk.com/RealEstate/' +
               'SearchResults.aspx?global_id={}&type=dtl').format(document_id)

        try:
            log.info('Load sale URL {}'.format(url))
            self.driver.get(url)
        except Exception:  # TODO
            log.exception('Error loading sale URL {}'.format(url))

        html = self.driver.page_source
        html_out = "{0}/data/raw/{1}-{2}-{3}/form-html/{4}.html".format(
            PROJECT_DIR, year, month, day, document_id)

        log.info('Save {}'.format(html_out))

        with open(html_out, "wb") as f_out:
            f_out.write(html.encode('utf-8'))

        # Explicit check instead of `assert`: assertions are removed when
        # Python runs with -O, which would leave error pages on disk.
        try:
            is_error = bool(self.is_error_page(html_out))  # TODO: From memory
        except Exception:  # TODO
            # As before, a check that itself fails discards the saved page.
            log.exception('Received error page')
            is_error = True
        else:
            if is_error:
                log.error('Received error page')

        if is_error:
            log.info('Deleting error page {}'.format(html_out))
            os.remove(html_out)
Ejemplo n.º 4
0
    def other_stuff_location_info(rows):
        """Normalize the ``location_info`` text of every row, in place.

        For each row:

        * district ordinals "1st" to "7th" are reduced to the bare digit;
        * the text is split into locations on ";" and into parts on ",";
        * parts are stripped, and blank parts or parts ending in ":" are
          dropped;
        * the remaining parts are re-joined with ", " and the non-empty
          locations with "; ".

        Args:
            rows: Iterable of dict-like rows with a string ``location_info``.

        Returns:
            The same ``rows`` object, with ``location_info`` rewritten.
        """
        ordinals = (
            ('1st', '1'), ('2nd', '2'), ('3rd', '3'), ('4th', '4'),
            ('5th', '5'), ('6th', '6'), ('7th', '7'),
        )

        for row in rows:
            text = row['location_info']

            # To remove district ordinal
            for ordinal, digit in ordinals:
                text = text.replace(ordinal, digit)

            locations = []

            for location in text.split(';'):
                parts = [part.strip() for part in location.split(',')]

                # Blank parts are skipped explicitly. Indexing [-1] on them
                # used to raise IndexError and log a traceback per blank.
                kept = [
                    part for part in parts
                    if part and not part.endswith(':')
                ]

                if kept:
                    locations.append(', '.join(kept))

            row['location_info'] = '; '.join(locations)

        return rows
Ejemplo n.º 5
0
    def other_stuff_location_info(rows):
        """Run checks for location_info."""
        for row in rows:
            # To remove district ordinal
            row['location_info'] = row['location_info'].replace('1st', '1')
            row['location_info'] = row['location_info'].replace('2nd', '2')
            row['location_info'] = row['location_info'].replace('3rd', '3')
            row['location_info'] = row['location_info'].replace('4th', '4')
            row['location_info'] = row['location_info'].replace('5th', '5')
            row['location_info'] = row['location_info'].replace('6th', '6')
            row['location_info'] = row['location_info'].replace('7th', '7')

            all_locations_text = ''

            list1 = row['location_info'].split(';')

            for i in list1:
                list2 = i.split(',')

                individiual_location_text = ''

                for j in list2:
                    try:
                        if j.strip()[-1] != ':':
                            # If first addition:
                            if individiual_location_text == '':
                                individiual_location_text = j.strip()
                            else:  # If second addition or later
                                individiual_location_text = (
                                    individiual_location_text + ', ' +
                                    j.strip())
                    except Exception as error:
                        log.exception(error, exc_info=True)
                        continue

                if all_locations_text == '':
                    if individiual_location_text != '':
                        all_locations_text = individiual_location_text.strip()
                else:
                    if individiual_location_text != '':
                        all_locations_text = (
                            all_locations_text + '; ' +
                            individiual_location_text.strip())

            row['location_info'] = all_locations_text

        return rows
Ejemplo n.º 6
0
    def check_geocoder_good_rating(self):
        """Mark locations with a precise geocoder rating as publishable.

        Sets ``location_publish`` to True on every ``Location`` whose
        ``rating`` is 'RANGE_INTERPOLATED' or 'ROOFTOP', then commits.
        """
        locations = SESSION.query(Location.rating, Location.location_publish)
        precise = locations.filter(
            (Location.rating == 'RANGE_INTERPOLATED') |
            (Location.rating == 'ROOFTOP'))
        precise.update({"location_publish": True})

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 7
0
    def check_geocoder_bad_rating(self):
        """Flag locations with a coarse or missing rating as unpublishable.

        Sets ``location_publish`` to False on every ``Location`` whose
        ``rating`` is 'GEOMETRIC_CENTER', 'APPROXIMATE' or NULL, then
        commits.
        """
        locations = SESSION.query(Location.rating, Location.location_publish)
        imprecise = locations.filter(
            (Location.rating == 'GEOMETRIC_CENTER') |
            (Location.rating == 'APPROXIMATE') |
            (Location.rating.is_(None)))
        imprecise.update({"location_publish": False})

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 8
0
    def __init__(self):
        """Connect to the database, creating it first if it is missing.

        When ``_database_connection`` raises ``OperationalError`` the error
        is printed and logged. ``create_db`` is called only if the error
        text is exactly the "database ... does not exist" message for
        ``DATABASE_NAME``; any other ``OperationalError`` is not re-raised.
        Afterwards ``_create_tables``, ``_import_neighorhoods`` and
        ``_spatial_index_on_cleaned_geom`` run and ``self.conn`` is closed.
        """
        try:
            self._database_connection()
        except OperationalError as error:
            print(error)
            log.exception(error, exc_info=True)

            # The missing-database case is recognised by its exact message.
            missing_db_message = (
                '(psycopg2.OperationalError) FATAL:  database "{}" '
                'does not exist'
            ).format(DATABASE_NAME)
            database_is_missing = str(error).strip() == missing_db_message

            if database_is_missing:
                self.create_db()

        self._create_tables()
        self._import_neighorhoods()
        self._spatial_index_on_cleaned_geom()

        self.conn.close()
Ejemplo n.º 9
0
    def other_stuff_addresses(rows):
        """Tidy the ``address`` text of every row, in place.

        Each address string is split into addresses on ";" and into parts
        on ",". Parts are stripped of surrounding whitespace and re-joined
        with ", "; non-empty addresses are re-joined with "; ". Blank parts
        that come before the first non-blank part of an address are
        dropped, while later blank parts are kept.

        Args:
            rows: Iterable of dict-like rows with a string ``address``.

        Returns:
            The same ``rows`` object, with ``address`` rewritten.
        """
        for row in rows:
            tidy_addresses = []

            for raw_address in row['address'].split(';'):
                parts = [part.strip() for part in raw_address.split(',')]

                # Joining everything and trimming the leading ", " runs
                # removes only the blank parts ahead of the first non-blank
                # one; blanks further along stay where they are.
                tidy = ', '.join(parts).lstrip(', ').strip()

                if tidy:
                    tidy_addresses.append(tidy)

            row['address'] = '; '.join(tidy_addresses)

        return rows
Ejemplo n.º 10
0
    def check_north_of_new_orleans(self):
        """Unpublish locations geocoded north of the New Orleans boundary.

        Sets ``location_publish`` to False on every ``Location`` whose
        ``latitude`` is greater than 30.181719, then commits.
        """
        # Latitude greater than 30.181719 is treated as north of New Orleans.
        too_far_north = SESSION.query(
            Location.latitude,
            Location.location_publish
        ).filter(Location.latitude > 30.181719)
        too_far_north.update({"location_publish": False})

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 11
0
    def check_west_of_new_orleans(self):
        """Unpublish locations geocoded west of the New Orleans boundary.

        Sets ``location_publish`` to False on every ``Location`` whose
        ``longitude`` is less than -90.140388, then commits.
        """
        # Longitude less than -90.140388 is treated as west of New Orleans.
        too_far_west = SESSION.query(
            Location.longitude,
            Location.location_publish
        ).filter(Location.longitude < -90.140388)
        too_far_west.update({"location_publish": False})

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 12
0
    def check_high_amount(self):
        """Unpublish sales with an unusually high amount (>= $20,000,000).

        Sets ``detail_publish`` to False on every ``Detail`` row whose
        ``amount`` is at least 20000000, then commits.
        """
        # Sales this large are possible but rare, so hold them back.
        suspiciously_high = SESSION.query(
            Detail.amount,
            Detail.detail_publish
        ).filter(Detail.amount >= 20000000)
        suspiciously_high.update({"detail_publish": False})

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 13
0
    def check_low_amount(self):
        """Unpublish sales with a zero or negative amount (<= $0).

        Sets ``detail_publish`` to False on every ``Detail`` row whose
        ``amount`` is 0 or less, then commits.
        """
        # These are of uncertain validity, so all of them are held back.
        suspiciously_low = SESSION.query(
            Detail.amount,
            Detail.detail_publish
        ).filter(Detail.amount <= 0)
        suspiciously_low.update({"detail_publish": False})

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 14
0
    def commit_rows(self, rows):
        """Insert the given rows into the ``Cleaned`` table, one at a time.

        Args:
            rows: Sized sequence of row values, each passed unchanged to
                ``insert(Cleaned).values(...)``.

        Each row is inserted inside its own savepoint and committed
        separately. A failed insert is logged and rolled back, and the loop
        moves on to the next row. The closing log line reports ``len(rows)``
        regardless of how many inserts failed.
        """
        total = len(rows)
        log.debug('Committing %d rows', total)

        for index, values in enumerate(rows):
            log.debug("Row %d", index)
            try:
                with SESSION.begin_nested():
                    SESSION.execute(insert(Cleaned).values(values))
                    SESSION.flush()
            except Exception as error:
                log.debug('count: %s', index)
                log.exception(error, exc_info=True)
                SESSION.rollback()

            SESSION.commit()

        log.debug('%d rows committed', total)
Ejemplo n.º 15
0
    def main(self):
        """Run the scrape: log in, cycle through the dates, then clean up.

        A login failure is logged and reported to the "#realestate" Slack
        channel; the browser is then shut down and the error re-raised.
        Errors raised while cycling through dates are logged and swallowed.
        The logout and browser shutdown run in either case.
        """
        def shut_down_browser():
            # Close the current window, then end the driver session.
            self.driver.close()
            self.driver.quit()

        try:
            self.login()
        except Exception:  # TODO
            log.exception("Problem during login")
            slack.chat.post_message("#realestate", "Expired password @channel")

            shut_down_browser()
            raise

        try:
            self.cycle_through_dates()
        except Exception:  # TODO
            log.exception("")
        finally:
            self.logout()
            shut_down_browser()
Ejemplo n.º 16
0
    def __init__(self):
        """Open the database connection and prepare the schema.

        An ``OperationalError`` from ``_database_connection`` is printed and
        logged. If its text equals the "database ... does not exist" message
        built from ``DATABASE_NAME``, ``create_db`` is called; other
        ``OperationalError`` values are not re-raised. After that,
        ``_create_tables``, ``_import_neighorhoods`` and
        ``_spatial_index_on_cleaned_geom`` run and ``self.conn`` is closed.
        """
        try:
            self._database_connection()
        except OperationalError as error:
            print(error)
            log.exception(error, exc_info=True)

            # Exact text expected when the database does not exist.
            template = (
                '(psycopg2.OperationalError) FATAL:  database "{}" '
                'does not exist')
            expected_text = template.format(DATABASE_NAME)
            actual_text = str(error).strip()

            if actual_text == expected_text:
                self.create_db()

        self._create_tables()
        self._import_neighorhoods()
        self._spatial_index_on_cleaned_geom()

        self.conn.close()
Ejemplo n.º 17
0
    def commit_rows(self, rows):
        """Write JOIN-ed rows to the ``Cleaned`` table, committing each one.

        Args:
            rows: Sized sequence of row values, each passed unchanged to
                ``insert(Cleaned).values(...)``.

        Every insert runs in its own savepoint. When one fails, the error is
        logged, the session is rolled back and processing continues with
        the next row. The final log line reports ``len(rows)``, not the
        number of successful inserts.
        """
        row_count = len(rows)
        log.debug('Committing %d rows', row_count)

        for position, row_values in enumerate(rows):
            log.debug("Row %d", position)
            try:
                with SESSION.begin_nested():
                    statement = insert(Cleaned).values(row_values)
                    SESSION.execute(statement)
                    SESSION.flush()
            except Exception as error:
                log.debug('count: %s', position)
                log.exception(error, exc_info=True)
                SESSION.rollback()

            SESSION.commit()

        log.debug('%d rows committed', row_count)
Ejemplo n.º 18
0
    def check_if_no_date(self):
        """Unpublish sales that are missing a document or recorded date.

        Sets ``detail_publish`` to False on every ``Detail`` row whose
        ``document_date`` or ``document_recorded`` is NULL, then commits.
        """
        # .is_(None) makes SQLAlchemy emit "IS NULL". The Python expression
        # `column is None` is evaluated on the spot to the constant False,
        # so the previous filter never matched any row.
        SESSION.query(
            Detail.document_date,
            Detail.document_recorded,
            Detail.detail_publish
        ).filter(
            (Detail.document_date.is_(None)) |
            (Detail.document_recorded.is_(None))
        ).update(
            {"detail_publish": False}
        )

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 19
0
    def make_all_locations_publishable(self):
        """Reset every location to publishable.

        Sets ``location_publish`` to True on all ``Location`` rows and
        commits. This is the optimistic starting point: rows are assumed
        publishable until a later check finds a reason to hold them back.
        """
        every_location = SESSION.query(Location.location_publish)
        every_location.update({"location_publish": True})

        # Only the flush is guarded; the bulk update above runs outside it.
        try:
            with SESSION.begin_nested():
                SESSION.flush()
        except Exception as error:
            log.exception(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()
Ejemplo n.º 20
0
    def other_stuff_addresses(rows):
        """Clean up the ``address`` text of every row, in place.

        The address string is split into addresses on ";" and each address
        into pieces on ",". Pieces are stripped and re-joined with ", ";
        addresses that end up non-empty are re-joined with "; ". Blank
        pieces ahead of the first non-blank piece are discarded, but blank
        pieces after it are kept.

        Args:
            rows: Iterable of dict-like rows with a string ``address``.

        Returns:
            The same ``rows`` object, with ``address`` rewritten.
        """
        for row in rows:
            cleaned = []

            for chunk in row['address'].split(';'):
                pieces = [piece.strip() for piece in chunk.split(',')]

                # Find the first non-blank piece; everything before it is
                # discarded.
                first_real = 0
                while first_real < len(pieces) and not pieces[first_real]:
                    first_real += 1

                address = ', '.join(pieces[first_real:]).strip()

                if address:
                    cleaned.append(address)

            row['address'] = '; '.join(cleaned)

        return rows
Ejemplo n.º 21
0
    def check_relative_date(self):
        """Unpublish sales with an implausible document date.

        Walks day by day from ``self.initial_date`` up to, but not
        including, ``self.until_date`` (both 'YYYY-MM-DD' strings). For the
        sales recorded on each of those days, ``detail_publish`` is set to
        False when the document date is more than 180 days before the
        recorded day, or later than the day before the recorded day. Each
        day's changes are committed before moving on.

        Nothing is done when ``initial_date`` is not before ``until_date``.
        """
        date_format = '%Y-%m-%d'

        # Convert date strings to date format
        current_date = datetime.strptime(
            self.initial_date, date_format).date()
        new_until_date = datetime.strptime(
            self.until_date, date_format).date()

        # `<` rather than `!=`: with a start date after the end date the
        # loop would keep stepping forward and never reach the end date.
        while current_date < new_until_date:
            # Thresholds are relative to the recorded day being checked.
            old_date = current_date - timedelta(days=180)
            previous_date = current_date - timedelta(days=1)

            # Copy date objects to date strings
            old_date_string = old_date.strftime(date_format)
            previous_date_string = previous_date.strftime(date_format)
            current_date_string = current_date.strftime(date_format)

            # Document date more than 180 days before the recorded day.
            try:
                with SESSION.begin_nested():
                    SESSION.query(
                        Detail.document_recorded,
                        Detail.document_date,
                        Detail.detail_publish
                    ).filter(
                        Detail.document_recorded == current_date_string
                    ).filter(
                        Detail.document_date < old_date_string
                    ).update({"detail_publish": False})

                    SESSION.flush()
            except Exception as error:
                log.exception(error, exc_info=True)
                SESSION.rollback()

            # Document date later than the day before the recorded day.
            # NOTE(review): if document_date is a plain date this also
            # matches documents dated on the recorded day itself - confirm
            # that is intended.
            try:
                with SESSION.begin_nested():
                    SESSION.query(
                        Detail.document_recorded,
                        Detail.document_date,
                        Detail.detail_publish
                    ).filter(
                        Detail.document_recorded == current_date_string
                    ).filter(
                        Detail.document_date > previous_date_string
                    ).update({
                        "detail_publish": False
                    })

                    SESSION.flush()
            except Exception as error:
                log.exception(error, exc_info=True)
                SESSION.rollback()

            SESSION.commit()

            current_date = current_date + timedelta(days=1)