Example #1
0
    def check_permanent_status_of_new_sales(self):
        """
        Examine first-time sales and assign True or False for permanent_flag.

        Walks every date that currently has no flag (inclusive of both
        endpoints) and checks each against the permanent date range that
        existed at the time that date was scraped.
        """
        log.debug('Check permanent status of new sales')

        # TODO: Is this function called for sales that have already been given
        # a False flag? Need to change that if so, because this only looks
        # for sales with no flag. Could change to check for None or False.

        # Get dates to inspect
        earliest_datetime = self.earliest_date_no_flag()
        latest_datetime = self.latest_date_no_flag()

        # For all folders (dates)
        # Loop is inclusive of latest_datetime: stop one day past it.
        while earliest_datetime != (latest_datetime + timedelta(days=1)):
            current_iteration_date = earliest_datetime.strftime('%Y-%m-%d')

            early_permanent_datetime = self.find_early_perm_date_when_scraped(
                current_iteration_date)

            late_permanent_datetime = self.find_late_perm_date_when_scraped(
                current_iteration_date)

            # For this date that is currently considered temporary (whether by
            # default or because it was previously confirmed to be temporary),
            # check on the permanent date range at the time of the scrape.

            self.update_this_dates_permanent_flag(earliest_datetime,
                                                  early_permanent_datetime,
                                                  late_permanent_datetime)

            earliest_datetime += timedelta(days=1)
Example #2
0
    def create_db(self):
        """Create the database, connect to it and install extensions."""
        log.debug('create_db')

        # Create the database itself, then wire this instance up to it and
        # install the Postgres extensions the project relies on.
        call(['createdb', DATABASE_NAME])
        self._database_connection()
        self._add_db_extensions()
Example #3
0
    def __init__(self, initial_date=None, until_date=None):
        """Store the date range bounds on the instance.

        :param initial_date: Start of the date range, or None.
        :param until_date: End of the date range, or None.
        """
        self.initial_date, self.until_date = initial_date, until_date

        log.debug('self.initial_date: %s', self.initial_date)
        log.debug('self.until_date: %s', self.until_date)
Example #4
0
    def __init__(self, initial_date=None, until_date=None):
        """Create class variables for date range and connect to database.

        :param initial_date: Start of the date range ('YYYY-MM-DD'), or None.
        :param until_date: End of the date range ('YYYY-MM-DD'), or None.
        """
        self.initial_date = initial_date
        self.until_date = until_date

        # Lazy %-args: the message is only built when DEBUG is enabled.
        log.debug('self.initial_date: %s', self.initial_date)
        log.debug('self.until_date: %s', self.until_date)
Example #5
0
    def __init__(self, initial_date=None, until_date=None):
        """Create class variables for date range and connect to database.

        :param initial_date: Start of the date range ('YYYY-MM-DD'), or None.
        :param until_date: End of the date range ('YYYY-MM-DD'), or None.
        """
        self.initial_date = initial_date
        self.until_date = until_date

        # Lazy %-args: the message is only built when DEBUG is enabled.
        log.debug('self.initial_date: %s', self.initial_date)
        log.debug('self.until_date: %s', self.until_date)
Example #6
0
    def list_parse(self, parser_name, table):
        """
        Parse data structured as a list of dicts.

        This is how `locations`, `vendees` and `vendors` returns.

        :param parser_name: Name of the parser class in the `parse` module.
        :param table: Database table name to commit parsed rows to.
        """
        initial_datetime = datetime.strptime(
            self.initial_date, '%Y-%m-%d').date()
        until_datetime = datetime.strptime(self.until_date, '%Y-%m-%d').date()

        # Inclusive of until_datetime: stop one day past it.
        while initial_datetime != (until_datetime + timedelta(days=1)):
            current_date = initial_datetime.strftime('%Y-%m-%d')

            # Lazy %-args: the message is only built when DEBUG is enabled.
            log.debug('Current date: %s', current_date)
            print(current_date)  # TODO: leftover console output; remove?

            glob_string = '{0}/data/raw/{1}/form-html/*.html'.format(
                PROJECT_DIR, current_date)

            for filepath in sorted(glob.glob(glob_string)):
                list_output = getattr(parse, parser_name)(filepath).form_list()

                # Because output might have multiple rows:
                for output in list_output:
                    self.commit_to_database(table, output)

            initial_datetime += timedelta(days=1)
Example #7
0
def initialize(initial_date=None, until_date=None):
    """Build, geocode, clean and publish records to the cleaned table.

    :param initial_date: Start of range ('YYYY-MM-DD'). When either bound is
        None, the range is derived from the existing data.
    :param until_date: End of range ('YYYY-MM-DD'); same default rule.
    """
    if initial_date is None or until_date is None:
        date_range = GetDates().get_date_range()
        initial_date = date_range['initial_date']
        until_date = date_range['until_date']

    # Lazy %-args; these are locals, not attributes, so no 'self.' label.
    log.debug('initial_date: %s', initial_date)
    log.debug('until_date: %s', until_date)

    # TODO: Wrap each stage in try/except with log.exception() so a single
    # failing stage does not abort the whole pipeline.
    Build(initial_date=initial_date, until_date=until_date).build_all()

    # Geocoding takes over an hour
    Geocode(initial_date=initial_date, until_date=until_date).geocode()
    Geocode().update_locations_with_neighborhoods()

    Publish(initial_date=initial_date, until_date=until_date).main()

    Clean(initial_date=initial_date, until_date=until_date).main()

    Clean(initial_date=initial_date,
          until_date=until_date).update_cleaned_geom()
Example #8
0
    def check_permanent_status_of_temp_sales(self):
        """
        Compare latest permanent date range to the range of sales in our
        database that are labeled "temporary." If any of those temporary sales
        now fall within the permanent range, re-scrape and re-initialize.
        """
        log.debug('Check permanent status of temporary sales')

        # Don't need to know temporary end date or permanent start date.
        # Only need to know temporary start date and permanent end date
        # to determine the dates that were temporary but are now permanent.
        # See find_date_range_to_rescrape_and_initialize() for logic.

        earliest_temp_datetime = self.earliest_date_temp_flag()

        if earliest_temp_datetime is None:  # No temporary sales
            return

        global_permanent_range_last_datetime = self.latest_permanent_datetime()

        # (start, end) span that should be redone, or None if no temporary
        # sale has become permanent yet.
        dates_to_redo = self.find_newly_permanent_date_range(
            earliest_temp_datetime,
            global_permanent_range_last_datetime)

        if dates_to_redo is not None:
            self.scrape_days(dates_to_redo[0], dates_to_redo[1])
Example #9
0
    def get_existing_until_date(self):
        """Return the most recent document_recorded date in `details`.

        :returns: The latest recorded date, or the day before OPENING_DATE
            when the table is empty (so the initialized date range starts
            from the beginning).
        """
        query_until_date = SESSION.query(Detail.document_recorded).order_by(
            Detail.document_recorded.desc()).limit(1).all()

        if len(query_until_date) == 0:  # No records at all
            # Make it so initialized date range will start from beginning.
            until_date = OPENING_DATE - timedelta(days=1)
        else:
            # limit(1) guarantees at most one row, so index it directly
            # instead of looping.
            until_date = query_until_date[0].document_recorded

        SESSION.close()

        return until_date
Example #10
0
    def list_parse(self, parser_name, table):
        """
        Parse data structured as a list of dicts.

        This is how `locations`, `vendees` and `vendors` returns.

        :param parser_name: Name of the parser class in the `parse` module.
        :param table: Database table name to commit parsed rows to.
        """
        initial_datetime = datetime.strptime(self.initial_date,
                                             '%Y-%m-%d').date()
        until_datetime = datetime.strptime(self.until_date, '%Y-%m-%d').date()

        # Inclusive of until_datetime: stop one day past it.
        while initial_datetime != (until_datetime + timedelta(days=1)):
            current_date = initial_datetime.strftime('%Y-%m-%d')

            # Lazy %-args: the message is only built when DEBUG is enabled.
            log.debug('Current date: %s', current_date)
            print(current_date)  # TODO: leftover console output; remove?

            glob_string = '{0}/data/raw/{1}/form-html/*.html'.format(
                PROJECT_DIR, current_date)

            for filepath in sorted(glob.glob(glob_string)):
                list_output = getattr(parse, parser_name)(filepath).form_list()

                # Because output might have multiple rows:
                for output in list_output:
                    self.commit_to_database(table, output)

            initial_datetime += timedelta(days=1)
Example #11
0
def initialize(initial_date=None, until_date=None):
    """Build, geocode, clean and publish records to the cleaned table.

    :param initial_date: Start of range ('YYYY-MM-DD'). When either bound is
        None, the range is derived from the existing data.
    :param until_date: End of range ('YYYY-MM-DD'); same default rule.
    """
    if initial_date is None or until_date is None:
        date_range = GetDates().get_date_range()
        initial_date = date_range['initial_date']
        until_date = date_range['until_date']

    # Lazy %-args; these are locals, not attributes, so no 'self.' label.
    log.debug('initial_date: %s', initial_date)
    log.debug('until_date: %s', until_date)

    # TODO: Wrap each stage in try/except with log.exception() so a single
    # failing stage does not abort the whole pipeline.
    Build(initial_date=initial_date, until_date=until_date).build_all()

    # Geocoding takes over an hour
    Geocode(initial_date=initial_date, until_date=until_date).geocode()
    Geocode().update_locations_with_neighborhoods()

    Publish(initial_date=initial_date, until_date=until_date).main()

    Clean(initial_date=initial_date, until_date=until_date).main()

    Clean(
        initial_date=initial_date,
        until_date=until_date
    ).update_cleaned_geom()
Example #12
0
    def check_permanent_status_of_temp_sales(self):
        """
        Compare latest permanent date range to the range of sales in our
        database that are labeled "temporary." If any of those temporary sales
        now fall within the permanent range, re-scrape and re-initialize.
        """
        log.debug('Check permanent status of temporary sales')

        # Don't need to know temporary end date or permanent start date.
        # Only need to know temporary start date and permanent end date
        # to determine the dates that were temporary but are now permanent.
        # See find_date_range_to_rescrape_and_initialize() for logic.

        earliest_temp_datetime = self.earliest_date_temp_flag()

        if earliest_temp_datetime is None:  # No temporary sales
            return

        global_permanent_range_last_datetime = self.latest_permanent_datetime()

        # (start, end) span that should be redone, or None if no temporary
        # sale has become permanent yet.
        dates_to_redo = self.find_newly_permanent_date_range(
            earliest_temp_datetime, global_permanent_range_last_datetime)

        if dates_to_redo is not None:
            self.scrape_days(dates_to_redo[0], dates_to_redo[1])
Example #13
0
    def get_rows_with_null_rating(self):
        """
        Return query result for locations with rating IS NULL.

        :returns: SQLAlchemy query result.
        """
        query = SESSION.query(
            Location.rating,
            Location.document_id,
            Location.street_number,
            Location.address
        ).join(
            Detail
        ).filter(
            Location.rating.is_(None)
        ).filter(
            # SQLAlchemy binds these values as parameters; the previous
            # str.format() wrapping added nothing.
            Detail.document_recorded >= self.initial_date
        ).filter(
            Detail.document_recorded <= self.until_date
        ).all()

        # Lazy %-args: only formatted when DEBUG is enabled.
        log.debug('Rows with rating is NULL: %d', len(query))

        SESSION.close()

        return query
Example #14
0
    def build_features_json(query):
        """Build a list of GeoJSON Feature dicts from query rows.

        Appends '*' to document_date when the location is not publishable,
        and a dagger when the sale is not yet confirmed permanent.

        :param query: Iterable of rows with sale/location attributes.
        :returns: List of GeoJSON 'Feature' dicts.
        """
        log.debug(len(query))
        features = []
        for row in query:
            # NOTE(review): this mutates row.document_date in place —
            # confirm the rows are not reused after this call.
            if not row.location_publish:
                row.document_date = row.document_date + "*"

            if not row.permanent_flag:
                row.document_date = row.document_date + u"\u2020"

            # Build each feature inline; no need for a pre-initialized dict.
            features.append({
                "type": "Feature",
                "properties": {
                    "document_date": row.document_date,
                    "address": row.address,
                    "location_info": row.location_info,
                    "amount": row.amount,
                    "buyers": row.buyers,
                    "sellers": row.sellers,
                    "instrument_no": row.instrument_no,
                    "location_publish": row.location_publish,
                    "permanent_flag": row.permanent_flag
                },
                "geometry": {
                    "type": "Point",
                    "coordinates": [row.longitude, row.latitude]
                }
            })

        return features
Example #15
0
    def check_permanent_status_of_new_sales(self):
        """
        Examine first-time sales and assign True or False for permanent_flag.

        Walks every date that currently has no flag (inclusive of both
        endpoints) and checks each against the permanent date range that
        existed at the time that date was scraped.
        """
        log.debug('Check permanent status of new sales')

        # TODO: Is this function called for sales that have already been given
        # a False flag? Need to change that if so, because this only looks
        # for sales with no flag. Could change to check for None or False.

        # Get dates to inspect
        earliest_datetime = self.earliest_date_no_flag()
        latest_datetime = self.latest_date_no_flag()

        # For all folders (dates)
        # Loop is inclusive of latest_datetime: stop one day past it.
        while earliest_datetime != (latest_datetime + timedelta(days=1)):
            current_iteration_date = earliest_datetime.strftime('%Y-%m-%d')

            early_permanent_datetime = self.find_early_perm_date_when_scraped(
                current_iteration_date)

            late_permanent_datetime = self.find_late_perm_date_when_scraped(
                current_iteration_date)

            # For this date that is currently considered temporary (whether by
            # default or because it was previously confirmed to be temporary),
            # check on the permanent date range at the time of the scrape.

            self.update_this_dates_permanent_flag(earliest_datetime,
                                                  early_permanent_datetime,
                                                  late_permanent_datetime)

            earliest_datetime += timedelta(days=1)
Example #16
0
    def get_locations(self):
        """Return SQL query of locations table for given date range."""
        log.debug('get_locations')

        # One row per document_id: publishable only if every location row is
        # publishable (bool_and); addresses and lot details are concatenated
        # into single '; '-separated strings (string_agg).
        subquery = SESSION.query(
            Location.document_id,
            func.bool_and(Location.location_publish).label('location_publish'),
            func.string_agg(
                cast(Location.street_number, Text) + ' ' +
                cast(Location.address, Text), '; ').label('address'),
            func.string_agg(
                'Unit: ' + cast(Location.unit, Text) + ', ' + 'Condo: ' +
                cast(Location.condo, Text) + ', ' + 'Weeks: ' +
                cast(Location.weeks, Text) + ', ' + 'Subdivision: ' +
                cast(Location.subdivision, Text) + ', ' + 'District: ' +
                cast(Location.district, Text) + ', ' + 'Square: ' +
                cast(Location.square, Text) + ', ' + 'Lot: ' +
                cast(Location.lot, Text), '; ').label('location_info')
            # todo: Once SQLAlchemy supports this, add these fields this way.
            # 'mode() WITHIN GROUP (ORDER BY locations.zip_code) AS zip_code',
            # 'mode() WITHIN GROUP (ORDER BY locations.latitude) AS latitude',
            # 'mode() WITHIN GROUP (ORDER BY locations.longitude) ' +
            # ' AS longitude',
            # 'mode() WITHIN GROUP (ORDER BY locations.neighborhood) ' +
            # 'AS neighborhood'
        ).group_by(Location.document_id).subquery()

        # NOTE(review): no date filter here despite the docstring — confirm
        # callers join this against a date-filtered details subquery.

        SESSION.close()

        return subquery
Example #17
0
    def get_existing_until_date(self):
        """Return the most recent document_recorded date in `details`.

        :returns: The latest recorded date, or the day before OPENING_DATE
            when the table is empty (so the initialized date range starts
            from the beginning).
        """
        query_until_date = SESSION.query(
            Detail.document_recorded
        ).order_by(
            Detail.document_recorded.desc()
        ).limit(1).all()

        if len(query_until_date) == 0:  # No records at all
            # Make it so initialized date range will start from beginning.
            until_date = OPENING_DATE - timedelta(days=1)
        else:
            # limit(1) guarantees at most one row, so index it directly
            # instead of looping.
            until_date = query_until_date[0].document_recorded

        SESSION.close()

        return until_date
Example #18
0
    def drop_tables(self):
        """DROP all tables except those for PostGIS."""
        # gather all data first before dropping anything.
        # some DBs lock after things have been dropped in
        # a transaction.

        log.debug('drop_tables')

        metadata = MetaData()
        tables = []
        all_foreign_keys = []

        # Build lightweight Table objects carrying only named FK constraints;
        # that is all DropConstraint/DropTable need.
        for table_name in self.inspector.get_table_names():
            foreign_keys = []
            for foreign_key in self.inspector.get_foreign_keys(table_name):
                if not foreign_key['name']:
                    # Unnamed constraints cannot be dropped by name; skip.
                    continue
                foreign_keys.append(
                    ForeignKeyConstraint((), (), name=foreign_key['name'])
                )
            table = Table(table_name, metadata, *foreign_keys)
            tables.append(table)
            all_foreign_keys.extend(foreign_keys)

        # Drop FKs first so the tables can then be dropped in any order.
        for foreign_key in all_foreign_keys:
            self.conn.execute(DropConstraint(foreign_key))

        for table in tables:
            # This table is part of PostGIS extension.
            if table.name == 'spatial_ref_sys':
                continue
            self.conn.execute(DropTable(table))

        self.trans.commit()
Example #19
0
    def _import_neighorhoods(self):
        """Import neighborhood shapefiles into the neighborhoods table."""
        # TODO: This causes errors on second run.

        log.debug('import_neighorhoods')

        # shp2pgsql emits SQL on stdout; it is piped into psql below.
        p1 = Popen(
            [
                'shp2pgsql',
                '-I',
                '-a',  # Append data to existing table. Don't create.
                ('{}/neighborhoods/shapefile/Neighborhood_Statistical_Areas'
                 ).format(GEO_DIR),
                'neighborhoods'
            ],
            stdout=PIPE)

        p2 = Popen(['psql', '-d', DATABASE_NAME], stdin=p1.stdout, stdout=PIPE)

        p1.stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
        p2.communicate()[0]  # Wait for the pipeline to finish.

        # If need to alter geometry's SRID
        # Tag the imported geometry with SRID 3452, then reproject the
        # column to WGS84 (4326) for storage.
        self.conn.execute("""
            SELECT updategeometrysrid('neighborhoods', 'geom', 3452);""")
        self.conn.execute("""
            ALTER TABLE neighborhoods
            ALTER COLUMN geom TYPE geometry(MultiPolygon, 4326)
            USING ST_Transform(geom, 4326);""")
Example #20
0
    def create_db(self):
        """Create the database, connect to it and install extensions."""
        log.debug('create_db')

        # Create the database itself, then wire this instance up to it and
        # install the Postgres extensions the project relies on.
        call(['createdb', DATABASE_NAME])
        self._database_connection()
        self._add_db_extensions()
Example #21
0
    def update_cleaned_geom(self):
        """Update the PostGIS geom field in the cleaned table."""
        log.debug('Update Cleaned geometry')

        statement = """UPDATE cleaned
            SET geom = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326);"""

        # Rebuild every row's point geometry from longitude/latitude.
        self.engine.execute(statement)
Example #22
0
    def update_cleaned_geom(self):
        """Update the PostGIS geom field in the cleaned table."""
        log.debug('Update Cleaned geometry')

        statement = """UPDATE cleaned
            SET geom = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326);"""

        # Rebuild every row's point geometry from longitude/latitude.
        self.engine.execute(statement)
Example #23
0
    def __init__(self, initial_date=None, until_date=None):
        """Initialize self variables and establish connection to database.

        :param initial_date: Start of the date range ('YYYY-MM-DD'), or None.
        :param until_date: End of the date range ('YYYY-MM-DD'), or None.
        """
        self.engine = create_engine(ENGINE_STRING)

        self.initial_date = initial_date
        self.until_date = until_date

        # Lazy %-args: the message is only built when DEBUG is enabled.
        log.debug('self.initial_date: %s', self.initial_date)
        log.debug('self.until_date: %s', self.until_date)
Example #24
0
    def __init__(self, initial_date=None, until_date=None):
        """Initialize self variables and establish connection to database.

        :param initial_date: Start of the date range ('YYYY-MM-DD'), or None.
        :param until_date: End of the date range ('YYYY-MM-DD'), or None.
        """
        self.engine = create_engine(ENGINE_STRING)

        self.initial_date = initial_date
        self.until_date = until_date

        # Lazy %-args: the message is only built when DEBUG is enabled.
        log.debug('self.initial_date: %s', self.initial_date)
        log.debug('self.until_date: %s', self.until_date)
Example #25
0
    def _spatial_index_on_cleaned_geom(self):
        """Create spatial index on cleaned table."""
        log.debug('spatial_index_on_cleaned_geom')

        statement = """
            CREATE INDEX index_cleaned_geom
            ON cleaned
            USING GIST(geom);"""

        # A GiST index speeds up spatial queries against cleaned.geom.
        self.conn.execute(statement)
Example #26
0
    def _spatial_index_on_cleaned_geom(self):
        """Create spatial index on cleaned table."""
        log.debug('spatial_index_on_cleaned_geom')

        statement = """
            CREATE INDEX index_cleaned_geom
            ON cleaned
            USING GIST(geom);"""

        # A GiST index speeds up spatial queries against cleaned.geom.
        self.conn.execute(statement)
Example #27
0
    def get_details(self):
        """Return SQL query of details table for given date range."""
        # Formatted to match the other get_details variant; SQLAlchemy binds
        # the raw values, so the previous str.format() wrapping added nothing.
        subquery = SESSION.query(
            Detail
        ).filter(
            Detail.document_recorded >= self.initial_date
        ).filter(
            Detail.document_recorded <= self.until_date
        ).subquery()

        log.debug(subquery)

        SESSION.close()

        return subquery
Example #28
0
    def send_email(self, msg):
        """Initialize and send the email.

        :param msg: A MIME message object; sent via msg.as_string().
        """
        log.debug('Mail')

        s = smtplib.SMTP('smtp.gmail.com', 587)
        s.ehlo()
        s.starttls()  # Upgrade the connection to TLS before logging in.
        s.ehlo()  # Re-identify over the encrypted channel.
        s.login(os.environ.get('REAL_ESTATE_GMAIL_USERNAME'),
                os.environ.get('REAL_ESTATE_GMAIL_PASSWORD'))
        s.sendmail(self.frm, self.to, msg.as_string())
        s.quit()
Example #29
0
    def form_dict(self):
        """
        Return dict of this sale's detail table using class self variables.

        :returns: A dict containing all of the details table values.
        """
        log.debug('form_dict')

        # Copy: the original returned self.__dict__ itself, so the `del`
        # below silently removed the `rows` attribute from the instance.
        dict_output = dict(self.__dict__)

        del dict_output['rows']

        return dict_output
Example #30
0
    def form_dict(self):
        """
        Return dict of this sale's detail table using class self variables.

        :returns: A dict containing all of the details table values.
        """
        log.debug('form_dict')

        # Copy: the original returned self.__dict__ itself, so the `del`
        # below silently removed the `rows` attribute from the instance.
        dict_output = dict(self.__dict__)

        del dict_output['rows']

        return dict_output
Example #31
0
    def drop_db():
        """Delete the database."""
        log.debug('drop DB')

        # Best-effort: a missing database is logged, not raised.
        try:
            call(['dropdb', '%s' % DATABASE_NAME])
        except Exception as error:
            log.debug(error, exc_info=True)
Example #32
0
    def commit_to_database(self, table, output):
        """Commit to database using nested transactions and exceptions.

        :param table: Name of the model class in the `db` module.
        :param output: Dict of column values for the INSERT.
        """
        try:
            # TODO: Is this the correct method for this?
            # begin_nested() opens a SAVEPOINT, so a failed insert rolls
            # back only this row rather than the whole session.
            with SESSION.begin_nested():
                i = insert(getattr(db, table))
                vals = i.values(output)
                SESSION.execute(vals)  # TODO: What is this?
                SESSION.flush()
        except Exception as error:
            log.debug(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()  # TODO: Should this be here?
Example #33
0
    def commit_to_database(self, table, output):
        """Commit to database using nested transactions and exceptions.

        :param table: Name of the model class in the `db` module.
        :param output: Dict of column values for the INSERT.
        """
        try:
            # TODO: Is this the correct method for this?
            # begin_nested() opens a SAVEPOINT, so a failed insert rolls
            # back only this row rather than the whole session.
            with SESSION.begin_nested():
                i = insert(getattr(db, table))
                vals = i.values(output)
                SESSION.execute(vals)  # TODO: What is this?
                SESSION.flush()
        except Exception as error:
            log.debug(error, exc_info=True)
            SESSION.rollback()

        SESSION.commit()  # TODO: Should this be here?
Example #34
0
    def __init__(self, initial_date=None, until_date=None):
        """Initialize self variables and establish connection to database.

        :param initial_date: Start of the date range ('YYYY-MM-DD'), or None.
        :param until_date: End of the date range ('YYYY-MM-DD'), or None.
        """
        engine_string = (
            'host=localhost dbname={0} user={1} password={2}').format(
                DATABASE_NAME, os.environ.get('REAL_ESTATE_DATABASE_USERNAME'),
                os.environ.get('REAL_ESTATE_DATABASE_PASSWORD'))
        self.conn = psycopg2.connect(engine_string)
        self.cursor = self.conn.cursor()

        self.initial_date = initial_date
        self.until_date = until_date

        # Lazy %-args: the message is only built when DEBUG is enabled.
        log.debug('self.initial_date: %s', self.initial_date)
        log.debug('self.until_date: %s', self.until_date)
Example #35
0
    def no_neighborhood_found(self):
        """If no neighborhood is found, update with "None" in nbhd field."""
        log.debug('no_neighborhood_found')

        # Replace NULL neighborhoods with the literal string "None".
        null_neighborhoods = SESSION.query(Location).filter(
            Location.neighborhood.is_(None))
        null_neighborhoods.update(
            {Location.neighborhood: "None"}, synchronize_session='fetch')

        SESSION.commit()
Example #36
0
    def get_details(self):
        """Return SQL query of details table for given date range."""
        # SQLAlchemy binds the raw values as parameters; the previous
        # str.format() wrapping added nothing.
        subquery = SESSION.query(
            Detail
        ).filter(
            Detail.document_recorded >= self.initial_date
        ).filter(
            Detail.document_recorded <= self.until_date
        ).subquery()

        log.debug(subquery)

        SESSION.close()

        return subquery
Example #37
0
    def scrape_days(early_date, late_date):
        """Re-scrape all days in the given inclusive date range.

        :param early_date: Start date ('YYYY-MM-DD').
        :param late_date: End date ('YYYY-MM-DD').
        """
        early_datetime = datetime.strptime(early_date, '%Y-%m-%d')
        log.debug(early_datetime)
        late_datetime = datetime.strptime(late_date, '%Y-%m-%d')
        log.debug(late_datetime)  # Bug fix: previously logged early twice.

        # Scrape those days over again
        log.info('scrape')
        try:
            Scrape(initial_date=early_datetime,
                   until_date=late_datetime).main()
        except Exception as error:
            log.error(error, exc_info=True)
Example #38
0
    def vacuum_database():
        """VACUUM the database to reclaim space from deleted rows."""
        log.debug('vacuum_database')

        try:
            # Make sure to get rid of deleted rows
            call(['psql', '%s' % DATABASE_NAME, '-c', 'VACUUM;'])
        except Exception as error:
            log.debug(error, exc_info=True)
Example #39
0
    def post_search(self, data):
        """Process incoming POST data and return the matching records."""
        log.debug('post_search')

        # Normalize the raw POST payload before querying.
        decoded = self.decode_data(data)
        prepared = self.convert_entries_to_db_friendly(decoded)

        log.debug(prepared)

        # If a geo query (search near me). Not yet a feature.
        # if 'latitude' in data and 'longitude' in data:
        #     response = self.geoquery_db(data)
        # else:
        return self.mapquery_db(prepared)
Example #40
0
def searchbar_input():
    """
    Receive a POST call from the autocomplete dropdown.

    Return a dict of suggestions.

    :param query: The search bar input.
    :type query: string
    :returns: A dict of matching suggestions.
    """
    term = request.args.get('q')
    log.debug('term: %s', term)

    return Models().searchbar_input(term)
Example #41
0
    def get_vendees(self):
        """Return SQL query of vendees table for given date range."""
        log.debug('get_vendees')

        # Concatenate first and last names, then aggregate per document.
        full_name = (
            cast(Vendee.vendee_firstname, Text) + " " +
            cast(Vendee.vendee_lastname, Text))
        buyers_column = func.string_agg(full_name, ', ').label('buyers')

        subquery = SESSION.query(
            Vendee.document_id,
            buyers_column
        ).group_by(
            Vendee.document_id
        ).subquery()

        SESSION.close()

        return subquery
Example #42
0
    def do_not_filter_by_map(self, data):
        """Count all matching rows, update the pager, and fetch one page."""
        matches = self.find_all_publishable_rows_fitting_criteria(data)

        data['number_of_records'] = len(matches)  # number of records

        # Total number of pages, rounding the last partial page up.
        record_total = float(data['number_of_records'])
        per_page = float(data['page_length'])
        data['number_of_pages'] = int(math.ceil(record_total / per_page))

        data = self.update_pager(data)

        page_query = self.find_page_of_publishable_rows_fitting_criteria(data)

        log.debug(page_query)

        return page_query
Example #43
0
    def get_rows_from_query(self):
        """Convert query result to row of dicts."""
        log.debug('get_rows_from_query')

        results = self.join_subqueries()

        # Pair each record's column names with its values.
        rows = [dict(zip(record.keys(), record)) for record in results]

        log.debug('len(rows): {}'.format(len(rows)))

        return rows
Example #44
0
    def scrape_days(early_date, late_date):
        """
        Re-scrape the records for the given date range.

        :param early_date: Start date as a 'YYYY-MM-DD' string.
        :param late_date: End date (inclusive) as a 'YYYY-MM-DD' string.
        """
        early_datetime = datetime.strptime(early_date, '%Y-%m-%d')
        log.debug(early_datetime)
        late_datetime = datetime.strptime(late_date, '%Y-%m-%d')
        # Bug fix: this previously logged early_datetime a second time.
        log.debug(late_datetime)

        # Scrape those days over again
        log.info('scrape')
        try:
            Scrape(
                initial_date=early_datetime,
                until_date=late_datetime
            ).main()
        except Exception as error:
            log.error(error, exc_info=True)
Example #45
0
    def get_rows_from_query(self):
        """Build a list of dicts, one per joined query record."""
        log.debug('get_rows_from_query')

        joined = self.join_subqueries()

        # Map each record's column names onto its values.
        rows = [dict(zip(rec.keys(), rec)) for rec in joined]

        log.debug('len(rows): {}'.format(len(rows)))

        return rows
Example #46
0
    def main(self):
        """Run through each check method."""
        log.debug('Publish')
        print('Publishing...')

        # Run in order: location checks first, then detail checks.
        checks = (
            self.make_all_locations_publishable,
            self.check_geocoder_bad_rating,
            self.check_geocoder_good_rating,
            self.check_west_of_new_orleans,
            self.check_east_of_new_orleans,
            self.check_north_of_new_orleans,
            self.check_south_of_new_orleans,
            self.make_all_details_publishable,
            self.check_if_no_date,
            self.check_relative_date,
            self.check_low_amount,
            self.check_high_amount,
        )
        for check in checks:
            check()
Example #47
0
    def get_search(self, request):
        """
        GET call for /realestate/search.

        :param request: The request object(?).
        :returns: A data dict, SQL query result and JS data.
        """
        data = self.parse_query_string(request)
        data = self.decode_data(data)
        data = self.convert_entries_to_db_friendly(data)

        data['update_date'] = self.get_last_updated_date()
        data['neighborhoods'] = self.get_neighborhoods()

        data = self.determine_pages(data)

        query = self.find_page_of_publishable_rows_fitting_criteria(data)

        # Format amount and date fields for display.
        for row in query:
            row.amount = get_num_with_curr_sign(row.amount)
            row.document_date = ymd_to_full_date(
                (row.document_date).strftime('%Y-%m-%d'), no_day=True)

        features = self.build_features_json(query)

        jsdata = {"type": "FeatureCollection", "features": features}

        data['results_css_display'] = 'none'

        # Show the "no results" message when nothing matched.
        if data['number_of_records'] == 0:
            data['current_page'] = 0
            data['results_css_display'] = 'block'

        data = self.revert_entries(data)

        data['map_button_state'] = False

        data['results_language'] = ResultsLanguage(data).main()

        # Bug fix: was log.debug('data'), which logged the literal string
        # 'data' rather than the assembled dict.
        log.debug(data)

        return data, query, jsdata
Example #48
0
    def get_vendors(self):
        """Return SQL query of vendors table for given date range."""
        log.debug('get_vendors')

        # Concatenate first and last names, then aggregate per document.
        seller_name = (
            cast(Vendor.vendor_firstname, Text) + " " +
            cast(Vendor.vendor_lastname, Text))
        sellers_column = func.string_agg(seller_name, ', ').label('sellers')

        subquery = SESSION.query(
            Vendor.document_id,
            sellers_column
        ).group_by(
            Vendor.document_id
        ).subquery()

        SESSION.close()

        return subquery
Example #49
0
    def build_all(self):
        """Run through all of the building methods."""
        log.debug('Build all')
        print('Building...')

        # Details come back as a dict, so they use the dict parser.
        log.debug('Detail')
        print('\nAdding to details table for:')
        self.dict_parse('DetailParser', 'Detail')

        # The remaining tables all come back as lists.
        list_steps = (
            ('Vendor', '\nAdding to vendors table for:', 'VendorParser'),
            ('Vendee', '\nAdding to vendees table for:', 'VendeeParser'),
            ('Location', '\nAdding to locations table for:', 'LocationParser'),
        )
        for table, message, parser_name in list_steps:
            log.debug(table)
            print(message)
            self.list_parse(parser_name, table)
Example #50
0
    def get_locations(self):
        """Return SQL query of locations table for given date range."""
        log.debug('get_locations')

        # Aggregate all street addresses for a document into one string.
        address_column = func.string_agg(
            cast(Location.street_number, Text) + ' ' +
            cast(Location.address, Text),
            '; '
        ).label('address')

        # Aggregate the remaining legal-description fields the same way.
        location_info_column = func.string_agg(
            'Unit: ' + cast(Location.unit, Text) + ', ' +
            'Condo: ' + cast(Location.condo, Text) + ', ' +
            'Weeks: ' + cast(Location.weeks, Text) + ', ' +
            'Subdivision: ' + cast(Location.subdivision, Text) + ', ' +
            'District: ' + cast(Location.district, Text) + ', ' +
            'Square: ' + cast(Location.square, Text) + ', ' +
            'Lot: ' + cast(Location.lot, Text),
            '; '
        ).label('location_info')

        # todo: Once SQLAlchemy supports this, add these fields this way.
        # 'mode() WITHIN GROUP (ORDER BY locations.zip_code) AS zip_code',
        # 'mode() WITHIN GROUP (ORDER BY locations.latitude) AS latitude',
        # 'mode() WITHIN GROUP (ORDER BY locations.longitude) ' +
        # ' AS longitude',
        # 'mode() WITHIN GROUP (ORDER BY locations.neighborhood) ' +
        # 'AS neighborhood'
        subquery = SESSION.query(
            Location.document_id,
            func.bool_and(Location.location_publish).label('location_publish'),
            address_column,
            location_info_column
        ).group_by(
            Location.document_id
        ).subquery()

        SESSION.close()

        return subquery
Example #51
0
    def dict_parse(self, parser_name, table):
        """
        Parse data structured in a dict, which is how `details` returns.

        :param parser_name: Name of the parser class in the `parse` module.
        :param table: Name of the destination database table.
        """
        initial_datetime = datetime.strptime(
            self.initial_date, '%Y-%m-%d').date()
        until_datetime = datetime.strptime(self.until_date, '%Y-%m-%d').date()

        # `<=` rather than the original `!= until + 1 day` so an inverted
        # range (initial after until) terminates immediately instead of
        # looping forever.
        while initial_datetime <= until_datetime:
            current_date = initial_datetime.strftime('%Y-%m-%d')
            log.debug('Current date: {}'.format(current_date))
            print(current_date)

            glob_string = '{0}/data/raw/{1}/form-html/*.html'.format(
                PROJECT_DIR, current_date)

            # Allows for variable calls to a class.
            # Ex module.Class().method -> parse.parser_name(f).list_output
            for filepath in sorted(glob.glob(glob_string)):
                # log.debug('filepath: {}'.format(filepath))
                dict_output = getattr(parse, parser_name)(filepath).form_dict()

                self.commit_to_database(table, dict_output)

            initial_datetime += timedelta(days=1)
Example #52
0
    def earliest_date_temp_flag(self):
        """
        Find earliest date with permanent_flag = False.

        :returns: datetime of the earliest such record, or None if no
            records have a False flag.
        """
        query = SESSION.query(
            func.min(Detail.document_recorded).label('early_date')
        ).filter(
            Detail.permanent_flag.is_(False)  # To satisfy PEP8
        ).all()

        # Initialize so an empty result set doesn't raise NameError
        # (the original left earliest_temp_date unbound in that case).
        earliest_temp_date = None
        for row in query:
            earliest_temp_date = row.early_date

        SESSION.close()

        if earliest_temp_date is None:
            return None

        # Promote the date to a datetime at midnight.
        earliest_temp_datetime = datetime.combine(
            earliest_temp_date, datetime.min.time())

        log.debug(earliest_temp_datetime)

        return earliest_temp_datetime
Example #53
0
    def neighborhood_found(self):
        """Use PostGIS to find which neighborhood a long/lat pair is in."""
        log.debug('neighborhood_found')

        # Build a WGS84 point from each location's coordinates.
        location_point = func.ST_SetSRID(
            func.ST_Point(
                cast(Location.longitude, Float),
                cast(Location.latitude, Float)
            ),
            4326
        )

        SESSION.query(
            Location
        ).filter(
            func.ST_Contains(Neighborhood.geom, location_point)
        ).update(
            {Location.neighborhood: Neighborhood.gnocdc_lab},
            synchronize_session='fetch'
        )

        SESSION.commit()
Example #54
0
    def _import_neighorhoods(self):
        """Import neighborhood shapefiles."""
        # TODO: This causes errors on second run.

        log.debug('import_neighorhoods')

        shapefile_path = (
            '{}/neighborhoods/shapefile/Neighborhood_Statistical_Areas'
        ).format(GEO_DIR)

        # Pipe shp2pgsql output into psql. '-a' appends to the existing
        # table instead of creating it.
        shp_proc = Popen(
            ['shp2pgsql', '-I', '-a', shapefile_path, 'neighborhoods'],
            stdout=PIPE)

        psql_proc = Popen(
            ['psql', '-d', DATABASE_NAME],
            stdin=shp_proc.stdout,
            stdout=PIPE)

        # Allow shp_proc to receive a SIGPIPE if psql_proc exits.
        shp_proc.stdout.close()
        psql_proc.communicate()[0]

        # If need to alter geometry's SRID
        self.conn.execute("""
            SELECT updategeometrysrid('neighborhoods', 'geom', 3452);""")
        self.conn.execute("""
            ALTER TABLE neighborhoods
            ALTER COLUMN geom TYPE geometry(MultiPolygon, 4326)
            USING ST_Transform(geom, 4326);""")
Example #55
0
    def commit_rows(self, rows):
        """Commit JOIN-ed rows to the cleaned table."""
        log.debug('Committing %d rows', len(rows))

        for count, row in enumerate(rows):
            log.debug("Row %d", count)
            try:
                # SAVEPOINT per row: if this insert fails, only this row
                # is rolled back, not previously inserted ones.
                with SESSION.begin_nested():
                    i = insert(Cleaned)
                    i = i.values(row)
                    SESSION.execute(i)
                    SESSION.flush()
            except Exception as error:
                log.debug('count: %s', count)
                log.exception(error, exc_info=True)
                SESSION.rollback()

            # Commit after every row so earlier successes survive a later
            # failure. NOTE(review): per-row commits are slow for large
            # batches — presumably deliberate here; confirm before changing.
            SESSION.commit()

        log.debug('%d rows committed', len(rows))
Example #56
0
    def join_subqueries(self):
        """Run a JOIN on subqueries."""
        log.debug('join_subqueries')

        vendees = self.get_vendees()
        vendors = self.get_vendors()
        locations = self.get_locations()

        log.debug('query...')

        # Columns pulled from details plus the aggregated subquery columns.
        columns = [
            Detail.document_id,
            Detail.amount,
            Detail.document_date,
            Detail.document_recorded,
            Detail.instrument_no,
            Detail.detail_publish,
            Detail.permanent_flag,
            vendees.c.buyers,
            vendors.c.sellers,
            locations.c.location_publish,
            locations.c.address,
            locations.c.location_info,
            # TODO: Once SQLAlchemy supports WITHIN GROUP, uncomment these.
            # locations.c.zip_code,
            # locations.c.latitude,
            # locations.c.longitude,
            # locations.c.neighborhood
        ]

        query = SESSION.query(
            *columns
        ).join(
            vendees
        ).join(
            vendors
        ).join(
            locations
        ).filter(
            Detail.document_recorded >= '{}'.format(self.initial_date)
        ).filter(
            Detail.document_recorded <= '{}'.format(self.until_date)
        ).all()

        log.debug('len(query): %d', len(query))

        SESSION.close()

        return query