def add_unique_hash(table_name):
    """Adds an md5 hash column of the preexisting columns
    and removes duplicate rows from a table.

    :param table_name: Name of table to add hash to.
    """
    logger.info('Begin (table_name: {})'.format(table_name))

    add_hash = '''
        DROP TABLE IF EXISTS temp;
        CREATE TABLE temp AS
          SELECT DISTINCT *,
                 md5(CAST(("{table_name}".*) AS text)) AS hash
          FROM "{table_name}";
        DROP TABLE "{table_name}";
        ALTER TABLE temp RENAME TO "{table_name}";
        ALTER TABLE "{table_name}" ADD PRIMARY KEY (hash);
    '''.format(table_name=table_name)

    try:
        postgres_engine.execute(add_hash)
    except Exception as e:
        raise PlenarioETLError(repr(e) +
                               '\n Failed to deduplicate with ' + add_hash)
    logger.info('End.')
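# A minimal sketch of add_unique_hash in isolation, assuming the module-level
# `postgres_engine` is configured. The table name and sample rows below are
# hypothetical; any freshly imported staging table works the same way. Because
# the md5 covers every preexisting column, two rows are duplicates only if
# they agree in every field.
def _demo_add_unique_hash():
    postgres_engine.execute('''
        DROP TABLE IF EXISTS demo_rows;
        CREATE TABLE demo_rows (name TEXT, value INTEGER);
        INSERT INTO demo_rows VALUES ('a', 1), ('a', 1), ('b', 2);
    ''')
    add_unique_hash('demo_rows')
    # The two identical ('a', 1) rows collapse to one, and `hash`
    # becomes the table's primary key.
    count = postgres_engine.execute('SELECT COUNT(*) FROM demo_rows').scalar()
    assert count == 2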
@classmethod
def tearDownClass(cls):
    try:
        postgres_engine.execute("drop table roadworks")
        postgres_engine.execute(
            "delete from meta_master where dataset_name = 'roadworks'")
    except ProgrammingError:
        pass
def setUp(self):
    postgres_session.rollback()
    # Ensure we have metadata loaded into the database
    # to mimic the behavior of metadata ingestion preceding file ingestion.
    drop_meta('dog_park_permits')
    drop_meta('community_radio_events')
    drop_meta('public_opera_performances')

    # Make new MetaTable objects
    self.unloaded_meta = MetaTable(url='nightvale.gov/events.csv',
                                   human_name='Community Radio Events',
                                   business_key='Event Name',
                                   observed_date='Date',
                                   latitude='lat', longitude='lon',
                                   approved_status=True)

    self.existing_meta = MetaTable(url='nightvale.gov/dogpark.csv',
                                   human_name='Dog Park Permits',
                                   business_key='Hooded Figure ID',
                                   observed_date='Date',
                                   latitude='lat', longitude='lon',
                                   approved_status=False)

    self.opera_meta = MetaTable(url='nightvale.gov/opera.csv',
                                human_name='Public Opera Performances',
                                business_key='Event Name',
                                observed_date='Date',
                                location='Location',
                                approved_status=False)

    postgres_session.add_all([self.existing_meta, self.opera_meta,
                              self.unloaded_meta])
    postgres_session.commit()

    # Also, let's have one table pre-loaded...
    self.existing_table = sa.Table('dog_park_permits', MetaData(),
                                   Column('hooded_figure_id', Integer),
                                   Column('point_date', TIMESTAMP, nullable=False),
                                   Column('date', Date, nullable=True),
                                   Column('lat', Float, nullable=False),
                                   Column('lon', Float, nullable=False),
                                   Column('hash', String(32), primary_key=True),
                                   Column('geom', Geometry('POINT', srid=4326),
                                          nullable=True))
    drop_if_exists(self.existing_table.name)
    self.existing_table.create(bind=postgres_engine)

    # ... with some pre-existing data
    ins = self.existing_table.insert().values(
        hooded_figure_id=1,
        point_date=date(2015, 1, 2),
        lon=-87.6495076896,
        lat=41.7915865543,
        geom=None,
        hash='addde9be7f59e95fc08e54e29b2a947f')
    postgres_engine.execute(ins)
def delete_absent_hashes(staging_name, existing_name):
    logger.info('Begin.')
    logger.info('staging_name: {}'.format(staging_name))
    logger.info('existing_name: {}'.format(existing_name))

    del_ = """DELETE FROM "{existing}"
              WHERE hash IN
                 (SELECT hash FROM "{existing}"
                  EXCEPT
                  SELECT hash FROM "{staging}");""".\
        format(existing=existing_name, staging=staging_name)

    try:
        postgres_engine.execute(del_)
    except Exception as e:
        raise PlenarioETLError(repr(e) + '\n Failed to execute ' + del_)
    logger.info('End.')
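# A minimal sketch of delete_absent_hashes with hypothetical table names:
# any hash present in the existing table but missing from the staging table
# is treated as a record deleted at the source and is removed.
def _demo_delete_absent_hashes():
    postgres_engine.execute('''
        DROP TABLE IF EXISTS demo_staging;
        DROP TABLE IF EXISTS demo_existing;
        CREATE TABLE demo_staging (hash TEXT);
        CREATE TABLE demo_existing (hash TEXT);
        INSERT INTO demo_staging VALUES ('kept');
        INSERT INTO demo_existing VALUES ('kept'), ('absent');
    ''')
    delete_absent_hashes('demo_staging', 'demo_existing')
    remaining = postgres_engine.execute(
        'SELECT hash FROM demo_existing').fetchall()
    assert [r[0] for r in remaining] == ['kept']  # 'absent' was deleted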
def test_location_col_update(self):
    drop_if_exists(self.opera_meta.dataset_name)

    self.opera_table = sa.Table(self.opera_meta.dataset_name, MetaData(),
                                Column('event_name', String, primary_key=True),
                                Column('date', Date, nullable=True),
                                Column('location', String, nullable=False),
                                Column('geom', Geometry('POINT', srid=4326),
                                       nullable=True),
                                Column('point_date', TIMESTAMP, nullable=False))
    drop_if_exists(self.existing_table.name)
    self.opera_table.create(bind=postgres_engine)

    ins = self.opera_table.insert().values(
        event_name='quux',
        date=None,
        point_date=date(2015, 1, 2),
        location='(-87.6495076896,41.7915865543)',
        geom=None)
    postgres_engine.execute(ins)
def knn(lng, lat, k, network, sensors):
    """Execute a spatial query to select k nearest neighbors given some point.

    :param lng: (float) longitude
    :param lat: (float) latitude
    :param k: (int) number of results to return
    :param network: (str) name of the sensor network to search within
    :param sensors: (list) sensor names; only nodes reporting at least
                    one of them are returned
    :returns: (list) of nearest k neighbors
    """
    # Convert lng-lat to geojson point
    point = "'" + json.dumps({
        'type': 'Point',
        'coordinates': [lng, lat]
    }) + "'"

    # How many to limit the initial bounding box query to
    k_10 = k * 10

    # Based off snippet provided on pg 253 of PostGIS In Action (2nd Edition).
    # The inner query uses the index-assisted box-distance operator <#> to
    # gather k * 10 candidates cheaply; the outer query re-ranks them by
    # exact ST_Distance and keeps the top k.
    query = """
    WITH bbox_results AS (
        SELECT
            node,
            location,
            array_agg(sensor) AS sensors,
            (SELECT ST_SetSRID(ST_GeomFromGeoJSON({geojson}), 4326)) AS ref_geom
        FROM sensor__node_metadata
        JOIN sensor__sensor_to_node ON id = node
        WHERE sensor_network = '{network}'
        GROUP BY node, location
        ORDER BY location <#> (SELECT ST_SetSRID(ST_GeomFromGeoJSON({geojson}), 4326))
        LIMIT {k_10}
    )
    SELECT node, RANK() OVER (ORDER BY ST_Distance(location, ref_geom)) AS act_r
    FROM bbox_results
    WHERE sensors && '{sensors}'::VARCHAR[]
    ORDER BY act_r
    LIMIT {k};
    """.format(
        geojson=point,
        network=network,
        k=k,
        k_10=k_10,
        sensors='{' + ','.join(sensors) + '}'
    )

    return postgres_engine.execute(query).fetchall()
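# Hypothetical call, assuming a network named 'array_of_things' whose nodes
# report sensors named 'tmp112' and 'bmp180' (all three names illustrative):
#
#     nearest = knn(lng=-87.6278, lat=41.8781, k=5,
#                   network='array_of_things',
#                   sensors=['tmp112', 'bmp180'])
#     # -> up to 5 (node, rank) rows, ordered by true distance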
def add(self):
    staging_name = 'staging_{}'.format(self.table_name)

    with ETLFile(self.source_path, self.source_url,
                 interpret_as='bytes') as file_helper:
        handle = open(file_helper.handle.name, "rb")
        with zipfile.ZipFile(handle) as shapefile_zip:
            import_shapefile(shapefile_zip, staging_name)

    add_unique_hash(staging_name)

    try:
        postgres_engine.execute('drop table {}'.format(self.table_name))
    except ProgrammingError:
        pass

    rename_table = 'alter table {} rename to {}'
    rename_table = rename_table.format(staging_name, self.table_name)
    postgres_engine.execute(rename_table)

    self.meta.update_after_ingest()
    postgres_session.commit()
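# Hypothetical driver for the add() flow above, assuming an ETL instance
# wired up with metadata and a local shapefile zip (the path is illustrative):
#
#     etl = ShapeETL(meta=shape_meta, source_path='/tmp/neighborhoods.zip')
#     etl.add()  # import into staging_<name>, hash and dedupe, swap into place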
def __enter__(self):
    """Add a table (prefixed with n_) to the database with one record
    for each record found in the staging table with a hash
    not present in the existing table.
    If there are no such records, do nothing.
    """
    # Create n_table with point_date, geom, and id columns.
    s = self.staging
    e = self.existing
    d = self.dataset

    derived_dates = func.cast(s.c[d.date], TIMESTAMP).label('point_date')
    derived_geoms = self._geom_col()
    cols_to_insert = [s.c['hash'], derived_dates, derived_geoms]

    # Select the hash and the columns we're deriving from the staging table.
    sel = select(cols_to_insert)

    # And limit our results to records whose hashes
    # aren't already present in the existing table (an anti-join).
    sel = sel.select_from(s.outerjoin(e, s.c['hash'] == e.c['hash'])).\
        where(e.c['hash'].is_(None))

    # Drop the table first out of healthy paranoia.
    self._drop()
    try:
        self.table.create(bind=postgres_engine)
    except Exception as e:
        raise PlenarioETLError(repr(e) +
                               '\nCould not create table n_' + d.name)

    ins = self.table.insert().from_select(cols_to_insert, sel)
    # Populate it with records from our select statement.
    try:
        postgres_engine.execute(ins)
    except Exception as e:
        raise PlenarioETLError(repr(e) + '\n' + str(sel))
    else:
        # Would be nice to check if we have new records or not right here.
        return self
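# For reference, the anti-join built in __enter__ compiles to SQL of roughly
# this shape (table names illustrative):
#
#     SELECT s.hash, CAST(s."Date" AS TIMESTAMP) AS point_date, <geom expr>
#     FROM staging s
#     LEFT OUTER JOIN existing e ON s.hash = e.hash
#     WHERE e.hash IS NULL
#
# Only staging rows with no hash match in the existing table survive.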
def wban2CallSign(wban_code):
    sql = "SELECT call_sign FROM weather_stations where wban_code='%s'" % wban_code
    result = engine.execute(sql)
    x = result.first()
    cs = None
    if x:
        cs = x[0]
    else:
        print("could not find wban:", wban_code)
    return cs
def test_update_with_change(self):
    drop_if_exists(self.unloaded_meta.dataset_name)

    etl = PlenarioETL(self.unloaded_meta, source_path=self.radio_path)
    table = etl.add()

    changed_path = os.path.join(fixtures_path,
                                'community_radio_events_changed.csv')
    etl = PlenarioETL(self.unloaded_meta, source_path=changed_path)
    etl.update()

    sel = sa.select([table.c.date]).where(table.c.event_name == 'baz')
    changed_date = postgres_engine.execute(sel).fetchone()[0]
    self.assertEqual(changed_date, date(1993, 11, 10))
def test_update(self):
    # Try to ingest slightly changed shape
    fixture = shape_fixtures['changed_neighborhoods']
    # Add the fixture to the registry first
    shape_meta = postgres_session.query(ShapeMetadata).get(
        'chicago_neighborhoods')
    # Do a ShapeETL update
    ShapeETL(meta=shape_meta, source_path=fixture.path).update()

    t = shape_meta.shape_table
    sel = t.select().where(t.c['sec_neigh'] == 'ENGLEWOOD')
    res = engine.execute(sel).fetchall()
    altered_value = res[0]['pri_neigh']
    # I changed Englewood to Englerwood :P
    self.assertEqual(altered_value, 'Englerwood')
def insert(self):
    """Join with the staging table to insert complete records
    into the existing table.
    """
    derived_cols = [c for c in self.table.c
                    if c.name in {'geom', 'point_date'}]
    staging_cols = [c for c in self.staging.c]
    sel_cols = staging_cols + derived_cols

    sel = select(sel_cols).where(self.staging.c.hash == self.table.c.hash)
    ins = self.existing.insert().from_select(sel_cols, sel)

    try:
        postgres_engine.execute(ins)
    except Exception as e:
        raise PlenarioETLError(repr(e) +
                               '\n Failed on statement: ' + str(ins))
    try:
        _null_malformed_geoms(self.existing)
    except Exception as e:
        raise PlenarioETLError(repr(e) +
                               '\n Failed to null out geoms with (0,0) geocoding')
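# Roughly, the generated INSERT has this shape (names illustrative):
#
#     INSERT INTO existing (<staging columns>, geom, point_date)
#     SELECT s.*, n.geom, n.point_date
#     FROM staging s JOIN n_table n ON s.hash = n.hash
#
# so each new record arrives with its raw fields plus the derived columns.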
def _null_malformed_geoms(existing):
    # We decide to set the geom to NULL when the given lon/lat is (0, 0)
    # (off the coast of Africa).
    upd = existing.update().values(geom=None).\
        where(existing.c.geom ==
              select([func.ST_SetSRID(func.ST_MakePoint(0, 0), 4326)]))
    postgres_engine.execute(upd)
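# Equivalent SQL, for reference:
#
#     UPDATE existing SET geom = NULL
#     WHERE geom = (SELECT ST_SetSRID(ST_MakePoint(0, 0), 4326));
#
# A (0, 0) point is the usual signature of a failed geocode ("null island"),
# so it is safer stored as missing than plotted off the coast of Africa.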
def _add_trigger(self):
    add_trigger = """CREATE TRIGGER audit_after
                     AFTER DELETE OR UPDATE ON "{table}"
                     FOR EACH ROW EXECUTE PROCEDURE audit.if_modified()""".\
        format(table=self.dataset.name)
    postgres_engine.execute(add_trigger)
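# A matching manual teardown, should the trigger ever need removing by hand
# (hypothetical; not part of the class above):
#
#     DROP TRIGGER IF EXISTS audit_after ON "<table>";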
def all_callSigns():
    sql = "SELECT call_sign FROM weather_stations ORDER BY call_sign"
    result = engine.execute(sql)
    return [x[0] for x in result.fetchall()]
def _drop(self):
    postgres_engine.execute("DROP TABLE IF EXISTS {};".format(self.name))
def drop_meta(table_name):
    del_ = "DELETE FROM meta_master WHERE dataset_name = '{}';".format(table_name)
    postgres_engine.execute(del_)