@classmethod
def tearDownClass(cls):
    try:
        app_engine.execute("DROP TABLE roadworks")
        app_engine.execute("DELETE FROM meta_master WHERE dataset_name = 'roadworks'")
    except ProgrammingError:
        pass
def find_intersecting_shapes(geojson):
    """Respond with all shape datasets that intersect with the provided geojson.
    Also include how many geom rows of each dataset intersect.

    :param geojson: URL-encoded GeoJSON.
    """
    fragment = make_fragment_str(extract_first_geometry_fragment(geojson))

    try:
        # First, do a bounding box check on all shape datasets.
        query = '''
            SELECT m.dataset_name
            FROM meta_shape AS m
            WHERE m.bbox && (SELECT ST_GeomFromGeoJSON('{geojson_fragment}'));
        '''.format(geojson_fragment=fragment)
        intersecting_datasets = engine.execute(query)
        bounding_box_intersection_names = [
            dataset.dataset_name for dataset in intersecting_datasets
        ]
    except Exception as e:
        print 'Error finding candidates'
        raise e

    try:
        # Then, for the candidates, get a count of the rows that intersect.
        response_objects = []
        for dataset_name in bounding_box_intersection_names:
            num_intersections_query = '''
                SELECT count(g.geom) AS num_geoms
                FROM {dataset_name} AS g
                WHERE ST_Intersects(g.geom, ST_GeomFromGeoJSON('{geojson_fragment}'))
            '''.format(dataset_name=dataset_name, geojson_fragment=fragment)
            num_intersections = engine.execute(num_intersections_query)\
                .first().num_geoms
            if num_intersections > 0:
                response_objects.append({
                    'dataset_name': dataset_name,
                    'num_geoms': num_intersections
                })
    except Exception as e:
        print 'Error narrowing candidates'
        raise e

    response_skeleton = {
        'meta': {
            'status': 'ok',
            'message': '',
        },
        'objects': response_objects
    }
    resp = make_response(json.dumps(response_skeleton), 200)
    return resp
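# A hedged usage sketch for the endpoint above. The call must run inside
# a Flask app/request context because of make_response; the polygon
# (roughly downtown Chicago) is illustrative only:
import json
import urllib

sample_polygon = {
    'type': 'Polygon',
    'coordinates': [[[-87.64, 41.87], [-87.62, 41.87], [-87.62, 41.89],
                     [-87.64, 41.89], [-87.64, 41.87]]]
}
encoded = urllib.quote(json.dumps(sample_polygon))
# resp = find_intersecting_shapes(encoded)
# The JSON body of resp lists each intersecting shape dataset with its
# 'num_geoms' count under 'objects'.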
def setUp(self):
    session.rollback()
    # Ensure we have metadata loaded into the database
    # to mimic the behavior of metadata ingestion preceding file ingestion.
    drop_meta('dog_park_permits')
    drop_meta('community_radio_events')
    drop_meta('public_opera_performances')

    # Make new MetaTable objects
    self.unloaded_meta = MetaTable(url='nightvale.gov/events.csv',
                                   human_name='Community Radio Events',
                                   business_key='Event Name',
                                   observed_date='Date',
                                   latitude='lat', longitude='lon',
                                   approved_status=True)

    self.existing_meta = MetaTable(url='nightvale.gov/dogpark.csv',
                                   human_name='Dog Park Permits',
                                   business_key='Hooded Figure ID',
                                   observed_date='Date',
                                   latitude='lat', longitude='lon',
                                   approved_status=False)

    self.opera_meta = MetaTable(url='nightvale.gov/opera.csv',
                                human_name='Public Opera Performances',
                                business_key='Event Name',
                                observed_date='Date',
                                location='Location',
                                approved_status=False)
    session.add_all(
        [self.existing_meta, self.opera_meta, self.unloaded_meta])
    session.commit()

    # Also, let's have one table pre-loaded...
    self.existing_table = sa.Table(
        'dog_park_permits', MetaData(),
        Column('hooded_figure_id', Integer),
        Column('point_date', TIMESTAMP, nullable=False),
        Column('date', Date, nullable=True),
        Column('lat', Float, nullable=False),
        Column('lon', Float, nullable=False),
        Column('hash', String(32), primary_key=True),
        Column('geom', Geometry('POINT', srid=4326), nullable=True))
    drop_if_exists(self.existing_table.name)
    self.existing_table.create(bind=app_engine)

    # ... with some pre-existing data
    ins = self.existing_table.insert().values(
        hooded_figure_id=1,
        point_date=date(2015, 1, 2),
        lon=-87.6495076896,
        lat=41.7915865543,
        geom=None,
        hash='addde9be7f59e95fc08e54e29b2a947f')
    app_engine.execute(ins)
def delete_absent_hashes(staging_name, existing_name):
    del_ = """DELETE FROM "{existing}"
              WHERE hash IN
                  (SELECT hash FROM "{existing}"
                   EXCEPT
                   SELECT hash FROM "{staging}");""".\
        format(existing=existing_name, staging=staging_name)

    try:
        engine.execute(del_)
    except Exception as e:
        raise PlenarioETLError(repr(e) + '\nFailed to execute: ' + del_)
def table_exists(table_name):
    """Make an inexpensive query to the database.
    If the table does not exist, the query will cause a ProgrammingError.

    :param table_name: (string) table name
    :returns: (bool) True if the table exists, False otherwise
    """
    try:
        app_engine.execute("SELECT '{}'::regclass".format(table_name))
        return True
    except ProgrammingError:
        return False
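# A minimal usage sketch, assuming the module-level app_engine used
# above is connected ('weather_stations' is an illustrative name):
if table_exists('weather_stations'):
    print 'weather_stations is present'
else:
    print 'weather_stations is missing'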
def _hash_update(staging, existing):
    delete_absent_hashes(staging.name, existing.name)

    # Select all records from staging
    # whose hashes are not present in existing.
    join_condition = staging.c['hash'] == existing.c['hash']
    sel = staging.select()\
        .select_from(staging.outerjoin(existing, join_condition)).\
        where(existing.c['hash'] == None)

    # Insert those into existing
    col_names = [col.name for col in existing.columns]
    ins = existing.insert().from_select(col_names, sel)
    try:
        engine.execute(ins)
    except Exception as e:
        raise PlenarioETLError(repr(e) + '\n' + str(sel))
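# For intuition, the statement _hash_update builds is roughly the raw
# SQL below, sketched with illustrative table names ("s_roadworks" is
# the staging copy, "roadworks" the existing table):
hash_update_sql = '''
    INSERT INTO "roadworks"
    SELECT s.*
    FROM "s_roadworks" AS s
    LEFT OUTER JOIN "roadworks" AS e ON s.hash = e.hash
    WHERE e.hash IS NULL;
'''
# engine.execute(hash_update_sql)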
def test_location_col_update(self):
    drop_if_exists(self.opera_meta.dataset_name)

    self.opera_table = sa.Table(
        self.opera_meta.dataset_name, MetaData(),
        Column('event_name', String, primary_key=True),
        Column('date', Date, nullable=True),
        Column('location', String, nullable=False),
        Column('geom', Geometry('POINT', srid=4326), nullable=True),
        Column('point_date', TIMESTAMP, nullable=False))
    drop_if_exists(self.existing_table.name)
    self.opera_table.create(bind=app_engine)

    ins = self.opera_table.insert().values(
        event_name='quux',
        date=None,
        point_date=date(2015, 1, 2),
        location='(-87.6495076896,41.7915865543)',
        geom=None)
    app_engine.execute(ins)
def __enter__(self):
    """Add a table (prefixed with n_) to the database with one record
    for each record found in the staging table
    with a hash not present in the existing table.
    If there are no such records, do nothing.
    """
    # Create the n_ table with hash, point_date, and geom columns.
    s = self.staging
    e = self.existing
    d = self.dataset

    derived_dates = func.cast(s.c[d.date], TIMESTAMP).label('point_date')
    derived_geoms = self._geom_col()
    cols_to_insert = [s.c['hash'], derived_dates, derived_geoms]

    # Select the hash and the columns we're deriving from the staging table.
    sel = select(cols_to_insert)
    # And limit our results to records
    # whose hashes aren't already present in the existing table.
    sel = sel.select_from(s.outerjoin(e, s.c['hash'] == e.c['hash'])).\
        where(e.c['hash'] == None)

    # Drop the table first out of healthy paranoia.
    self._drop()
    try:
        self.table.create(bind=engine)
    except Exception as e:
        raise PlenarioETLError(
            repr(e) + '\nCould not create table n_' + d.name)

    # Populate it with records from our select statement.
    ins = self.table.insert().from_select(cols_to_insert, sel)
    try:
        engine.execute(ins)
    except Exception as e:
        raise PlenarioETLError(repr(e) + '\n' + str(sel))
    else:
        # Would be nice to check if we have new records or not right here.
        return self
def wban2CallSign(wban_code):
    sql = "SELECT call_sign FROM weather_stations WHERE wban_code='%s'" % wban_code
    result = engine.execute(sql)
    x = result.first()
    cs = None
    if x:
        cs = x[0]
    else:
        print "could not find wban:", wban_code
    return cs
def test_update_with_change(self):
    drop_if_exists(self.unloaded_meta.dataset_name)

    etl = PlenarioETL(self.unloaded_meta, source_path=self.radio_path)
    table = etl.add()

    changed_path = os.path.join(fixtures_path,
                                'community_radio_events_changed.csv')
    etl = PlenarioETL(self.unloaded_meta, source_path=changed_path)
    etl.update()

    sel = sa.select([table.c.date]).where(table.c.event_name == 'baz')
    changed_date = app_engine.execute(sel).fetchone()[0]
    self.assertEqual(changed_date, date(1993, 11, 10))
def test_update(self):
    # Try to ingest slightly changed shape
    fixture = fixtures['changed_neighborhoods']
    # Add the fixture to the registry first
    shape_meta = session.query(ShapeMetadata).get('chicago_neighborhoods')
    # Do a ShapeETL update
    ShapeETL(meta=shape_meta, source_path=fixture.path).update()

    t = shape_meta.shape_table
    sel = t.select().where(t.c['sec_neigh'] == 'ENGLEWOOD')
    res = engine.execute(sel).fetchall()
    altered_value = res[0]['pri_neigh']
    # I changed Englewood to Englerwood :P
    self.assertEqual(altered_value, 'Englerwood')
def add_unique_hash(table_name):
    """Add an md5 hash column derived from all preexisting columns
    and remove duplicate rows from a table.

    :param table_name: Name of table to add hash to.
    """
    add_hash = '''
        DROP TABLE IF EXISTS temp;
        CREATE TABLE temp AS
            SELECT DISTINCT *,
                   md5(CAST(("{table_name}".*) AS text)) AS hash
            FROM "{table_name}";
        DROP TABLE "{table_name}";
        ALTER TABLE temp RENAME TO "{table_name}";
        ALTER TABLE "{table_name}" ADD PRIMARY KEY (hash);
    '''.format(table_name=table_name)

    try:
        engine.execute(add_hash)
    except Exception as e:
        raise PlenarioETLError(repr(e) +
                               '\nFailed to deduplicate with ' + add_hash)
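# Roughly how the hash values are derived, sketched with hashlib. The
# row-to-text string below only approximates Postgres's record-to-text
# cast, and the values are illustrative:
import hashlib

row_as_text = '(1,"2015-01-02",41.7915865543,-87.6495076896)'
print hashlib.md5(row_as_text).hexdigest()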
def callSign2Wban(call_sign):
    sql = "SELECT wban_code FROM weather_stations WHERE call_sign='%s'" % call_sign
    result = engine.execute(sql)
    x = result.first()
    if x:
        return x[0]
    else:
        return None
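# Round-trip sanity sketch for the two lookups above. The WBAN code is
# illustrative (94846 is Chicago O'Hare in NOAA's station list), and
# the result depends on what is loaded into weather_stations:
call_sign = wban2CallSign('94846')
if call_sign:
    assert callSign2Wban(call_sign) == '94846'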
def insert(self):
    """Join with the staging table
    to insert complete records into the existing table.
    """
    derived_cols = [c for c in self.table.c
                    if c.name in {'geom', 'point_date'}]
    staging_cols = [c for c in self.staging.c]
    sel_cols = staging_cols + derived_cols

    sel = select(sel_cols).where(self.staging.c.hash == self.table.c.hash)
    ins = self.existing.insert().from_select(sel_cols, sel)
    try:
        engine.execute(ins)
    except Exception as e:
        raise PlenarioETLError(repr(e) +
                               '\nFailed on statement: ' + str(ins))
    try:
        _null_malformed_geoms(self.existing)
    except Exception as e:
        raise PlenarioETLError(repr(e) +
                               '\nFailed to null out geoms with (0,0) geocoding')
def drop_meta(table_name):
    del_ = "DELETE FROM meta_master WHERE dataset_name = '{}';".format(table_name)
    app_engine.execute(del_)
def all_callSigns():
    sql = "SELECT call_sign FROM weather_stations ORDER BY call_sign"
    result = engine.execute(sql)
    return [x[0] for x in result.fetchall()]
def _add_trigger(self):
    add_trigger = """CREATE TRIGGER audit_after
                     AFTER DELETE OR UPDATE ON "{table}"
                     FOR EACH ROW EXECUTE PROCEDURE audit.if_modified()""".\
        format(table=self.dataset.name)
    engine.execute(add_trigger)
def drop_tables(table_names):
    drop_template = "DROP TABLE IF EXISTS {};"
    command = "".join([drop_template.format(table_name)
                       for table_name in table_names])
    engine.execute(command)
def _drop(self): engine.execute("DROP TABLE IF EXISTS {};".format(self.name))
def _drop(self): engine.execute("DROP TABLE IF EXISTS {};".format('s_' + self.dataset.name))
def _null_malformed_geoms(existing):
    # We decide to set the geom to NULL when the given lon/lat is (0,0)
    # (off the coast of Africa).
    upd = existing.update().values(geom=None).\
        where(existing.c.geom == select([func.ST_SetSRID(
            func.ST_MakePoint(0, 0), 4326)]))
    engine.execute(upd)
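# Roughly equivalent raw SQL for the update above ("roadworks" is an
# illustrative table name):
null_zero_geoms_sql = '''
    UPDATE "roadworks" SET geom = NULL
    WHERE geom = ST_SetSRID(ST_MakePoint(0, 0), 4326);
'''
# engine.execute(null_zero_geoms_sql)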
def _drop(self): engine.execute("DROP TABLE IF EXISTS {};" .format('s_' + self.dataset.name))