def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # Let superclass read file list from Apache URL
    HttpInput.init(self)
def pg_srs_constraint(self):
    log.info('set srs constraint')
    db = PostGIS(self.cfg.get_dict())
    srid = self.srid
    sql = "ALTER TABLE gml_objects DROP CONSTRAINT enforce_srid_gml_bounded_by;"
    db.tx_execute(sql)
    sql = "ALTER TABLE gml_objects ADD CONSTRAINT enforce_srid_gml_bounded_by CHECK (st_srid(gml_bounded_by) = (%s));" % srid
    db.tx_execute(sql)
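# Illustration only (not part of the method above): the two statements that
# pg_srs_constraint() issues, shown for a hypothetical srid of 28992.
srid = 28992
drop_sql = "ALTER TABLE gml_objects DROP CONSTRAINT enforce_srid_gml_bounded_by;"
add_sql = ("ALTER TABLE gml_objects ADD CONSTRAINT enforce_srid_gml_bounded_by "
           "CHECK (st_srid(gml_bounded_by) = (%s));" % srid)
# add_sql -> "... CHECK (st_srid(gml_bounded_by) = (28992));"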
def write(self, packet):
    if packet.data is None:
        return packet

    log.info('executing SQL')
    db = PostGIS(self.cfg.get_dict())
    rowcount = db.tx_execute(packet.data)
    log.info('executed SQL, rowcount=%d' % rowcount)
    return packet
def get_feature_types(self):
    log.info('reading all featuretypes from DB')
    db = PostGIS(self.cfg.get_dict())
    db.connect()
    sql = "SELECT id,qname FROM feature_types"
    db.execute(sql)
    cur = db.cursor
    for record in cur:
        self.feature_type_ids[record[1]] = record[0]
def __init__(self, configdict, section):
    SqliteDbInput.__init__(self, configdict, section)
    self.progress_query = self.cfg.get('progress_query')
    self.progress_update = self.cfg.get('progress_update')

    # Connect only once to DB
    log.info('Init: connect to Postgres DB')
    self.progress_db = PostGIS(self.cfg.get_dict())
    self.progress_db.connect()
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # If no explicit column names given, get from DB meta info
    self.columns = self.column_names
    if self.column_names is None:
        self.columns = self.db.get_column_names(self.cfg.get('table'), self.cfg.get('schema'))
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # Let superclass read file list from Apache URL
    ApacheDirInput.init(self)
class LmlApacheDirInput(ApacheDirInput):
    """
    RIVM LML version for ApacheDirInput: adds check for each file if it is already in DB.
    """

    def __init__(self, configdict, section, produces=FORMAT.record):
        ApacheDirInput.__init__(self, configdict, section, produces)
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Let superclass read file list from Apache URL
        ApacheDirInput.init(self)

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        return self.file_index == len(self.file_list) - 1

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.
        :param file_name:
        :return string or None:
        """
        if file_name is None or file_name == 'actueel.xml':
            return None

        # Populate and execute SELECT query for file_name
        query = self.query % file_name
        rowcount = self.db.execute(query)
        if rowcount > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present
        return file_name
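# Illustration only: a possible 'query' config value for LmlApacheDirInput and how
# filter_file() populates it. The table/column names (lml_files, file_name) and the
# example file name are assumptions, not taken from this class.
query = "SELECT file_name FROM lml_files WHERE file_name = '%s'"
file_name = 'LML_20170101_010000.xml'
populated = query % file_name
# -> "SELECT file_name FROM lml_files WHERE file_name = 'LML_20170101_010000.xml'"
# A rowcount > 0 from db.execute(populated) means the file was harvested before,
# so filter_file() returns None and the file is skipped.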
class RawSensorInput(HttpInput):
    """
    Raw Sensor REST API (CityGIS) version for HttpInput: adds check for each file if it is already in DB.
    """

    def __init__(self, configdict, section, produces=FORMAT.record):
        HttpInput.__init__(self, configdict, section, produces)
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Let superclass read file list from Apache URL
        HttpInput.init(self)

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        return self.file_index == len(self.file_list) - 1

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.
        :param file_name:
        :return string or None:
        """
        if file_name is None or file_name == 'actueel.xml':
            return None

        # Populate and execute SELECT query for file_name
        query = self.query % file_name
        rowcount = self.db.execute(query)
        if rowcount > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present
        return file_name
def init(self, config_dict):
    self.config_dict = config_dict
    self.process_name = config_dict['process_name']
    self.db = PostGIS(config_dict)
    self.db.connect()

    ids = dict()
    parameters = dict()
    models = dict()
    state = dict()

    # Query ANN Calibration Model and its State from DB for each calibrated sensor.
    if self.model_query is not None and len(self.sensor_model_names) > 0:
        log.info('Getting calibration models and state from database')
        for k in self.sensor_model_names:
            v = self.sensor_model_names[k]
            id, param, model = self.query_model(v)
            ids[k] = id
            parameters[k] = param
            models[k] = model

            model_state = self.query_state(id)
            state[k] = model_state
    else:
        log.info('No query for fetching calibration models given or no '
                 'mapping for calibration models to gas components given.')

    # Put Model and State info in the Device definitions.
    for k in ids:
        SENSOR_DEFS[k]['converter_model']['model_id'] = ids[k]
    for k in parameters:
        SENSOR_DEFS[k]['converter_model']['running_mean_weights'] = parameters[k]
    for k in models:
        SENSOR_DEFS[k]['converter_model']['mlp_regressor'] = models[k]
    for k, v in state.iteritems():
        for device_id, device_state in v.iteritems():
            for gas, state in device_state.iteritems():
                v[device_id][gas] = RunningMean.from_dict(state)
        SENSOR_DEFS[k]['converter_model']['state'] = v
class DeegreeBlobstoreInput(Input): """ Read features from deegree Blobstore DB into an etree doc. produces=FORMAT.etree_doc """ # Start attribute config meta @Config(ptype=int, required=False, default=10000) def max_features_per_doc(self): """ Max features to read from input feature GML stream per internal document. """ pass @Config(ptype=str, required=True, default=None) def start_container(self): """ Tag that starts container. """ pass @Config(ptype=str, required=True, default=None) def end_container(self): """ Tag that ends container. """ pass @Config(ptype=str, required=False, default=False) def start_feature_tag(self): """ XML tag that starts Feature. """ pass @Config(ptype=str, required=False, default=None) def end_feature_tag(self): """ XML tag that ends Feature. """ pass # End attribute config meta def __init__(self, configdict, section): Input.__init__(self, configdict, section, produces=FORMAT.etree_doc) self.cur_feature_blob = None self.rowcount = 0 # http://www.mkyong.com/regular-expressions/how-to-extract-html-links-with-regular-expression/ self.regex_xlink_href = re.compile( "\\s*(?i)xlink:href\\s*=\\s*(\"#([^\"]*\")|'#[^']*'|(#[^'\">\\s]+))" ) self.db = None self.xlink_db = None self.buffer = None self.feature_count = 0 # Reusable XML parser self.xml_parser = etree.XMLParser(remove_blank_text=True) def init(self): pass def read(self, packet): if packet.is_end_of_stream(): return packet if self.db is None: # First time read log.info("reading records from blobstore..") self.db = PostGIS(self.cfg.get_dict()) self.db.connect() sql = self.cfg.get('sql') self.rowcount = self.db.execute(sql) self.cur = self.db.cursor log.info("Read records rowcount=%d" % self.rowcount) # Init separate connection to fetch objects referenced by xlink:href self.xlink_db = PostGIS(self.cfg.get_dict()) self.xlink_db.connect() # Query active while self.cur is not None: if self.buffer is None: self.buffer = self.init_buf() self.buffer.write(self.start_container) # Get next blob record record = self.cur.fetchone() # End of all records if record is None: # End of records: start closing self.buffer.write(self.end_container) self.cur = None self.db.commit() # Only create doc if there are features in the buffer if self.feature_count > 0: self.buffer_to_doc(packet) packet.set_end_of_doc() break else: # New record: embed feature blob in feature tags and write to buffer feature_blob = self.write_feature(record) # If we have local xlinks: fetch the related features as well from the DB and # output them within the same document (local href resolvable) # TODO: in some cases we may need to be recursive (xlinks in xlinked features...) # First construct a single query for all xlinks xlink_sql = None for xlink in self.regex_xlink_href.finditer(feature_blob): gml_id = xlink.group(1).strip('"').strip('#') # We don't want multiple occurences of the same xlinked feature if gml_id in self.xlink_ids: continue self.xlink_ids.add(gml_id) if xlink_sql is None: xlink_sql = "SELECT binary_object from gml_objects where gml_id = '%s'" % gml_id else: xlink_sql += "OR gml_id = '%s'" % gml_id # Should we retrieve and write xlinked features? 
if xlink_sql is not None: # Fetch from DB self.xlink_db.execute(xlink_sql) while True: # Get next blob record xlink_record = self.xlink_db.cursor.fetchone() if xlink_record is None: break self.write_feature(xlink_record) # Should we output a doc if self.feature_count >= self.max_features_per_doc: # End of records: create XML doc self.buffer.write(self.end_container) self.buffer_to_doc(packet) break if self.cur is None: # All records handled: close off packet.set_end_of_stream() # log.info("[%s]" % packet.data) return packet def write_feature(self, record): feature_blob = str(record[0]) # Write start-tag, blob element, end-tag self.buffer.write(self.start_feature_tag) self.buffer.write(feature_blob) self.buffer.write(self.end_feature_tag) self.feature_count += 1 return feature_blob def init_buf(self): buffer = StringIO() buffer = codecs.getwriter("utf8")(buffer) self.feature_count = 0 self.xlink_ids = set() return buffer def buffer_to_doc(self, packet): # Process/transform data in buffer self.buffer.seek(0) try: packet.data = etree.parse(self.buffer, self.xml_parser) except Exception as e: bufStr = self.buffer.getvalue() if not bufStr: log.info("parse buffer empty: content=[%s]" % bufStr) else: log.error("error in buffer parsing %s" % str(e)) raise self.buffer.close() self.buffer = None
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()
    self.init_columns()
def write(self, packet):
    if packet.data is None:
        return packet

    gml_doc = packet.data
    log.info('inserting features in DB')
    db = PostGIS(self.cfg.get_dict())
    db.connect()

    # print self.to_string(gml_doc, False, False)
    # NS = {'base': 'urn:x-inspire:specification:gmlas:BaseTypes:3.2', 'gml': 'http://www.opengis.net/gml/3.2'}
    # featureMembers = gml_doc.xpath('//base:member/*', namespaces=NS)
    featureMembers = gml_doc.xpath("//*[local-name() = '%s']/*" % self.feature_member_tag)

    count = 0
    gml_ns = None
    for childNode in featureMembers:
        if gml_ns is None:
            if childNode.nsmap.has_key('gml'):
                gml_ns = childNode.nsmap['gml']
            else:
                if childNode.nsmap.has_key('GML'):
                    gml_ns = childNode.nsmap['GML']

        gml_id = childNode.get('{%s}id' % gml_ns)

        feature_type_id = self.feature_type_ids[childNode.tag]

        # Find a GML geometry in the GML NS
        ogrGeomWKT = None
        # gmlMembers = childNode.xpath(".//gml:Point|.//gml:Curve|.//gml:Surface|.//gml:MultiSurface", namespaces=NS)
        gmlMembers = childNode.xpath(
            ".//*[local-name() = 'Point']|.//*[local-name() = 'Polygon']|.//*[local-name() = 'Curve']|.//*[local-name() = 'Surface']|.//*[local-name() = 'MultiSurface']")
        geom_str = None
        for gmlMember in gmlMembers:
            if geom_str is None:
                geom_str = etree.tostring(gmlMember)
            # no need for GDAL Python bindings for now, maybe when we'll optimize with COPY iso INSERT
            # ogrGeom = ogr.CreateGeometryFromGML(str(gmlStr))
            # if ogrGeom is not None:
            #     ogrGeomWKT = ogrGeom.ExportToWkt()
            #     if ogrGeomWKT is not None:
            #         break

        blob = etree.tostring(childNode, pretty_print=False, xml_declaration=False, encoding='UTF-8')

        if geom_str is None:
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object) VALUES (%s, %s, %s)"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob))
        else:
            # ST_SetSRID(ST_GeomFromGML(%s)),-1)
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object, gml_bounded_by) VALUES (%s, %s, %s, ST_SetSRID( ST_GeomFromGML(%s),%s) )"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob), geom_str, self.srid)

        if db.execute(sql, parameters) == -1:
            log.error("feat num# = %d error inserting feature blob=%s (but continuing)" % (count, blob))
            # will fail but we will close connection also
            db.commit()

            # proceed...
            log.info('retrying to proceed with remaining features...')
            db = PostGIS(self.cfg.get_dict())
            db.connect()
            count = 0

        count += 1

    exception = db.commit()
    if exception is not None:
        log.error("error in commit")

    log.info("inserted %s features" % count)
    return packet
def init(self):
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # One time: get all device ids
    self.fetch_devices()
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key" is
    attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent record needs to contain
    all values, as the INSERT and UPDATE query templates are built once for the
    columns in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass
    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(['%s' % k for k in record]),
            ",".join(["%s", ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
            self.cfg.get('table'),
            ",".join(['%s ' % k for k in record]),
            ",".join(["%s", ] * len(record.keys())),
            self.key,
            "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        res = 0
        if self.replace and self.key and self.key in record:

            # Replace option: try UPDATE if existing
            # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
            values = record.values()
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)
            # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key])
            # res = self.db.execute(del_query)

        if res < 1:
            # Do insert with values from the record dict
            # only if we did not do an UPDATE (res==0) on existing record.
            self.db.execute(self.query, record.values())

        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
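# Illustration only: the query templates create_query()/create_update_query() build
# for a hypothetical record; the table name 'measurements' and key column 'unique_id'
# are assumptions for this sketch, not taken from the class above.
record = {'unique_id': 'dev1-20171117-11', 'device_id': 1, 'data': '{}'}
# create_query(record) (column order follows the dict's iteration order):
#   INSERT INTO measurements (unique_id,device_id,data) VALUES (%s,%s,%s)
# create_update_query(record) with key = 'unique_id':
#   UPDATE measurements SET (unique_id ,device_id ,data ) = (%s,%s,%s) WHERE unique_id = %s
# insert() then passes record.values() (plus the key value for the UPDATE) as query
# parameters, so the actual values are bound by the DB driver, not interpolated.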
def init(self):
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()
def init(self):
    InfluxDbInput.init(self)

    # PostGIS for tracking Harvesting progress.
    # Tracking is automatically updated via a TRIGGER (see db-schema-raw).
    postgis_cfg = {
        'host': self.pg_host,
        'port': self.pg_port,
        'database': self.pg_database,
        'user': self.pg_user,
        'password': self.pg_password,
        'schema': self.pg_schema
    }
    self.tracking_db = PostGIS(postgis_cfg)
    self.tracking_db.connect()

    # One time: get all measurements and related info and store in structure
    measurements = self.get_measurement_names()
    for measurement in measurements:

        # Optional mapping from MEASUREMENT name to a device id
        # Otherwise device_id is Measurement name
        device_id = measurement
        if self.meas_name_to_device_id:
            if measurement not in self.meas_name_to_device_id:
                log.warn('No device_id mapped for measurement (table) %s' % measurement)
                continue

            device_id = self.meas_name_to_device_id[measurement]

        date_start_s, start_ts = self.get_start_time(measurement)
        date_end_s, end_ts = self.get_end_time(measurement)
        start_ts = self.date_str_to_whole_hour_nanos(date_start_s)
        end_ts *= NANOS_FACTOR

        # Shift time for current_ts from progress table if already in progress
        # otherwise use start time of measurement.
        current_ts = start_ts
        row_count = self.tracking_db.execute(self.progress_query + device_id)
        if row_count > 0:
            # Already in progress
            progress_rec = self.tracking_db.cursor.fetchone()
            ymd_last = str(progress_rec[4])
            year_last = ymd_last[0:4]
            month_last = ymd_last[4:6]
            day_last = ymd_last[6:]
            hour_last = progress_rec[5]
            # e.g. 2017-11-17T11:00:00.411Z
            date_str = '%s-%s-%sT%d:00:00.000Z' % (year_last, month_last, day_last, hour_last - 1)
            current_ts = self.date_str_to_whole_hour_nanos(date_str)
            # skip to next hour
            # current_ts += (3600 * NANOS_FACTOR)

        # Store all info per device (measurement table) in list of dict
        self.measurements_info.append({
            'name': measurement,
            'date_start_s': date_start_s,
            'start_ts': start_ts,
            'date_end_s': date_end_s,
            'end_ts': end_ts,
            'current_ts': current_ts,
            'device_id': device_id
        })

    print("measurements_info: %s" % str(self.measurements_info))
class HarvesterInfluxDbInput(InfluxDbInput): """ InfluxDB TimeSeries (History) fetcher/formatter. Fetching all timeseries data from InfluxDB and putting these unaltered into recods e.g. for storing later in Postgres DB. This is a continuous process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algorithm: * fetch all Measurements (table names) * for each Measurement: * if Measurement (name) is not in progress-table insert and set day,hour to 0 * if in progress-table fetch entry (day, hour) * get timeseries (hours) available for that day * fetch and store each, starting with the last hour previously stored * ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) * stored entry: measurement, day, hour, json blob * finish: when all done or when max_proc_time_secs passed """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=True) def device_type(self): """ The station/device type, e.g. 'ase'. Required: False Default: None """ pass @Config(ptype=str, default=None, required=True) def device_version(self): """ The station/device version, e.g. '1'. Required: False Default: None """ pass @Config(ptype=dict, default=None, required=False) def meas_name_to_device_id(self): """ How to map InfluxDB Measurement (table) names to SE device id's. e.g. {'Geonovum1' : '1181001', 'RIVM2' : '1181002'} Required: False Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass @Config(ptype=str, required=False, default='localhost') def pg_host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def pg_port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=True) def pg_database(self): """ database name """ pass @Config(ptype=str, required=False, default='postgres') def pg_user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def pg_password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def pg_schema(self): """ The postgres schema name, defaults to 'public' """ pass def __init__(self, configdict, section): InfluxDbInput.__init__(self, configdict, section) self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.progress_query = "SELECT * from %s where device_id=" % self.progress_table self.measurements_info = [] self.index_m = -1 self.query = "SELECT * FROM %s WHERE time >= %d AND time < %d + 1h" self.tracking_db = None def init(self): InfluxDbInput.init(self) # PostGIS for tracking Harvesting progress. # Tracking is automatically updated via a TRIGGER (see db-schema-raw). 
postgis_cfg = { 'host': self.pg_host, 'port': self.pg_port, 'database': self.pg_database, 'user': self.pg_user, 'password': self.pg_password, 'schema': self.pg_schema } self.tracking_db = PostGIS(postgis_cfg) self.tracking_db.connect() # One time: get all measurements and related info and store in structure measurements = self.get_measurement_names() for measurement in measurements: # Optional mapping from MEASUREMENT name to a device id # Otherwise device_is is Measurement name device_id = measurement if self.meas_name_to_device_id: if measurement not in self.meas_name_to_device_id: log.warn('No device_id mapped for measurement (table) %s' % measurement) continue device_id = self.meas_name_to_device_id[measurement] date_start_s, start_ts = self.get_start_time(measurement) date_end_s, end_ts = self.get_end_time(measurement) start_ts = self.date_str_to_whole_hour_nanos(date_start_s) end_ts *= NANOS_FACTOR # Shift time for current_ts from progress table if already in progress # otherwise use start time of measurement. current_ts = start_ts row_count = self.tracking_db.execute(self.progress_query + device_id) if row_count > 0: # Already in progress progress_rec = self.tracking_db.cursor.fetchone() ymd_last = str(progress_rec[4]) year_last = ymd_last[0:4] month_last = ymd_last[4:6] day_last = ymd_last[6:] hour_last = progress_rec[5] # e.g. 2017-11-17T11:00:00.411Z date_str = '%s-%s-%sT%d:00:00.000Z' % (year_last, month_last, day_last, hour_last - 1) current_ts = self.date_str_to_whole_hour_nanos(date_str) # skip to next hour # current_ts += (3600 * NANOS_FACTOR) # Store all info per device (measurement table) in list of dict self.measurements_info.append({ 'name': measurement, 'date_start_s': date_start_s, 'start_ts': start_ts, 'date_end_s': date_end_s, 'end_ts': end_ts, 'current_ts': current_ts, 'device_id': device_id }) print("measurements_info: %s" % str(self.measurements_info)) def all_done(self): return len(self.measurements_info) == 0 def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def next_measurement_info(self): self.index_m += 1 return self.measurements_info[self.index_m % len(self.measurements_info)] def del_measurement_info(self): if not self.all_done(): del self.measurements_info[self.index_m % len(self.measurements_info)] def before_invoke(self, packet): if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False # def next_whole_hour_from_date(self, date): # date_s = self.query_db('SELECT FIRST(calibrated), time FROM %s' % measurement)[0]['time'] # return parser.parse(date_s) def date_str_to_whole_hour_nanos(self, date_str): """ COnvert URZ date time string to timestamp nanos on whole hour. 
:param date_str: :return: """ timestamp = self.date_str_to_ts_nanos(date_str) # print(timestamp) # Shift timestamp to next whole hour timestamp = (timestamp - (timestamp % 3600)) * NANOS_FACTOR # d = datetime.utcfromtimestamp(timestamp) # print('-> %s' % d.isoformat()) return timestamp def read(self, packet): measurement_info = self.next_measurement_info() current_ts_nanos = measurement_info['current_ts'] current_ts_secs = current_ts_nanos / NANOS_FACTOR query = self.query % (measurement_info['name'], current_ts_nanos, current_ts_nanos) data = self.query_db(query) if len(data) >= 1: d = datetime.utcfromtimestamp(current_ts_secs) day = d.strftime('%Y%m%d') hour = str(d.hour + 1).zfill(2) # DEBUG: store only first and last of hour-series data_first = {'time': data[0]['time']} data_last = {'time': data[len(data) - 1]['time']} # data_o = data # data = [data_first, data_last] # for i in range(0,4): # data.append(data_o[i]) record = self.format_data(measurement_info['device_id'], day, hour, data) packet.data = None if record['complete']: packet.data = record # Shift time an hour for this device current_ts_nanos = (current_ts_secs + 3600) * NANOS_FACTOR if current_ts_nanos > measurement_info['end_ts']: # all done for current measurement/device self.del_measurement_info() else: # Shift to next hour for this measurement measurement_info['current_ts'] = current_ts_nanos return packet # Create a data record for timeseries of current device/day/hour def format_data(self, device_id, day, hour, data): # # -- Map this to # CREATE TABLE smartem_raw.timeseries ( # gid serial, # unique_id character varying (16), # insert_time timestamp with time zone default current_timestamp, # device_id integer, # day integer, # hour integer, # data json, # complete boolean default false, # PRIMARY KEY (gid) # ); # Create record with JSON text blob with metadata record = dict() record['unique_id'] = '%s-%s-%s' % (device_id, day, hour) # Timestamp of sample record['device_id'] = device_id record['device_type'] = self.device_type record['device_version'] = self.device_version record['day'] = day record['hour'] = hour # Determine if hour is "complete" record['complete'] = False d = datetime.utcfromtimestamp(self.current_time_secs()) cur_day = int(d.strftime('%Y%m%d')) cur_hour = d.hour + 1 if cur_day > int(day) \ or (cur_day == int(day) and cur_hour > int(hour)): record['complete'] = True # Optional prefix for each param, usually sensor-box type e.g. "ase_" # if self.data_param_prefix: # for data_elm in data: # keys = data_elm.keys() # # https://stackoverflow.com/questions/4406501/change-the-name-of-a-key-in-dictionary # for key in keys: # data_elm[self.data_param_prefix + key] = data_elm.pop(key) # Add JSON text blob record['data'] = json.dumps({ 'id': device_id, 'date': day, 'hour': hour, 'timeseries': data }) return record
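# Illustration only: the whole-hour arithmetic used by date_str_to_whole_hour_nanos(),
# assuming date_str_to_ts_nanos() yields epoch seconds at this point (as the code above
# implies) and NANOS_FACTOR is nanoseconds per second.
NANOS_FACTOR = 1000 * 1000 * 1000
timestamp_secs = 1510917825                                 # 2017-11-17T11:23:45Z
whole_hour_secs = timestamp_secs - (timestamp_secs % 3600)  # 1510916400 == 11:00:00Z
whole_hour_nanos = whole_hour_secs * NANOS_FACTOR           # value used in the InfluxDB time-range query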
def init(self):
    InfluxDbInput.init(self)
    postgis_cfg = {
        'host': self.pg_host,
        'port': self.pg_port,
        'database': self.pg_database,
        'user': self.pg_user,
        'password': self.pg_password,
        'schema': self.pg_schema
    }
    self.db = PostGIS(postgis_cfg)
    self.db.connect()

    # One time: get all measurements and related info and store in structure
    self.measurements = self.query_db('SHOW MEASUREMENTS')
    for measurement in self.measurements:
        measurement_name = measurement['name']
        date_start_s = self.query_db(
            'SELECT FIRST(calibrated), time FROM %s' % measurement_name)[0]['time']
        start_ts = self.date_str_to_ts_nanos(date_start_s)
        date_end_s = self.query_db(
            'SELECT LAST(calibrated), time FROM %s' % measurement_name)[0]['time']
        end_ts = self.date_str_to_ts_nanos(date_end_s)
        device_id = measurement_name
        if self.meas_name_to_device_id:
            if measurement_name not in self.meas_name_to_device_id:
                log.error('No device_id mapped for measurement (table) %s' % measurement_name)
                raise Exception
            device_id = self.meas_name_to_device_id[measurement_name]

        # Shift time for current_ts from progress table if already in progress
        # otherwise use start time of measurement.
        current_ts = start_ts
        row_count = self.db.execute(self.progress_query + device_id)
        if row_count > 0:
            progress_rec = self.db.cursor.fetchone()
            ymd_last = str(progress_rec[4])
            year_last = ymd_last[0:4]
            month_last = ymd_last[4:6]
            day_last = ymd_last[6:]
            hour_last = progress_rec[5]
            # e.g. 2017-11-17T11:00:00.411Z
            date_str = '%s-%s-%sT%d:00:00.0Z' % (year_last, month_last, day_last, hour_last)
            current_ts = self.date_str_to_ts_nanos(date_str)
            # skip to next hour
            current_ts += (3600 * NANOS_FACTOR)

        # Store all info per device (measurement table) in list of dict
        self.measurements_info.append({
            'name': measurement_name,
            'date_start_s': date_start_s,
            'start_ts': start_ts,
            'date_end_s': date_end_s,
            'end_ts': end_ts,
            'current_ts': current_ts,
            'device_id': device_id
        })

    print(str(self.measurements_info))
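# Illustration only: how the method above resumes from a hypothetical progress
# record where progress_rec[4] = 20171117 (yyyymmdd) and progress_rec[5] = 11 (hour).
ymd_last, hour_last = '20171117', 11
date_str = '%s-%s-%sT%d:00:00.0Z' % (ymd_last[0:4], ymd_last[4:6], ymd_last[6:], hour_last)
# -> '2017-11-17T11:00:00.0Z'; converted to nanos and shifted by 3600 * NANOS_FACTOR
# so harvesting continues at the next whole hour after the last stored one.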
class RawSensorTimeseriesInput(RawSensorAPIInput): """ Raw Sensor REST API (CityGIS and Intemo servers) TimeSeries (History) fetcher/formatter. Fetching all timeseries data via the Raw Sensor API (RSA) from CityGIS server and putting these unaltered into Postgres DB. This is a continuus process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algoritm: - fetch all (sensor) devices from RSA - for each device: - if device is not in progress-table insert and set day,hour to 0 - if in progress-table fetch entry (day, hour) - get timeseries (hours) available for that day - fetch and store each, starting with the last hour perviously stored - ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) - stored entry: device_id, day, hour, last_flag, json blob - finish: when all done or when max_proc_time_secs passed """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass def __init__(self, configdict, section, produces=FORMAT.record_array): RawSensorAPIInput.__init__(self, configdict, section, produces) # keep track of root base REST URL self.url = None self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.days = [] self.days_idx = -1 self.day = -1 self.day_last = -1 self.hours = [] self.hours_idx = -1 self.hour = -1 self.hour_last = -1 self.db = None self.progress_query = "SELECT * from %s where device_id=" % self.progress_table def init(self): self.db = PostGIS(self.cfg.get_dict()) self.db.connect() # One time: get all device ids self.fetch_devices() # Pick a first device id # self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) def all_done(self): if self.device_ids_idx < 0 and self.days_idx < 0 and self.hours_idx < 0: return True return False def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def fetch_ts_days(self): self.days_idx = -1 self.days = [] self.day = -1 if self.device_id < 0: return ts_days_url = self.base_url + '/devices/%d/timeseries' % self.device_id log.info('Init: fetching timeseries days list from URL: "%s" ...' 
% ts_days_url) json_str = self.read_from_url(ts_days_url) json_obj = self.parse_json_str(json_str) # Typical entry is: "/sensors/v1/devices/8/timeseries/20160404" # cut of last days_raw = json_obj['days'] row_count = self.db.execute(self.progress_query + str(self.device_id)) self.day_last = -1 self.hour_last = -1 if row_count > 0: progress_rec = self.db.cursor.fetchone() self.day_last = progress_rec[4] self.hour_last = progress_rec[5] # Take a subset of all days: namely those still to be processed # Always include the last/current day as it may not be complete for d in days_raw: day = int(d.split('/')[-1]) if day >= self.day_last: self.days.append(day) if len(self.days) > 0: self.days_idx = 0 log.info('Device: %d, raw days: %d, days=%d, day_last=%d, hour_last=%d' % (self.device_id, len(days_raw), len(self.days), self.day_last, self.hour_last)) def fetch_ts_hours(self): self.hours_idx = -1 self.hours = [] self.hour = None if self.device_id == -1 or self.day == -1: return # 2016-10-30 08:12:09,921 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:12:09,922 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:12:10,789 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) # 2016-10-30 08:26:59,172 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:26:59,172 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:26:59,807 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:26:59,808 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030/8 # 2016-10-30 10:37:30,010 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:37:30,170 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:37:30,170 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:37:30,525 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:37:30,525 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-10 (it is still sampling current hour 9) # 2016-10-30 10:47:17,095 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:47:17,095 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:47:17,511 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:47:17,511 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/10 # 2016-10-30 10:57:12,325 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:57:12,524 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=10 # 2016-10-30 10:57:12,524 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... 
# 2016-10-30 10:57:12,952 RawSensorAPI INFO 0 processable hours for device 71 day 20161030 # 2016-10-30 12:29:11,534 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/11 cur_day=20161030 cur_hour=11 # 2016-10-30 12:29:13,177 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-12 (it is still sampling current hour 11) ts_hours_url = self.base_url + '/devices/%d/timeseries/%d' % (self.device_id, self.day) log.info('Init: fetching timeseries hours list from URL: "%s" ...' % ts_hours_url) # Set the next "last values" URL for device and increment to next json_str = self.read_from_url(ts_hours_url) json_obj = self.parse_json_str(json_str) hours_all = json_obj['hours'] # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() for h in hours_all: hour = int(h) if self.day > self.day_last or (self.day == self.day_last and hour > self.hour_last): if self.day_last == current_day and hour - 1 >= current_hour: # never append the last hour of today log.info('Skip current hour from %d to %d for device %d on day %d' % (hour-1, hour, self.device_id, self.day)) else: self.hours.append(hour) if len(self.hours) > 0: self.hours_idx = 0 log.info('processable hours for device %d day %d: %s' % (self.device_id, self.day, str(self.hours))) def next_day(self): # All days for current device done? Try next device if self.day == -1: self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) # If not yet all devices done fetch days current device if self.device_id > -1: self.fetch_ts_days() self.day, self.days_idx = self.next_entry(self.days, self.days_idx) def next_hour(self): # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) while self.hour < 0: # Pick a next day entry self.day, self.days_idx = self.next_entry(self.days, self.days_idx) if self.day < 0: self.next_day() if self.day > -1: self.fetch_ts_hours() if self.device_id < 0: log.info('Processing all devices done') break # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) def get_current_day_hour(self): # Get the current day and hour in UTC current_time = time.gmtime() current_day = int(time.strftime('%Y%m%d', current_time)) current_hour = int(time.strftime('%H',current_time)) return current_day, current_hour def before_invoke(self, packet): """ Called just before Component invoke. """ # Try to fill in: should point to next hour timeseries REST URL self.url = None if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False self.next_hour() # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() # Skip harvesting the current hour as it will not yet be complete, so try the next device, hour # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) skips = 0 while self.day == current_day and (self.hour - 1) == current_hour and not self.all_done(): skips += 1 log.info('Skip #%d: device-day-hour: %d-%d-%d (still sampling current hour %d)' % (skips, self.device_id, self.day, self.hour, current_hour)) # Force to skip to next device, sometimes we have an even later hour self.next_hour() # 30.okt.16: Fix for #24 #25 gaps in data: because next_hour() may jump to next device and unconditionally fetch current hour... 
# so fix is to use while loop until a valid hour available or we are all done # Still hours? if self.hour > 0: # The base method read() will fetch self.url until it is set to None # <base_url>/devices/14/timeseries/20160603/18 self.url = self.base_url + '/devices/%d/timeseries/%d/%d' % (self.device_id, self.day, self.hour) log.info('self.url = %s cur_day=%d cur_hour=%d' % (self.url, current_day, current_hour)) if self.device_id < 0: log.info('Processing all devices done') return True # ASSERT : still device(s) to be done get next hour to process return True # Create a data record for timeseries of current device/day/hour def format_data(self, data): # # -- Map this to # CREATE TABLE smartem_raw.timeseries ( # gid serial, # unique_id character varying (16), # insert_time timestamp with time zone default current_timestamp, # device_id integer, # day integer, # hour integer, # data json, # complete boolean default false, # PRIMARY KEY (gid) # ); # Create record with JSON text blob with metadata record = dict() record['unique_id'] = '%d-%d-%d' % (self.device_id, self.day, self.hour) # Timestamp of sample record['device_id'] = self.device_id record['day'] = self.day record['hour'] = self.hour # Add JSON text blob record['data'] = data return record
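# Illustration only: the record produced by format_data() above for the example
# device/day/hour shown in the URL comment (device 14, day 20160603, hour 18).
example_record = {
    'unique_id': '14-20160603-18',  # '%d-%d-%d' % (device_id, day, hour)
    'device_id': 14,
    'day': 20160603,
    'hour': 18,
    'data': '{"...": "raw timeseries JSON blob from the Raw Sensor API"}',
}
# Maps one-to-one onto the smartem_raw.timeseries columns listed in the comment block;
# gid, insert_time and complete are filled in by the database defaults.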
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting single record into Postgres database.
    Input is a record (Python dict structure) or a Python list of dicts (records).
    Creates an INSERT for Postgres to insert each single record.

    consumes=FORMAT.record
    """

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.db = None
        self.key = self.cfg.get('key')

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(['%s' % k for k in record]),
            ",".join(["%s", ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create query once
        if self.query is None:
            self.query = self.create_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.db.execute(self.query, record.values())
            self.db.commit(close=False)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.db.execute(self.query, rec.values())
                self.db.commit(close=False)
            log.info('committed %d records' % len(record))

        return packet
class Josene(Device): def __init__(self): Device.__init__(self, 'jose') self.model_query = "SELECT id,parameters,model from calibration_models WHERE predicts = '%s' AND invalid = FALSE ORDER BY timestamp DESC LIMIT 1" self.state_query = "SELECT state from calibration_state WHERE process = '%s' AND model_id = %d ORDER BY timestamp DESC LIMIT 1" self.state_insert = "INSERT INTO calibration_state (process, model_id, state) VALUES ('%s', %d, '%s')" self.sensor_model_names = { 'co': 'carbon_monoxide__air_', 'no2': 'nitrogen_dioxide__air_', 'o3': 'ozone__air_' } self.config_dict = None def init(self, config_dict): self.config_dict = config_dict self.process_name = config_dict['process_name'] self.db = PostGIS(config_dict) self.db.connect() ids = dict() parameters = dict() models = dict() state = dict() # Query ANN Calibration Model and its State from DB for each calibrated sensor. if self.model_query is not None and len(self.sensor_model_names) > 0: log.info('Getting calibration models and state from database') for k in self.sensor_model_names: v = self.sensor_model_names[k] id, param, model = self.query_model(v) ids[k] = id parameters[k] = param models[k] = model model_state = self.query_state(id) state[k] = model_state else: log.info('No query for fetching calibration models given or no ' 'mapping for calibration models to gas components given.') # Put Model and State info in the Device definitions. for k in ids: SENSOR_DEFS[k]['converter_model']['model_id'] = ids[k] for k in parameters: SENSOR_DEFS[k]['converter_model']['running_mean_weights'] = parameters[k] for k in models: SENSOR_DEFS[k]['converter_model']['mlp_regressor'] = models[k] for k, v in state.iteritems(): for device_id, device_state in v.iteritems(): for gas, state in device_state.iteritems(): v[device_id][gas] = RunningMean.from_dict(state) SENSOR_DEFS[k]['converter_model']['state'] = v def exit(self): # Save the calibration state. 
for k in self.sensor_model_names: model = SENSOR_DEFS[k]['converter_model'] self.save_state(model['model_id'], json.dumps(model['state'])) self.db.commit(close=False) def get_sensor_defs(self): return SENSOR_DEFS def raw_query(self, query_str): self.db.execute(query_str) db_records = self.db.cursor.fetchall() log.info('read recs: %d' % len(db_records)) return db_records def query_model(self, name): query = self.model_query % name log.info('Getting calibration model with query: %s' % query) ret = self.raw_query(query) if len(ret) > 0: id, parameters, model = ret[0] return id, parameters, pickle.loads(model) else: log.warn("No model found for %s" % name) return None, {}, {} def query_state(self, model_id): query = self.state_query % (self.process_name, model_id) log.info('Getting calibration model state with query: %s' % query) ret = self.raw_query(query) if len(ret) > 0: return ret[0][0] else: log.warn("No state found for model_id=%d" % model_id) return {} def save_state(self, model_id, state): insert_query = self.state_insert % (self.process_name, model_id, state) log.info('Inserting calibration model state for process %s model_id=%d' % (self.process_name, model_id)) ret = self.db.execute(insert_query) if ret != 1: log.warn('Cannot save state for process %s model_id=%d' % (self.process_name, model_id)) # Get raw sensor value or list of values def get_raw_value(self, name, val_dict): val = None if type(name) is list: name = name[0] return self.get_raw_value(name, val_dict) # name is list of names # for n in name: # if n in val_dict: # if val is None: # val = [] # val.append(val_dict[n]) else: # name is single name if name in val_dict: val = val_dict[name] if 'audio' in name: # We may have audio encoded in 3 bands bands = [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)] val = bands[0] return val, name # Check for valid sensor value def check_value(self, name, val_dict, value=None): val = None if type(name) is list: # name is list of names for n in name: result, reason = self.check_value(n, val_dict, value) if result is False: return result, reason else: # name is single name if name not in val_dict and value is None: return False, '%s not present' % name else: if value is not None: val = value else: val = val_dict[name] if val is None: return False, '%s is None' % name if name not in SENSOR_DEFS: return False, '%s not in SENSOR_DEFS' % name name_def = SENSOR_DEFS[name] # Audio inputs: need to unpack 3 bands and check for decibel vals if 'audio' in name: bands = [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)] # determine validity of these 3 bands dbMin = name_def['min'] dbMax = name_def['max'] err_cnt = 0 msg = '' for i in range(0, len(bands)): band_val = bands[i] # accumulate outliers if band_val < dbMin: err_cnt +=1 msg += '%s: val(%s) < min(%s)\n' % (name, str(band_val), str(name_def['min'])) elif band_val > dbMax: err_cnt +=1 msg += '%s: val(%s) > max(%s)\n' % (name, str(band_val), str(name_def['max'])) # Only invalid if all bands outside range if err_cnt >= len(bands): return False, msg return True, '%s OK' % name if 'min' in name_def and val < name_def['min']: return False, '%s: val(%s) < min(%s)' % (name, str(val), str(name_def['min'])) if 'max' in name_def and val > name_def['max']: return False, '%s: val(%s) > max(%s)' % (name, str(val), str(name_def['max'])) return True, '%s OK' % name # Get location as lon, lat def get_lon_lat(self, val_dict): result = (None, None) if 's_longitude' in val_dict and 's_latitude' in val_dict: lon = 
SENSOR_DEFS['longitude']['converter'](val_dict['s_longitude'])
            lat = SENSOR_DEFS['latitude']['converter'](val_dict['s_latitude'])

            valid, reason = self.check_value('latitude', val_dict, value=lat)
            if not valid:
                return result

            valid, reason = self.check_value('longitude', val_dict, value=lon)
            if not valid:
                return result

            result = (lon, lat)

        return result
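The audio handling in get_raw_value() and check_value() unpacks one packed integer into three band levels, one byte per band. A minimal standalone sketch of that decoding (the helper name unpack_audio_bands is illustrative, not part of the Device API):

def unpack_audio_bands(packed_val):
    # One byte per band, as in get_raw_value()/check_value() above:
    # bits 0-7, 8-15 and 16-23.
    return [float(packed_val & 255),
            float((packed_val >> 8) & 255),
            float((packed_val >> 16) & 255)]

# Example: 0x2F3C48 decodes to [72.0, 60.0, 47.0]
print(unpack_audio_bands(0x2F3C48))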
def delete_features(self):
    log.info('deleting ALL features in DB')
    db = PostGIS(self.cfg.get_dict())
    db.tx_execute("TRUNCATE gml_objects")
class ProgressTracker(Filter): """" Filter to track progress of a stream of processed records. Stores progress (last id, last timestamp etc) in Postgres table. """ @Config(ptype=str, required=False, default='localhost') def host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=False, default='postgres') def user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def schema(self): """ The postgres schema name, defaults to 'public' """ pass @Config(ptype=str, required=False, default='progress') def table(self): """ Table name, defaults to 'progress'. """ pass @Config(ptype=str, required=True) def progress_update_query(self): """ Query to update progress Required: True Default: "" """ pass @Config(ptype=str, required=True) def id_key(self): """ Key to select id from record array Required: True """ @Config(ptype=str, default=None, required=False) def name_key(self): """ Key to select name from record array Required: True """ def __init__(self, config_dict, section): Filter.__init__(self, config_dict, section, consumes=[FORMAT.record_array, FORMAT.record], produces=[FORMAT.record_array, FORMAT.record]) self.last_ids = None self.db = None def init(self): self.db = PostGIS(self.cfg.get_dict()) self.db.connect() def invoke(self, packet): self.last_ids = dict() if packet.data is None or packet.is_end_of_doc() or packet.is_end_of_stream(): log.info("No packet data or end of doc/stream") return packet record_in = packet.data if type(record_in) is not list: record_in = [record_in] for record in record_in: if self.name_key is not None: name = record[self.name_key] else: name = "all" if len(record) > 0: new = record[self.id_key] self.last_ids[name] = max(self.last_ids.get(name, -1), new) log.info("Last ids are: %s", str(self.last_ids)) return packet def after_chain_invoke(self, packet): """ Called right after entire Component Chain invoke. Used to update last id of processed file record. """ for name in self.last_ids: param_tuple = (self.last_ids[name], name) log.info('Updating progress table with (id=%d, name=%s)' % param_tuple) self.db.execute(self.progress_update_query % param_tuple) self.db.commit(close=False) log.info('Update progress table ok') else: log.info('No update for progress table') return True
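The filter only remembers the highest id seen per name and writes it back after the chain has run. A hedged illustration of that bookkeeping; the progress table layout, the UPDATE text and the id_key/name_key values are examples, not prescribed by the component (the real statement comes from progress_update_query in the ETL config):

# Hypothetical config value for a progress table with columns (name, last_id):
progress_update_query = "UPDATE progress SET last_id = %d WHERE name = '%s'"

records = [
    {'device': 'station_1', 'gid': 10},
    {'device': 'station_2', 'gid': 7},
    {'device': 'station_1', 'gid': 42},
]

# Same bookkeeping as invoke(): track the max id per name (id_key='gid', name_key='device')
last_ids = {}
for rec in records:
    last_ids[rec['device']] = max(last_ids.get(rec['device'], -1), rec['gid'])

# after_chain_invoke() then issues one UPDATE per name
for name, last_id in last_ids.items():
    print(progress_update_query % (last_id, name))
# UPDATE progress SET last_id = 42 WHERE name = 'station_1'
# UPDATE progress SET last_id = 7 WHERE name = 'station_2'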
class PostgresDbInput(SqlDbInput): """ Input by querying records from a Postgres database. Input is a query, like SELECT * from mytable. Output is zero or more records as record array (array of dict) or single record (dict). produces=FORMAT.record_array (default) or FORMAT.record """ # Start attribute config meta @Config(ptype=str, required=False, default='localhost') def host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def port(self): """ port for host, defaults to `'5432' """ pass @Config(ptype=str, required=False, default='postgres') def user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def schema(self): """ The postgres schema name, defaults to 'public' """ pass # End attribute config meta def __init__(self, configdict, section): SqlDbInput.__init__(self, configdict, section, produces=[FORMAT.record_array, FORMAT.record]) self.db = None def init(self): # Connect only once to DB log.info('Init: connect to DB') self.db = PostGIS(self.cfg.get_dict()) self.db.connect() # If no explicit column names given, get from DB meta info self.columns = self.column_names if self.column_names is None: self.columns = self.db.get_column_names(self.cfg.get('table'), self.cfg.get('schema')) def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.db.disconnect() def raw_query(self, query_str): self.db.execute(query_str) db_records = self.db.cursor.fetchall() log.info('read recs: %d' % len(db_records)) return db_records
class RawSensorTimeseriesInput(RawSensorAPIInput): """ Raw Sensor REST API (CityGIS) to fetch (harvest) all timeseries for all devices. """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass """ Raw Sensor REST API (CityGIS) TimeSeries (History) fetcher/formatter. Fetching all timeseries data via the Raw Sensor API (RSA) from CityGIS server and putting these unaltered into Postgres DB. This is a continuus process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algoritm: - fetch all (sensor) devices from RSA - for each device: - if device is not in progress-table insert and set day,hour to 0 - if in progress-table fetch entry (day, hour) - get timeseries (hours) available for that day - fetch and store each, starting with the last hour perviously stored - ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) - stored entry: device_id, day, hour, last_flag, json blob - finish: when all done or when max_proc_time_secs passed """ def __init__(self, configdict, section, produces=FORMAT.record_array): RawSensorAPIInput.__init__(self, configdict, section, produces) # keep track of root base REST URL self.url = None self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.days = [] self.days_idx = -1 self.day = -1 self.day_last = -1 self.hours = [] self.hours_idx = -1 self.hour = -1 self.hour_last = -1 self.db = None self.progress_query = "SELECT * from %s where device_id=" % self.progress_table def init(self): self.db = PostGIS(self.cfg.get_dict()) self.db.connect() # One time: get all device ids self.fetch_devices() # Pick a first device id # self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) def all_done(self): if self.device_ids_idx < 0 and self.days_idx < 0 and self.hours_idx < 0: return True return False def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def fetch_ts_days(self): self.days_idx = -1 self.days = [] self.day = -1 if self.device_id < 0: return ts_days_url = self.base_url + '/devices/%d/timeseries' % self.device_id log.info('Init: fetching timeseries days list from URL: "%s" ...' 
% ts_days_url) json_str = self.read_from_url(ts_days_url) json_obj = self.parse_json_str(json_str) # Typical entry is: "/sensors/v1/devices/8/timeseries/20160404" # cut of last days_raw = json_obj['days'] row_count = self.db.execute(self.progress_query + str(self.device_id)) self.day_last = -1 self.hour_last = -1 if row_count > 0: progress_rec = self.db.cursor.fetchone() self.day_last = progress_rec[4] self.hour_last = progress_rec[5] # Take a subset of all days: namely those still to be processed # Always include the last/current day as it may not be complete for d in days_raw: day = int(d.split('/')[-1]) if day >= self.day_last: self.days.append(day) if len(self.days) > 0: self.days_idx = 0 log.info('Device: %d, raw days: %d, days=%d, day_last=%d, hour_last=%d' % (self.device_id, len(days_raw), len(self.days), self.day_last, self.hour_last)) def fetch_ts_hours(self): self.hours_idx = -1 self.hours = [] self.hour = None if self.device_id == -1 or self.day == -1: return # 2016-10-30 08:12:09,921 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:12:09,922 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:12:10,789 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) # 2016-10-30 08:26:59,172 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:26:59,172 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:26:59,807 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:26:59,808 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030/8 # 2016-10-30 10:37:30,010 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:37:30,170 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:37:30,170 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:37:30,525 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:37:30,525 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-10 (it is still sampling current hour 9) # 2016-10-30 10:47:17,095 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:47:17,095 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:47:17,511 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:47:17,511 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/10 # 2016-10-30 10:57:12,325 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:57:12,524 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=10 # 2016-10-30 10:57:12,524 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... 
# 2016-10-30 10:57:12,952 RawSensorAPI INFO 0 processable hours for device 71 day 20161030 # 2016-10-30 12:29:11,534 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/11 cur_day=20161030 cur_hour=11 # 2016-10-30 12:29:13,177 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-12 (it is still sampling current hour 11) ts_hours_url = self.base_url + '/devices/%d/timeseries/%d' % (self.device_id, self.day) log.info('Init: fetching timeseries hours list from URL: "%s" ...' % ts_hours_url) # Set the next "last values" URL for device and increment to next json_str = self.read_from_url(ts_hours_url) json_obj = self.parse_json_str(json_str) hours_all = json_obj['hours'] # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() for h in hours_all: hour = int(h) if self.day > self.day_last or (self.day == self.day_last and hour > self.hour_last): if self.day_last == current_day and hour - 1 >= current_hour: # never append the last hour of today log.info('Skip current hour from %d to %d for device %d on day %d' % (hour-1, hour, self.device_id, self.day)) else: self.hours.append(hour) if len(self.hours) > 0: self.hours_idx = 0 log.info('processable hours for device %d day %d: %s' % (self.device_id, self.day, str(self.hours))) def next_day(self): # All days for current device done? Try next device if self.day == -1: self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) # If not yet all devices done fetch days current device if self.device_id > -1: self.fetch_ts_days() self.day, self.days_idx = self.next_entry(self.days, self.days_idx) def next_hour(self): # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) while self.hour < 0: # Pick a next day entry self.day, self.days_idx = self.next_entry(self.days, self.days_idx) if self.day < 0: self.next_day() if self.day > -1: self.fetch_ts_hours() if self.device_id < 0: log.info('Processing all devices done') break # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) def get_current_day_hour(self): # Get the current day and hour in UTC current_time = time.gmtime() current_day = int(time.strftime('%Y%m%d', current_time)) current_hour = int(time.strftime('%H',current_time)) return current_day, current_hour def before_invoke(self, packet): """ Called just before Component invoke. """ # Try to fill in: should point to next hour timeseries REST URL self.url = None if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False self.next_hour() # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() # Skip harvesting the current hour as it will not yet be complete, so try the next device, hour # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) skips = 0 while self.day == current_day and (self.hour - 1) == current_hour and not self.all_done(): skips += 1 log.info('Skip #%d: device-day-hour: %d-%d-%d (still sampling current hour %d)' % (skips, self.device_id, self.day, self.hour, current_hour)) # Force to skip to next device, sometimes we have an even later hour self.next_hour() # 30.okt.16: Fix for #24 #25 gaps in data: because next_hour() may jump to next device and unconditionally fetch current hour... 
        # so the fix is to use a while loop until a valid hour is available or we are all done

        # Still hours?
        if self.hour > 0:
            # The base method read() will fetch self.url until it is set to None
            # <base_url>/devices/14/timeseries/20160603/18
            self.url = self.base_url + '/devices/%d/timeseries/%d/%d' % (self.device_id, self.day, self.hour)
            log.info('self.url = %s cur_day=%d cur_hour=%d' % (self.url, current_day, current_hour))

        if self.device_id < 0:
            log.info('Processing all devices done')
            return True

        # ASSERT: still device(s) to be done, get next hour to process
        return True

    # Create a data record for timeseries of current device/day/hour
    def format_data(self, data):
        #
        # -- Map this to
        # CREATE TABLE smartem_raw.timeseries (
        #   gid serial,
        #   unique_id character varying (16),
        #   insert_time timestamp with time zone default current_timestamp,
        #   device_id integer,
        #   day integer,
        #   hour integer,
        #   data json,
        #   complete boolean default false,
        #   PRIMARY KEY (gid)
        # );

        # Create record with JSON text blob with metadata
        record = dict()
        record['unique_id'] = '%d-%d-%d' % (self.device_id, self.day, self.hour)

        # Timestamp of sample
        record['device_id'] = self.device_id
        record['day'] = self.day
        record['hour'] = self.hour

        # Add JSON text blob
        record['data'] = data

        return record
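The day/hour iteration above leans on next_entry(), which is inherited and not shown here. The sketch below captures the behaviour the loops assume (hand out the next list element plus its index, or (-1, -1) once the list is exhausted); this is an assumption about the base class, not its actual code:

def next_entry(entry_list, entry_idx):
    # Assumed contract: advance the index; signal exhaustion with (-1, -1)
    # so callers can test 'entry < 0' as done in next_day()/next_hour().
    entry_idx += 1
    if entry_idx >= len(entry_list):
        return -1, -1
    return entry_list[entry_idx], entry_idx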
class PostgresDbInput(SqlDbInput): """ Input by querying records from a Postgres database. Input is a query, like SELECT * from mytable. Output is zero or more records as record array (array of dict) or single record (dict). produces=FORMAT.record_array (default) or FORMAT.record """ # Start attribute config meta @Config(ptype=str, required=False, default='localhost') def host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=False, default='postgres') def user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def schema(self): """ The postgres schema name, defaults to 'public' """ pass # End attribute config meta def __init__(self, configdict, section): SqlDbInput.__init__(self, configdict, section) self.db = None def init_columns(self): if self.columns is not None: # Already initialized, reset columns_names to re-init return if self.column_names is None: # If no explicit column names given, get all columns from DB meta info self.columns = self.db.get_column_names(self.cfg.get('table'), self.cfg.get('schema')) else: # Columns provided: make list self.columns = self.column_names.split(',') def init(self): # Connect only once to DB log.info('Init: connect to DB') self.db = PostGIS(self.cfg.get_dict()) self.db.connect() self.init_columns() def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.db.disconnect() def raw_query(self, query_str): self.init_columns() self.db.execute(query_str) db_records = self.db.cursor.fetchall() log.info('read recs: %d' % len(db_records)) return db_records
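raw_query() returns bare tuples; the column list resolved by init_columns() is what lets the base class turn them into the record dicts this Input produces. A hedged sketch of that combination with example values (the actual assembly lives in SqlDbInput, which is not shown here):

columns = ['gid', 'device_id', 'day', 'hour']          # as resolved by init_columns()
rows = [(1, 55, 20161030, 7), (2, 55, 20161030, 8)]    # as returned by raw_query()

records = [dict(zip(columns, row)) for row in rows]
# [{'gid': 1, 'device_id': 55, 'day': 20161030, 'hour': 7}, ...]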
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()
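For reference, a minimal sketch of the kind of settings dict cfg.get_dict() is expected to hand to PostGIS here, using the defaults documented on the Config attributes in this file; the database name is an example value, not a default:

postgis_cfg = {
    'host': 'localhost',
    'port': '5432',
    'database': 'gis',        # example value
    'user': 'postgres',
    'password': 'postgres',
    'schema': 'public',
}
# db = PostGIS(postgis_cfg)
# db.connect()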
class WeewxDbInput(SqliteDbInput): """ Reads weewx raw archive records from SQLite. """ def __init__(self, configdict, section): SqliteDbInput.__init__(self, configdict, section) self.progress_query = self.cfg.get('progress_query') self.progress_update = self.cfg.get('progress_update') # Connect only once to DB log.info('Init: connect to Postgres DB') self.progress_db = PostGIS(self.cfg.get_dict()) self.progress_db.connect() def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.progress_db.disconnect() def after_chain_invoke(self, packet): """ Called right after entire Component Chain invoke. Used to update last id of processed file record. """ # last_datetime.datetime.fromtimestamp(self.last_id).strftime('%Y-%m-%d %H:%M:%S') ts_local = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime(self.last_id)) log.info('Updating progress table ts_unix=%d ts_local=%s' % (self.last_id, ts_local)) self.progress_db.execute(self.progress_update % (self.last_id, ts_local)) self.progress_db.commit(close=False) log.info('Update progress table ok') return True def read(self, packet): # Get last processed id of archive table self.progress_db.execute(self.progress_query) progress_rec = self.progress_db.cursor.fetchone() self.last_id = progress_rec[3] log.info('progress record: %s' % str(progress_rec)) # Fetch next batch of archive records archive_recs = self.do_query(self.query % self.last_id) log.info('read archive_recs: %d' % len(archive_recs)) # No more records to process? if len(archive_recs) == 0: packet.set_end_of_stream() log.info('Nothing to do. All file_records done') return packet # Remember last id processed for next query self.last_id = archive_recs[len(archive_recs)-1].get('dateTime') packet.data = archive_recs # Always stop after batch, otherwise we would continue forever packet.set_end_of_stream() return packet
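The progress bookkeeping above expects two statements from the ETL config: a SELECT whose fourth column (index 3) is the last processed weewx dateTime, and an UPDATE taking (ts_unix, ts_local). The statements below are hypothetical examples of that shape, not values shipped with the component:

progress_query = "SELECT id, source, notes, last_time FROM weewx_progress WHERE source = 'weewx'"
progress_update = "UPDATE weewx_progress SET last_time = %d, last_time_str = '%s' WHERE source = 'weewx'"
query = "SELECT * FROM archive WHERE dateTime > %d ORDER BY dateTime LIMIT 1000"

# read() and after_chain_invoke() use them as:
#   progress_rec[3]                                  -> self.last_id
#   self.query % self.last_id                        -> next batch of archive records
#   self.progress_update % (self.last_id, ts_local)  -> progress table update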
class PostgresInsertOutput(PostgresDbOutput): """ Output by inserting a single record in a Postgres database table. Input is a Stetl record (Python dict structure) or a list of records. Creates an INSERT for Postgres to insert each single record. When the "replace" parameter is True, any existing record keyed by "key" is attempted to be UPDATEd first. NB a constraint is that the first and each subsequent each record needs to contain all values as an INSERT and UPDATE query template is built once for the columns in the first record. consumes=[FORMAT.record_array, FORMAT.record] """ # Start attribute config meta @Config(ptype=str, required=False, default='public') def table(self): """ Table for inserts. """ pass @Config(ptype=bool, required=False, default=False) def replace(self): """ Replace record if exists? """ pass @Config(ptype=str, required=False, default=None) def key(self): """ The key column name of the table, required when replacing records. """ pass # End attribute config meta def __init__(self, configdict, section, consumes=FORMAT.record): DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record]) self.query = None self.update_query = None self.db = None def init(self): # Connect only once to DB log.info('Init: connect to DB') self.db = PostGIS(self.cfg.get_dict()) self.db.connect() def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.db.disconnect() def create_query(self, record): # We assume that all records do the same INSERT key/values # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s) query = "INSERT INTO %s (%s) VALUES (%s)" % ( self.cfg.get('table'), ",".join(['%s' % k for k in record]), ",".join(["%s", ] * len(record.keys()))) log.info('query is %s', query) return query def create_update_query(self, record): # We assume that all records do the same UPDATE key/values # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838 # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3; query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % ( self.cfg.get('table'), ",".join(['%s ' % k for k in record]), ",".join(["%s", ] * len(record.keys())), self.key, "%s") log.info('update query is %s', query) return query def insert(self, record): res = 0 if self.replace and self.key and self.key in record: # Replace option: try UPDATE if existing # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838 values = record.values() values.append(record[self.key]) res = self.db.execute(self.update_query, values) # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key]) # res = self.db.execute(del_query) if res < 1: # Do insert with values from the record dict # only if we did not do an UPDATE (res==0) on existing record. 
            self.db.execute(self.query, record.values())

        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)
        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
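Following create_query() and create_update_query() above, this is what the two templates come out as for a small sample record; the table name 'lml_files' and key 'file_name' are example values:

record = {'file_name': 'x.xml', 'file_data': '<gml/>'}

insert_tmpl = "INSERT INTO %s (%s) VALUES (%s)" % (
    'lml_files',
    ",".join(['%s' % k for k in record]),
    ",".join(["%s"] * len(record)))
# -> INSERT INTO lml_files (file_name,file_data) VALUES (%s,%s)

update_tmpl = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
    'lml_files',
    ",".join(['%s ' % k for k in record]),
    ",".join(["%s"] * len(record)),
    'file_name', "%s")
# -> UPDATE lml_files SET (file_name ,file_data ) = (%s,%s) WHERE file_name = %s

# insert() then passes record.values() (plus the key value for the UPDATE)
# as query parameters, so the DB driver handles quoting.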
class HarvesterInfluxDbInput(InfluxDbInput): """ InfluxDB TimeSeries (History) fetcher/formatter. Fetching all timeseries data from InfluxDB and putting these unaltered into Postgres DB. This is a continuus process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algoritm: - fetch all Measurements (table names) - for each Measurement: - if Measurement (name) is not in progress-table insert and set day,hour to 0 - if in progress-table fetch entry (day, hour) - get timeseries (hours) available for that day - fetch and store each, starting with the last hour previously stored - ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) - stored entry: measurement, day, hour, json blob - finish: when all done or when max_proc_time_secs passed """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=False) def data_param_prefix(self): """ The prefix string to place before each parameter name in data, e.g. 'ase_'. Required: False Default: None """ pass @Config(ptype=dict, default=None, required=False) def meas_name_to_device_id(self): """ How to map InfluxDB Measurement names to SE device id's. e.g. {'Geonovum1' : '1181001', 'RIVM2' : '1181002'} Required: False Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass @Config(ptype=str, required=False, default='localhost') def pg_host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def pg_port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=True) def pg_database(self): """ database name """ pass @Config(ptype=str, required=False, default='postgres') def pg_user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def pg_password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def pg_schema(self): """ The postgres schema name, defaults to 'public' """ pass def __init__(self, configdict, section): InfluxDbInput.__init__(self, configdict, section) self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.progress_query = "SELECT * from %s where device_id=" % self.progress_table self.measurements = None self.measurements_info = [] self.index_m = -1 self.query = "SELECT * FROM %s WHERE time >= %d AND time < %d + 1h" def init(self): InfluxDbInput.init(self) postgis_cfg = { 'host': self.pg_host, 'port': self.pg_port, 'database': self.pg_database, 'user': self.pg_user, 'password': self.pg_password, 'schema': self.pg_schema } self.db = PostGIS(postgis_cfg) self.db.connect() # One time: get all measurements and related info and store in structure self.measurements = self.query_db('SHOW MEASUREMENTS') for measurement in self.measurements: measurement_name = measurement['name'] date_start_s = self.query_db( 'SELECT FIRST(calibrated), time FROM %s' % measurement_name)[0]['time'] start_ts = self.date_str_to_ts_nanos(date_start_s) date_end_s = self.query_db( 'SELECT LAST(calibrated), time FROM %s' % measurement_name)[0]['time'] end_ts = 
self.date_str_to_ts_nanos(date_end_s) device_id = measurement_name if self.meas_name_to_device_id: if measurement_name not in self.meas_name_to_device_id: log.error( 'No device_id mapped for measurement (table) %s' % measurement_name) raise Exception device_id = self.meas_name_to_device_id[measurement_name] # Shift time for current_ts from progress table if already in progress # otherwise use start time of measurement. current_ts = start_ts row_count = self.db.execute(self.progress_query + device_id) if row_count > 0: progress_rec = self.db.cursor.fetchone() ymd_last = str(progress_rec[4]) year_last = ymd_last[0:4] month_last = ymd_last[4:6] day_last = ymd_last[6:] hour_last = progress_rec[5] # e.g. 2017-11-17T11:00:00.411Z date_str = '%s-%s-%sT%d:00:00.0Z' % (year_last, month_last, day_last, hour_last) current_ts = self.date_str_to_ts_nanos(date_str) # skip to next hour current_ts += (3600 * NANOS_FACTOR) # Store all info per device (measurement table) in list of dict self.measurements_info.append({ 'name': measurement_name, 'date_start_s': date_start_s, 'start_ts': start_ts, 'date_end_s': date_end_s, 'end_ts': end_ts, 'current_ts': current_ts, 'device_id': device_id }) print(str(self.measurements_info)) def all_done(self): return len(self.measurements_info) == 0 def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def next_measurement_info(self): self.index_m += 1 return self.measurements_info[self.index_m % len(self.measurements_info)] def del_measurement_info(self): if not self.all_done(): del self.measurements_info[self.index_m % len(self.measurements_info)] def before_invoke(self, packet): if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False def date_str_to_ts_nanos(self, date_str): # See https://aboutsimon.com/blog/2013/06/06/Datetime-hell-Time-zone-aware-to-UNIX-timestamp.html # e.g. 
2017-11-17T11:00:00.411Z timestamp = timegm( time.strptime(date_str.replace('Z', 'GMT'), '%Y-%m-%dT%H:%M:%S.%f%Z')) # print(timestamp) # Shift timestamp to next whole hour timestamp = (timestamp - (timestamp % 3600) + 3600) * NANOS_FACTOR # d = datetime.utcfromtimestamp(timestamp) # print('-> %s' % d.isoformat()) return timestamp # def next_whole_hour_from_date(self, date): # date_s = self.query_db('SELECT FIRST(calibrated), time FROM %s' % measurement)[0]['time'] # return parser.parse(date_s) def read(self, packet): measurement_info = self.next_measurement_info() current_ts_nanos = measurement_info['current_ts'] current_ts_secs = current_ts_nanos / NANOS_FACTOR query = self.query % (measurement_info['name'], current_ts_nanos, current_ts_nanos) data = self.query_db(query) if len(data) >= 1: d = datetime.utcfromtimestamp(current_ts_secs) day = '%d%d%d' % (d.year, d.month, d.day) hour = '%d' % (d.hour + 1) # DEBUG: store only first and last of hour-series # data_first = data[0] # data_last = data[len(data)-1] data_o = data data = [] for i in range(0, 24): data.append(data_o[i]) # data.append(data_first) # data.append(data_last) packet.data = self.format_data(measurement_info['device_id'], day, hour, data) # Shift time an hour for this device current_ts_nanos = (current_ts_secs + 3600) * NANOS_FACTOR if current_ts_nanos > measurement_info['end_ts']: # all done for current measurement/device self.del_measurement_info() else: # Shift to next hour for this measurement measurement_info['current_ts'] = current_ts_nanos return packet # Create a data record for timeseries of current device/day/hour def format_data(self, device_id, day, hour, data): # # -- Map this to # CREATE TABLE smartem_raw.timeseries ( # gid serial, # unique_id character varying (16), # insert_time timestamp with time zone default current_timestamp, # device_id integer, # day integer, # hour integer, # data json, # complete boolean default false, # PRIMARY KEY (gid) # ); # Create record with JSON text blob with metadata record = dict() record['unique_id'] = '%s-%s-%s' % (device_id, day, hour) # Timestamp of sample record['device_id'] = device_id record['day'] = day record['hour'] = hour # Optional prefix for each param, usually sensor-box type e.g. "ase_" if self.data_param_prefix: for data_elm in data: keys = data_elm.keys() # https://stackoverflow.com/questions/4406501/change-the-name-of-a-key-in-dictionary for key in keys: data_elm[self.data_param_prefix + key] = data_elm.pop(key) # Add JSON text blob record['data'] = json.dumps({ 'id': device_id, 'date': day, 'hour': hour, 'timeseries': data }) return record
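As a worked example of date_str_to_ts_nanos() above: the InfluxDB UTC string is parsed with timegm/strptime and then shifted up to the next whole hour before scaling to nanoseconds. NANOS_FACTOR is assumed to be 10**9; it is defined elsewhere in the module:

from calendar import timegm
import time

NANOS_FACTOR = 10 ** 9  # assumed value

def to_next_hour_nanos(date_str):
    # Same steps as date_str_to_ts_nanos(): parse the UTC string,
    # round up to the next whole hour, convert to nanoseconds.
    timestamp = timegm(time.strptime(date_str.replace('Z', 'GMT'),
                                     '%Y-%m-%dT%H:%M:%S.%f%Z'))
    return (timestamp - (timestamp % 3600) + 3600) * NANOS_FACTOR

# '2017-11-17T11:00:00.411Z' -> epoch of 2017-11-17T12:00:00Z in nanoseconds
print(to_next_hour_nanos('2017-11-17T11:00:00.411Z'))  # 1510920000000000000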