class LmlApacheDirInput(ApacheDirInput):
    """
    RIVM LML flavour of ApacheDirInput: checks, per file in the Apache
    directory listing, whether it is already present in the database and
    skips it when it is.
    """

    def __init__(self, configdict, section, produces=FORMAT.record):
        ApacheDirInput.__init__(self, configdict, section, produces)
        # SELECT template used to test presence of a file_name in the DB
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Open a single DB connection for the lifetime of this component
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Superclass reads the file list from the Apache URL
        ApacheDirInput.init(self)

    def exit(self):
        # Close the DB connection on teardown
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        # Done once the index points at the final entry of the listing
        last_index = len(self.file_list) - 1
        return self.file_index == last_index

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.

        :param file_name: candidate file name from the Apache listing
        :return string or None: the name to process, or None to skip
        """
        # Skip empty entries and the rolling 'actueel.xml' file
        if file_name in (None, 'actueel.xml'):
            return None

        # Fill the SELECT template and count matching rows in the DB
        if self.db.execute(self.query % file_name) > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present: process it
        return file_name
class RawSensorInput(HttpInput):
    """
    Raw Sensor REST API (CityGIS) flavour of HttpInput: checks, per file,
    whether it is already present in the database and skips it when it is.
    """

    def __init__(self, configdict, section, produces=FORMAT.record):
        HttpInput.__init__(self, configdict, section, produces)
        # SELECT template used to test presence of a file_name in the DB
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Open a single DB connection for the lifetime of this component
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Superclass builds the file list (HttpInput, not an Apache dir)
        HttpInput.init(self)

    def exit(self):
        # Close the DB connection on teardown
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        # Done once the index points at the final entry of the listing
        last_index = len(self.file_list) - 1
        return self.file_index == last_index

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.

        :param file_name: candidate file name
        :return string or None: the name to process, or None to skip
        """
        # Skip empty entries and the rolling 'actueel.xml' file
        if file_name in (None, 'actueel.xml'):
            return None

        # Fill the SELECT template and count matching rows in the DB
        if self.db.execute(self.query % file_name) > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present: process it
        return file_name
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key"
    is attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent record needs to
    contain all values, as the INSERT and UPDATE query templates are built
    once, from the columns of the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None         # INSERT template, built from first record
        self.update_query = None  # UPDATE template, built when replace is set
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        """Build the parameterized INSERT template from the record's columns."""
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(['%s' % k for k in record]),
            ",".join(["%s", ] * len(record)))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        """Build the parameterized UPDATE template from the record's columns."""
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
            self.cfg.get('table'),
            ",".join(['%s ' % k for k in record]),
            ",".join(["%s", ] * len(record)),
            self.key,
            "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        """
        Write a single record dict: when "replace" is configured, try an
        UPDATE on the key first and only INSERT when no row was updated.
        """
        res = 0
        if self.replace and self.key and self.key in record:
            # Replace option: try UPDATE if existing.
            # FIX: list() is required - in Python 3 dict.values() returns a
            # view, which has no append() and is not accepted by psycopg2
            # as a parameter sequence.
            values = list(record.values())
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)

        if res < 1:
            # Do insert with values from the record dict only if we did
            # not do an UPDATE (res == 0) on an existing record.
            self.db.execute(self.query, list(record.values()))

        # FIX: commit outside the INSERT branch so an UPDATE-only replace
        # is also committed; keep the connection open for the next record.
        self.db.commit(close=False)

    def write(self, packet):
        """Write a packet carrying a single record dict or a list of dicts."""
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present
        # record is Python dict (single record) or list of dicts (multiple)
        record = packet.data

        # Query templates are built once, from the first record's columns
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)
        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
class PostgresDbInput(SqlDbInput):
    """
    Input by querying records from a Postgres database.
    Input is a query, like SELECT * from mytable.
    Output is zero or more records as record array (array of dict) or
    single record (dict).

    produces=FORMAT.record_array (default) or FORMAT.record
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='localhost')
    def host(self):
        """
        host name or host IP-address, defaults to 'localhost'
        """
        pass

    @Config(ptype=str, required=False, default='5432')
    def port(self):
        """
        port for host, defaults to '5432'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def user(self):
        """
        User name, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def password(self):
        """
        User password, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='public')
    def schema(self):
        """
        The postgres schema name, defaults to 'public'
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section):
        SqlDbInput.__init__(self, configdict, section)
        self.db = None

    def init_columns(self):
        # Runs once: a non-None self.columns means we are already set up
        if self.columns is not None:
            return

        if self.column_names is None:
            # No explicit column names configured: derive from DB meta info
            self.columns = self.db.get_column_names(
                self.cfg.get('table'), self.cfg.get('schema'))
        else:
            # Explicit comma-separated column list configured: split it
            self.columns = self.column_names.split(',')

    def init(self):
        # Single DB connection for the lifetime of this input
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()
        self.init_columns()

    def exit(self):
        # Close the DB connection on teardown
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def raw_query(self, query_str):
        """Execute query_str and return all resulting rows."""
        # Ensure the column list is initialized before fetching
        self.init_columns()

        self.db.execute(query_str)
        rows = self.db.cursor.fetchall()
        log.info('read recs: %d' % len(rows))
        return rows
class PostgresDbInput(SqlDbInput):
    """
    Input by querying records from a Postgres database.
    Input is a query, like SELECT * from mytable.
    Output is zero or more records as record array (array of dict) or
    single record (dict).

    produces=FORMAT.record_array (default) or FORMAT.record
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='localhost')
    def host(self):
        """
        host name or host IP-address, defaults to 'localhost'
        """
        pass

    @Config(ptype=str, required=False, default='5432')
    def port(self):
        """
        port for host, defaults to '5432'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def user(self):
        """
        User name, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def password(self):
        """
        User password, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='public')
    def schema(self):
        """
        The postgres schema name, defaults to 'public'
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section):
        SqlDbInput.__init__(self, configdict, section,
                            produces=[FORMAT.record_array, FORMAT.record])
        self.db = None

    def init(self):
        # Single DB connection for the lifetime of this input
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Use configured column names when given, else derive from DB meta
        if self.column_names is None:
            self.columns = self.db.get_column_names(
                self.cfg.get('table'), self.cfg.get('schema'))
        else:
            self.columns = self.column_names

    def exit(self):
        # Close the DB connection on teardown
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def raw_query(self, query_str):
        """Execute query_str and return all resulting rows."""
        self.db.execute(query_str)
        rows = self.db.cursor.fetchall()
        log.info('read recs: %d' % len(rows))
        return rows
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key"
    is attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent record needs to
    contain all values, as the INSERT and UPDATE query templates are built
    once, from the columns of the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None         # INSERT template, built from first record
        self.update_query = None  # UPDATE template, built when replace is set
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        """Build the parameterized INSERT template from the record's columns."""
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(['%s' % k for k in record]),
            ",".join(["%s", ] * len(record)))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        """Build the parameterized UPDATE template from the record's columns."""
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
            self.cfg.get('table'),
            ",".join(['%s ' % k for k in record]),
            ",".join(["%s", ] * len(record)),
            self.key,
            "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        """
        Write a single record dict: when "replace" is configured, try an
        UPDATE on the key first and only INSERT when no row was updated.
        """
        res = 0
        if self.replace and self.key and self.key in record:
            # Replace option: try UPDATE if existing.
            # FIX: list() is required - in Python 3 dict.values() returns a
            # view, which has no append() and is not accepted by psycopg2
            # as a parameter sequence.
            values = list(record.values())
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)

        if res < 1:
            # Do insert with values from the record dict only if we did
            # not do an UPDATE (res == 0) on an existing record.
            self.db.execute(self.query, list(record.values()))

        # FIX: commit outside the INSERT branch so an UPDATE-only replace
        # is also committed; keep the connection open for the next record.
        self.db.commit(close=False)

    def write(self, packet):
        """Write a packet carrying a single record dict or a list of dicts."""
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present
        # record is Python dict (single record) or list of dicts (multiple)
        record = packet.data

        # Query templates are built once, from the first record's columns
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)
        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
class WeewxDbInput(SqliteDbInput):
    """
    Reads weewx raw archive records from SQLite, tracking progress in a
    Postgres table so each run resumes after the last processed record.
    """

    def __init__(self, configdict, section):
        SqliteDbInput.__init__(self, configdict, section)
        # Query templates for reading and advancing the progress marker
        self.progress_query = self.cfg.get('progress_query')
        self.progress_update = self.cfg.get('progress_update')

        # Connect only once to DB
        log.info('Init: connect to Postgres DB')
        self.progress_db = PostGIS(self.cfg.get_dict())
        self.progress_db.connect()

    def exit(self):
        # Close the progress DB connection on teardown
        log.info('Exit: disconnect from DB')
        self.progress_db.disconnect()

    def after_chain_invoke(self, packet):
        """
        Called right after entire Component Chain invoke.
        Used to update last id of processed file record.
        """
        # Render last_id (a unix timestamp) as local time for the table
        ts_local = time.strftime("%Y-%m-%d %H:%M:%S %Z",
                                 time.localtime(self.last_id))

        log.info('Updating progress table ts_unix=%d ts_local=%s' % (self.last_id, ts_local))
        self.progress_db.execute(self.progress_update % (self.last_id, ts_local))
        self.progress_db.commit(close=False)
        log.info('Update progress table ok')
        return True

    def read(self, packet):
        """Read the next batch of archive records after the saved last id."""
        # Get last processed id of archive table
        self.progress_db.execute(self.progress_query)
        progress_rec = self.progress_db.cursor.fetchone()
        # NOTE(review): assumes column 3 of the progress row holds the
        # last-processed id - verify against the progress table schema.
        self.last_id = progress_rec[3]
        log.info('progress record: %s' % str(progress_rec))

        # Fetch next batch of archive records
        archive_recs = self.do_query(self.query % self.last_id)
        log.info('read archive_recs: %d' % len(archive_recs))

        # No more records to process?
        if not archive_recs:
            packet.set_end_of_stream()
            log.info('Nothing to do. All file_records done')
            return packet

        # Remember last id processed for next query
        self.last_id = archive_recs[-1].get('dateTime')

        packet.data = archive_recs

        # Always stop after a batch, otherwise we would continue forever
        packet.set_end_of_stream()
        return packet
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting single record into Postgres database.
    Input is a record (Python dict structure) or a Python list of dicts
    (records). Creates an INSERT for Postgres to insert each single record.

    consumes=FORMAT.record
    """

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None  # INSERT template, built once from first record
        self.db = None
        self.key = self.cfg.get('key')

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        """Build the parameterized INSERT template from the record's columns."""
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(['%s' % k for k in record]),
            ",".join(["%s", ] * len(record)))
        log.info('query is %s', query)
        return query

    def write(self, packet):
        """Write a packet carrying a single record dict or a list of dicts."""
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present
        # record is Python dict (single record) or list of dicts (multiple)
        record = packet.data

        # Generate INSERT query template once, from first record's columns
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create query once
        if self.query is None:
            self.query = self.create_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record.
            # FIX: list() is required - in Python 3 dict.values() returns a
            # view, which psycopg2 does not accept as a parameter sequence.
            self.db.execute(self.query, list(record.values()))
            self.db.commit(close=False)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record (FIX: list(), as above)
                self.db.execute(self.query, list(rec.values()))
                self.db.commit(close=False)
            log.info('committed %d records' % len(record))

        return packet