Exemple #1
0
class LmlApacheDirInput(ApacheDirInput):
    """
    RIVM LML version for ApacheDirInput: adds check for each file if it is already in DB.
    """
    def __init__(self, configdict, section, produces=FORMAT.record):
        ApacheDirInput.__init__(self, configdict, section, produces)
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Let superclass read file list from Apache URL
        ApacheDirInput.init(self)

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        return self.file_index == len(self.file_list) - 1

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.
        :param file_name:
        :return string or None:
        """
        if file_name is None or file_name == 'actueel.xml':
            return None

        # Populate and execute SELECT query for file_name
        query = self.query % file_name
        rowcount = self.db.execute(query)
        if rowcount > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present
        return file_name
class RawSensorInput(HttpInput):
    """
    Raw Sensor REST API (CityGIS) version for HttpInput: adds check for each file if it is already in DB.
    """
    def __init__(self, configdict, section, produces=FORMAT.record):
        HttpInput.__init__(self, configdict, section, produces)
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Let superclass read file list from Apache URL
        HttpInput.init(self)

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        return self.file_index == len(self.file_list) - 1

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.
        :param file_name:
        :return string or None:
        """
        if file_name is None or file_name == 'actueel.xml':
            return None

        # Populate and execute SELECT query for file_name
        query = self.query % file_name
        rowcount = self.db.execute(query)
        if rowcount > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present
        return file_name
Exemple #3
0
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key" is
    attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent each record needs to contain
    all values as an INSERT and UPDATE query template is built once for the columns
    in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'), ",".join(['%s' % k for k in record]), ",".join(["%s", ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE  %s = %s" % (
            self.cfg.get('table'), ",".join(['%s ' % k for k in record]), ",".join(["%s", ] * len(record.keys())), self.key, "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        res = 0
        if self.replace and self.key and self.key in record:

            # Replace option: try UPDATE if existing
            # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
            values = record.values()
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)
            # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key])
            # res = self.db.execute(del_query)

        if res < 1:
            # Do insert with values from the record dict
            # only if we did not do an UPDATE (res==0) on existing record.
            self.db.execute(self.query, record.values())
        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)

            # log.info('committed record key=%s' % record[self.key])

        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)

            log.info('committed %d records' % len(record))

        return packet
Exemple #4
0
class PostgresDbInput(SqlDbInput):
    """
    Input by querying records from a Postgres database.
    Input is a query, like SELECT * from mytable.
    Output is zero or more records as record array (array of dict) or single record (dict).

    produces=FORMAT.record_array (default) or FORMAT.record
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='localhost')
    def host(self):
        """
        host name or host IP-address, defaults to 'localhost'
        """
        pass

    @Config(ptype=str, required=False, default='5432')
    def port(self):
        """
        port for host, defaults to '5432'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def user(self):
        """
        User name, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def password(self):
        """
        User password, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='public')
    def schema(self):
        """
        The postgres schema name, defaults to 'public'
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section):
        SqlDbInput.__init__(self, configdict, section)
        self.db = None

    def init_columns(self):
        if self.columns is not None:
            # Already initialized, reset columns_names to re-init
            return

        if self.column_names is None:
            # If no explicit column names given, get all columns from DB meta info
            self.columns = self.db.get_column_names(self.cfg.get('table'),
                                                    self.cfg.get('schema'))
        else:
            # Columns provided: make list
            self.columns = self.column_names.split(',')

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()
        self.init_columns()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')

        self.db.disconnect()

    def raw_query(self, query_str):
        self.init_columns()

        self.db.execute(query_str)

        db_records = self.db.cursor.fetchall()
        log.info('read recs: %d' % len(db_records))

        return db_records
Exemple #5
0
class PostgresDbInput(SqlDbInput):
    """
    Input by querying records from a Postgres database.
    Input is a query, like SELECT * from mytable.
    Output is zero or more records as record array (array of dict) or single record (dict).

    produces=FORMAT.record_array (default) or FORMAT.record
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='localhost')
    def host(self):
        """
        host name or host IP-address, defaults to 'localhost'
        """
        pass

    @Config(ptype=str, required=False, default='5432')
    def port(self):
        """
        port for host, defaults to `'5432'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def user(self):
        """
        User name, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def password(self):
        """
        User password, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='public')
    def schema(self):
        """
        The postgres schema name, defaults to 'public'
        """
        pass
    # End attribute config meta

    def __init__(self, configdict, section):
        SqlDbInput.__init__(self, configdict, section, produces=[FORMAT.record_array, FORMAT.record])
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # If no explicit column names given, get from DB meta info
        self.columns = self.column_names
        if self.column_names is None:
            self.columns = self.db.get_column_names(self.cfg.get('table'), self.cfg.get('schema'))

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')

        self.db.disconnect()

    def raw_query(self, query_str):

        self.db.execute(query_str)

        db_records = self.db.cursor.fetchall()
        log.info('read recs: %d' % len(db_records))

        return db_records
Exemple #6
0
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key" is
    attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent each record needs to contain
    all values as an INSERT and UPDATE query template is built once for the columns
    in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self,
                          configdict,
                          section,
                          consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'), ",".join(['%s' % k
                                             for k in record]), ",".join([
                                                 "%s",
                                             ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE  %s = %s" % (self.cfg.get(
            'table'), ",".join(['%s ' % k for k in record]), ",".join([
                "%s",
            ] * len(record.keys())), self.key, "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        res = 0
        if self.replace and self.key and self.key in record:

            # Replace option: try UPDATE if existing
            # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
            values = record.values()
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)
            # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key])
            # res = self.db.execute(del_query)

        if res < 1:
            # Do insert with values from the record dict
            # only if we did not do an UPDATE (res==0) on existing record.
            self.db.execute(self.query, record.values())
        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)

            # log.info('committed record key=%s' % record[self.key])

        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)

            log.info('committed %d records' % len(record))

        return packet
class WeewxDbInput(SqliteDbInput):
    """
    Reads weewx raw archive records from SQLite.
    """
    def __init__(self, configdict, section):
        SqliteDbInput.__init__(self, configdict, section)
        self.progress_query = self.cfg.get('progress_query')
        self.progress_update = self.cfg.get('progress_update')

        # Connect only once to DB
        log.info('Init: connect to Postgres DB')
        self.progress_db = PostGIS(self.cfg.get_dict())
        self.progress_db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.progress_db.disconnect()

    def after_chain_invoke(self, packet):
        """
        Called right after entire Component Chain invoke.
        Used to update last id of processed file record.
        """
        # last_datetime.datetime.fromtimestamp(self.last_id).strftime('%Y-%m-%d %H:%M:%S')
        ts_local = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime(self.last_id))

        log.info('Updating progress table ts_unix=%d ts_local=%s' % (self.last_id, ts_local))
        self.progress_db.execute(self.progress_update % (self.last_id, ts_local))
        self.progress_db.commit(close=False)
        log.info('Update progress table ok')
        return True

    def read(self, packet):

        # Get last processed id of archive table
        self.progress_db.execute(self.progress_query)
        progress_rec = self.progress_db.cursor.fetchone()
        self.last_id = progress_rec[3]
        log.info('progress record: %s' % str(progress_rec))

        # Fetch next batch of archive records
        archive_recs = self.do_query(self.query % self.last_id)

        log.info('read archive_recs: %d' % len(archive_recs))

        # No more records to process?
        if len(archive_recs) == 0:
            packet.set_end_of_stream()
            log.info('Nothing to do. All file_records done')
            return packet

         # Remember last id processed for next query
        self.last_id = archive_recs[len(archive_recs)-1].get('dateTime')

        packet.data = archive_recs

        # Always stop after batch, otherwise we would continue forever
        packet.set_end_of_stream()

        return packet
Exemple #8
0
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting single record into Postgres database.
    Input is a record (Python dic structure) or a Python list of dicts (records).
    Creates an INSERT for Postgres to insert each single record.

    consumes=FORMAT.record
    """

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.db = None
        self.key = self.cfg.get('key')

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (self.cfg.get('table'), ",".join(['%s' % k for k in record]), ",".join(["%s",]*len(record.keys())))
        log.info('query is %s', query)
        return query

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create query once
        if self.query is None:
            self.query = self.create_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.db.execute(self.query, record.values())
            self.db.commit(close=False)

            # log.info('committed record key=%s' % record[self.key])

        elif type(record) is list:
                # Multiple records in list
                for rec in record:
                    # Do insert with values from the record
                    self.db.execute(self.query, rec.values())
                    self.db.commit(close=False)

                log.info('committed %d records' % len(record))

        return packet