def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # Let superclass read file list from Apache URL
    HttpInput.init(self)
def pg_srs_constraint(self):
    log.info('set srs constraint')
    db = PostGIS(self.cfg.get_dict())
    srid = self.srid
    sql = "ALTER TABLE gml_objects DROP CONSTRAINT enforce_srid_gml_bounded_by;"
    db.tx_execute(sql)
    sql = "ALTER TABLE gml_objects ADD CONSTRAINT enforce_srid_gml_bounded_by CHECK (st_srid(gml_bounded_by) = (%s));" % srid
    db.tx_execute(sql)
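# Illustration only (not part of the method above): the two statements that
# pg_srs_constraint() issues, shown for a hypothetical srid of 28992.
srid = 28992
drop_sql = "ALTER TABLE gml_objects DROP CONSTRAINT enforce_srid_gml_bounded_by;"
add_sql = ("ALTER TABLE gml_objects ADD CONSTRAINT enforce_srid_gml_bounded_by "
           "CHECK (st_srid(gml_bounded_by) = (%s));" % srid)
# add_sql -> "... CHECK (st_srid(gml_bounded_by) = (28992));"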
def write(self, packet):
    if packet.data is None:
        return packet

    log.info('executing SQL')
    db = PostGIS(self.cfg.get_dict())
    rowcount = db.tx_execute(packet.data)
    log.info('executed SQL, rowcount=%d' % rowcount)
    return packet
def get_feature_types(self):
    log.info('reading all featuretypes from DB')
    db = PostGIS(self.cfg.get_dict())
    db.connect()
    sql = "SELECT id,qname FROM feature_types"
    db.execute(sql)
    cur = db.cursor
    for record in cur:
        self.feature_type_ids[record[1]] = record[0]
def __init__(self, configdict, section):
    SqliteDbInput.__init__(self, configdict, section)
    self.progress_query = self.cfg.get('progress_query')
    self.progress_update = self.cfg.get('progress_update')

    # Connect only once to DB
    log.info('Init: connect to Postgres DB')
    self.progress_db = PostGIS(self.cfg.get_dict())
    self.progress_db.connect()
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # If no explicit column names given, get from DB meta info
    self.columns = self.column_names
    if self.column_names is None:
        self.columns = self.db.get_column_names(self.cfg.get('table'), self.cfg.get('schema'))
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # Let superclass read file list from Apache URL
    ApacheDirInput.init(self)
class LmlApacheDirInput(ApacheDirInput):
    """
    RIVM LML version for ApacheDirInput: adds check for each file if it is already in DB.
    """

    def __init__(self, configdict, section, produces=FORMAT.record):
        ApacheDirInput.__init__(self, configdict, section, produces)
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Let superclass read file list from Apache URL
        ApacheDirInput.init(self)

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        return self.file_index == len(self.file_list) - 1

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.
        :param file_name:
        :return string or None:
        """
        if file_name is None or file_name == 'actueel.xml':
            return None

        # Populate and execute SELECT query for file_name
        query = self.query % file_name
        rowcount = self.db.execute(query)
        if rowcount > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present
        return file_name
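# Illustration only: a possible 'query' config value for LmlApacheDirInput and how
# filter_file() populates it. The table/column names (lml_files, file_name) and the
# example file name are assumptions, not taken from this class.
query = "SELECT file_name FROM lml_files WHERE file_name = '%s'"
file_name = 'LML_20170101_010000.xml'
populated = query % file_name
# -> "SELECT file_name FROM lml_files WHERE file_name = 'LML_20170101_010000.xml'"
# A rowcount > 0 from db.execute(populated) means the file was harvested before,
# so filter_file() returns None and the file is skipped.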
class RawSensorInput(HttpInput):
    """
    Raw Sensor REST API (CityGIS) version for HttpInput: adds check for each file if it is already in DB.
    """

    def __init__(self, configdict, section, produces=FORMAT.record):
        HttpInput.__init__(self, configdict, section, produces)
        self.query = self.cfg.get('query')
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

        # Let superclass read file list from Apache URL
        HttpInput.init(self)

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def no_more_files(self):
        return self.file_index == len(self.file_list) - 1

    def filter_file(self, file_name):
        """
        Filter the file_name, e.g. to suppress reading if already present in DB.
        :param file_name:
        :return string or None:
        """
        if file_name is None or file_name == 'actueel.xml':
            return None

        # Populate and execute SELECT query for file_name
        query = self.query % file_name
        rowcount = self.db.execute(query)
        if rowcount > 0:
            log.info('file %s already present' % file_name)
            return None

        # Not yet present
        return file_name
def init(self, config_dict):
    self.config_dict = config_dict
    self.process_name = config_dict['process_name']
    self.db = PostGIS(config_dict)
    self.db.connect()

    ids = dict()
    parameters = dict()
    models = dict()
    state = dict()

    # Query ANN Calibration Model and its State from DB for each calibrated sensor.
    if self.model_query is not None and len(self.sensor_model_names) > 0:
        log.info('Getting calibration models and state from database')
        for k in self.sensor_model_names:
            v = self.sensor_model_names[k]
            id, param, model = self.query_model(v)
            ids[k] = id
            parameters[k] = param
            models[k] = model

            model_state = self.query_state(id)
            state[k] = model_state
    else:
        log.info('No query for fetching calibration models given or no '
                 'mapping for calibration models to gas components given.')

    # Put Model and State info in the Device definitions.
    for k in ids:
        SENSOR_DEFS[k]['converter_model']['model_id'] = ids[k]
    for k in parameters:
        SENSOR_DEFS[k]['converter_model']['running_mean_weights'] = parameters[k]
    for k in models:
        SENSOR_DEFS[k]['converter_model']['mlp_regressor'] = models[k]
    for k, v in state.iteritems():
        for device_id, device_state in v.iteritems():
            for gas, state in device_state.iteritems():
                v[device_id][gas] = RunningMean.from_dict(state)
        SENSOR_DEFS[k]['converter_model']['state'] = v
class DeegreeBlobstoreInput(Input): """ Read features from deegree Blobstore DB into an etree doc. produces=FORMAT.etree_doc """ # Start attribute config meta @Config(ptype=int, required=False, default=10000) def max_features_per_doc(self): """ Max features to read from input feature GML stream per internal document. """ pass @Config(ptype=str, required=True, default=None) def start_container(self): """ Tag that starts container. """ pass @Config(ptype=str, required=True, default=None) def end_container(self): """ Tag that ends container. """ pass @Config(ptype=str, required=False, default=False) def start_feature_tag(self): """ XML tag that starts Feature. """ pass @Config(ptype=str, required=False, default=None) def end_feature_tag(self): """ XML tag that ends Feature. """ pass # End attribute config meta def __init__(self, configdict, section): Input.__init__(self, configdict, section, produces=FORMAT.etree_doc) self.cur_feature_blob = None self.rowcount = 0 # http://www.mkyong.com/regular-expressions/how-to-extract-html-links-with-regular-expression/ self.regex_xlink_href = re.compile( "\\s*(?i)xlink:href\\s*=\\s*(\"#([^\"]*\")|'#[^']*'|(#[^'\">\\s]+))" ) self.db = None self.xlink_db = None self.buffer = None self.feature_count = 0 # Reusable XML parser self.xml_parser = etree.XMLParser(remove_blank_text=True) def init(self): pass def read(self, packet): if packet.is_end_of_stream(): return packet if self.db is None: # First time read log.info("reading records from blobstore..") self.db = PostGIS(self.cfg.get_dict()) self.db.connect() sql = self.cfg.get('sql') self.rowcount = self.db.execute(sql) self.cur = self.db.cursor log.info("Read records rowcount=%d" % self.rowcount) # Init separate connection to fetch objects referenced by xlink:href self.xlink_db = PostGIS(self.cfg.get_dict()) self.xlink_db.connect() # Query active while self.cur is not None: if self.buffer is None: self.buffer = self.init_buf() self.buffer.write(self.start_container) # Get next blob record record = self.cur.fetchone() # End of all records if record is None: # End of records: start closing self.buffer.write(self.end_container) self.cur = None self.db.commit() # Only create doc if there are features in the buffer if self.feature_count > 0: self.buffer_to_doc(packet) packet.set_end_of_doc() break else: # New record: embed feature blob in feature tags and write to buffer feature_blob = self.write_feature(record) # If we have local xlinks: fetch the related features as well from the DB and # output them within the same document (local href resolvable) # TODO: in some cases we may need to be recursive (xlinks in xlinked features...) # First construct a single query for all xlinks xlink_sql = None for xlink in self.regex_xlink_href.finditer(feature_blob): gml_id = xlink.group(1).strip('"').strip('#') # We don't want multiple occurences of the same xlinked feature if gml_id in self.xlink_ids: continue self.xlink_ids.add(gml_id) if xlink_sql is None: xlink_sql = "SELECT binary_object from gml_objects where gml_id = '%s'" % gml_id else: xlink_sql += "OR gml_id = '%s'" % gml_id # Should we retrieve and write xlinked features? 
if xlink_sql is not None: # Fetch from DB self.xlink_db.execute(xlink_sql) while True: # Get next blob record xlink_record = self.xlink_db.cursor.fetchone() if xlink_record is None: break self.write_feature(xlink_record) # Should we output a doc if self.feature_count >= self.max_features_per_doc: # End of records: create XML doc self.buffer.write(self.end_container) self.buffer_to_doc(packet) break if self.cur is None: # All records handled: close off packet.set_end_of_stream() # log.info("[%s]" % packet.data) return packet def write_feature(self, record): feature_blob = str(record[0]) # Write start-tag, blob element, end-tag self.buffer.write(self.start_feature_tag) self.buffer.write(feature_blob) self.buffer.write(self.end_feature_tag) self.feature_count += 1 return feature_blob def init_buf(self): buffer = StringIO() buffer = codecs.getwriter("utf8")(buffer) self.feature_count = 0 self.xlink_ids = set() return buffer def buffer_to_doc(self, packet): # Process/transform data in buffer self.buffer.seek(0) try: packet.data = etree.parse(self.buffer, self.xml_parser) except Exception as e: bufStr = self.buffer.getvalue() if not bufStr: log.info("parse buffer empty: content=[%s]" % bufStr) else: log.error("error in buffer parsing %s" % str(e)) raise self.buffer.close() self.buffer = None
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()
    self.init_columns()
def write(self, packet):
    if packet.data is None:
        return packet

    gml_doc = packet.data
    log.info('inserting features in DB')
    db = PostGIS(self.cfg.get_dict())
    db.connect()

    # print self.to_string(gml_doc, False, False)
    # NS = {'base': 'urn:x-inspire:specification:gmlas:BaseTypes:3.2', 'gml': 'http://www.opengis.net/gml/3.2'}
    # featureMembers = gml_doc.xpath('//base:member/*', namespaces=NS)
    featureMembers = gml_doc.xpath("//*[local-name() = '%s']/*" % self.feature_member_tag)

    count = 0
    gml_ns = None
    for childNode in featureMembers:
        if gml_ns is None:
            if childNode.nsmap.has_key('gml'):
                gml_ns = childNode.nsmap['gml']
            else:
                if childNode.nsmap.has_key('GML'):
                    gml_ns = childNode.nsmap['GML']

        gml_id = childNode.get('{%s}id' % gml_ns)

        feature_type_id = self.feature_type_ids[childNode.tag]

        # Find a GML geometry in the GML NS
        ogrGeomWKT = None
        # gmlMembers = childNode.xpath(".//gml:Point|.//gml:Curve|.//gml:Surface|.//gml:MultiSurface", namespaces=NS)
        gmlMembers = childNode.xpath(
            ".//*[local-name() = 'Point']|.//*[local-name() = 'Polygon']|.//*[local-name() = 'Curve']|.//*[local-name() = 'Surface']|.//*[local-name() = 'MultiSurface']")
        geom_str = None
        for gmlMember in gmlMembers:
            if geom_str is None:
                geom_str = etree.tostring(gmlMember)
            # no need for GDAL Python bindings for now, maybe when we'll optimize with COPY iso INSERT
            # ogrGeom = ogr.CreateGeometryFromGML(str(gmlStr))
            # if ogrGeom is not None:
            #     ogrGeomWKT = ogrGeom.ExportToWkt()
            #     if ogrGeomWKT is not None:
            #         break

        blob = etree.tostring(childNode, pretty_print=False, xml_declaration=False, encoding='UTF-8')

        if geom_str is None:
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object) VALUES (%s, %s, %s)"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob))
        else:
            # ST_SetSRID(ST_GeomFromGML(%s)),-1)
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object, gml_bounded_by) VALUES (%s, %s, %s, ST_SetSRID( ST_GeomFromGML(%s),%s) )"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob), geom_str, self.srid)

        if db.execute(sql, parameters) == -1:
            log.error("feat num# = %d error inserting feature blob=%s (but continuing)" % (count, blob))
            # will fail but we will close connection also
            db.commit()

            # proceed...
            log.info('retrying to proceed with remaining features...')
            db = PostGIS(self.cfg.get_dict())
            db.connect()
            count = 0

        count += 1

    exception = db.commit()
    if exception is not None:
        log.error("error in commit")

    log.info("inserted %s features" % count)
    return packet
def init(self):
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()

    # One time: get all device ids
    self.fetch_devices()
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key" is
    attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent record needs to contain
    all values, as the INSERT and UPDATE query templates are built once for the
    columns in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass
    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(['%s' % k for k in record]),
            ",".join(["%s", ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
            self.cfg.get('table'),
            ",".join(['%s ' % k for k in record]),
            ",".join(["%s", ] * len(record.keys())),
            self.key,
            "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        res = 0
        if self.replace and self.key and self.key in record:

            # Replace option: try UPDATE if existing
            # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
            values = record.values()
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)
            # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key])
            # res = self.db.execute(del_query)

        if res < 1:
            # Do insert with values from the record dict
            # only if we did not do an UPDATE (res==0) on existing record.
            self.db.execute(self.query, record.values())

        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
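# Illustration only: the query templates create_query()/create_update_query() build
# for a hypothetical record; the table name 'measurements' and key column 'unique_id'
# are assumptions for this sketch, not taken from the class above.
record = {'unique_id': 'dev1-20171117-11', 'device_id': 1, 'data': '{}'}
# create_query(record) (column order follows the dict's iteration order):
#   INSERT INTO measurements (unique_id,device_id,data) VALUES (%s,%s,%s)
# create_update_query(record) with key = 'unique_id':
#   UPDATE measurements SET (unique_id ,device_id ,data ) = (%s,%s,%s) WHERE unique_id = %s
# insert() then passes record.values() (plus the key value for the UPDATE) as query
# parameters, so the actual values are bound by the DB driver, not interpolated.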
def init(self):
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()
def init(self):
    InfluxDbInput.init(self)

    # PostGIS for tracking Harvesting progress.
    # Tracking is automatically updated via a TRIGGER (see db-schema-raw).
    postgis_cfg = {
        'host': self.pg_host,
        'port': self.pg_port,
        'database': self.pg_database,
        'user': self.pg_user,
        'password': self.pg_password,
        'schema': self.pg_schema
    }
    self.tracking_db = PostGIS(postgis_cfg)
    self.tracking_db.connect()

    # One time: get all measurements and related info and store in structure
    measurements = self.get_measurement_names()
    for measurement in measurements:

        # Optional mapping from MEASUREMENT name to a device id
        # Otherwise device_id is Measurement name
        device_id = measurement
        if self.meas_name_to_device_id:
            if measurement not in self.meas_name_to_device_id:
                log.warn('No device_id mapped for measurement (table) %s' % measurement)
                continue

            device_id = self.meas_name_to_device_id[measurement]

        date_start_s, start_ts = self.get_start_time(measurement)
        date_end_s, end_ts = self.get_end_time(measurement)
        start_ts = self.date_str_to_whole_hour_nanos(date_start_s)
        end_ts *= NANOS_FACTOR

        # Shift time for current_ts from progress table if already in progress
        # otherwise use start time of measurement.
        current_ts = start_ts
        row_count = self.tracking_db.execute(self.progress_query + device_id)
        if row_count > 0:
            # Already in progress
            progress_rec = self.tracking_db.cursor.fetchone()
            ymd_last = str(progress_rec[4])
            year_last = ymd_last[0:4]
            month_last = ymd_last[4:6]
            day_last = ymd_last[6:]
            hour_last = progress_rec[5]
            # e.g. 2017-11-17T11:00:00.411Z
            date_str = '%s-%s-%sT%d:00:00.000Z' % (year_last, month_last, day_last, hour_last - 1)
            current_ts = self.date_str_to_whole_hour_nanos(date_str)
            # skip to next hour
            # current_ts += (3600 * NANOS_FACTOR)

        # Store all info per device (measurement table) in list of dict
        self.measurements_info.append({
            'name': measurement,
            'date_start_s': date_start_s,
            'start_ts': start_ts,
            'date_end_s': date_end_s,
            'end_ts': end_ts,
            'current_ts': current_ts,
            'device_id': device_id
        })

    print("measurements_info: %s" % str(self.measurements_info))
class HarvesterInfluxDbInput(InfluxDbInput): """ InfluxDB TimeSeries (History) fetcher/formatter. Fetching all timeseries data from InfluxDB and putting these unaltered into recods e.g. for storing later in Postgres DB. This is a continuous process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algorithm: * fetch all Measurements (table names) * for each Measurement: * if Measurement (name) is not in progress-table insert and set day,hour to 0 * if in progress-table fetch entry (day, hour) * get timeseries (hours) available for that day * fetch and store each, starting with the last hour previously stored * ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) * stored entry: measurement, day, hour, json blob * finish: when all done or when max_proc_time_secs passed """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=True) def device_type(self): """ The station/device type, e.g. 'ase'. Required: False Default: None """ pass @Config(ptype=str, default=None, required=True) def device_version(self): """ The station/device version, e.g. '1'. Required: False Default: None """ pass @Config(ptype=dict, default=None, required=False) def meas_name_to_device_id(self): """ How to map InfluxDB Measurement (table) names to SE device id's. e.g. {'Geonovum1' : '1181001', 'RIVM2' : '1181002'} Required: False Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass @Config(ptype=str, required=False, default='localhost') def pg_host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def pg_port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=True) def pg_database(self): """ database name """ pass @Config(ptype=str, required=False, default='postgres') def pg_user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def pg_password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def pg_schema(self): """ The postgres schema name, defaults to 'public' """ pass def __init__(self, configdict, section): InfluxDbInput.__init__(self, configdict, section) self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.progress_query = "SELECT * from %s where device_id=" % self.progress_table self.measurements_info = [] self.index_m = -1 self.query = "SELECT * FROM %s WHERE time >= %d AND time < %d + 1h" self.tracking_db = None def init(self): InfluxDbInput.init(self) # PostGIS for tracking Harvesting progress. # Tracking is automatically updated via a TRIGGER (see db-schema-raw). 
postgis_cfg = { 'host': self.pg_host, 'port': self.pg_port, 'database': self.pg_database, 'user': self.pg_user, 'password': self.pg_password, 'schema': self.pg_schema } self.tracking_db = PostGIS(postgis_cfg) self.tracking_db.connect() # One time: get all measurements and related info and store in structure measurements = self.get_measurement_names() for measurement in measurements: # Optional mapping from MEASUREMENT name to a device id # Otherwise device_is is Measurement name device_id = measurement if self.meas_name_to_device_id: if measurement not in self.meas_name_to_device_id: log.warn('No device_id mapped for measurement (table) %s' % measurement) continue device_id = self.meas_name_to_device_id[measurement] date_start_s, start_ts = self.get_start_time(measurement) date_end_s, end_ts = self.get_end_time(measurement) start_ts = self.date_str_to_whole_hour_nanos(date_start_s) end_ts *= NANOS_FACTOR # Shift time for current_ts from progress table if already in progress # otherwise use start time of measurement. current_ts = start_ts row_count = self.tracking_db.execute(self.progress_query + device_id) if row_count > 0: # Already in progress progress_rec = self.tracking_db.cursor.fetchone() ymd_last = str(progress_rec[4]) year_last = ymd_last[0:4] month_last = ymd_last[4:6] day_last = ymd_last[6:] hour_last = progress_rec[5] # e.g. 2017-11-17T11:00:00.411Z date_str = '%s-%s-%sT%d:00:00.000Z' % (year_last, month_last, day_last, hour_last - 1) current_ts = self.date_str_to_whole_hour_nanos(date_str) # skip to next hour # current_ts += (3600 * NANOS_FACTOR) # Store all info per device (measurement table) in list of dict self.measurements_info.append({ 'name': measurement, 'date_start_s': date_start_s, 'start_ts': start_ts, 'date_end_s': date_end_s, 'end_ts': end_ts, 'current_ts': current_ts, 'device_id': device_id }) print("measurements_info: %s" % str(self.measurements_info)) def all_done(self): return len(self.measurements_info) == 0 def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def next_measurement_info(self): self.index_m += 1 return self.measurements_info[self.index_m % len(self.measurements_info)] def del_measurement_info(self): if not self.all_done(): del self.measurements_info[self.index_m % len(self.measurements_info)] def before_invoke(self, packet): if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False # def next_whole_hour_from_date(self, date): # date_s = self.query_db('SELECT FIRST(calibrated), time FROM %s' % measurement)[0]['time'] # return parser.parse(date_s) def date_str_to_whole_hour_nanos(self, date_str): """ COnvert URZ date time string to timestamp nanos on whole hour. 
:param date_str: :return: """ timestamp = self.date_str_to_ts_nanos(date_str) # print(timestamp) # Shift timestamp to next whole hour timestamp = (timestamp - (timestamp % 3600)) * NANOS_FACTOR # d = datetime.utcfromtimestamp(timestamp) # print('-> %s' % d.isoformat()) return timestamp def read(self, packet): measurement_info = self.next_measurement_info() current_ts_nanos = measurement_info['current_ts'] current_ts_secs = current_ts_nanos / NANOS_FACTOR query = self.query % (measurement_info['name'], current_ts_nanos, current_ts_nanos) data = self.query_db(query) if len(data) >= 1: d = datetime.utcfromtimestamp(current_ts_secs) day = d.strftime('%Y%m%d') hour = str(d.hour + 1).zfill(2) # DEBUG: store only first and last of hour-series data_first = {'time': data[0]['time']} data_last = {'time': data[len(data) - 1]['time']} # data_o = data # data = [data_first, data_last] # for i in range(0,4): # data.append(data_o[i]) record = self.format_data(measurement_info['device_id'], day, hour, data) packet.data = None if record['complete']: packet.data = record # Shift time an hour for this device current_ts_nanos = (current_ts_secs + 3600) * NANOS_FACTOR if current_ts_nanos > measurement_info['end_ts']: # all done for current measurement/device self.del_measurement_info() else: # Shift to next hour for this measurement measurement_info['current_ts'] = current_ts_nanos return packet # Create a data record for timeseries of current device/day/hour def format_data(self, device_id, day, hour, data): # # -- Map this to # CREATE TABLE smartem_raw.timeseries ( # gid serial, # unique_id character varying (16), # insert_time timestamp with time zone default current_timestamp, # device_id integer, # day integer, # hour integer, # data json, # complete boolean default false, # PRIMARY KEY (gid) # ); # Create record with JSON text blob with metadata record = dict() record['unique_id'] = '%s-%s-%s' % (device_id, day, hour) # Timestamp of sample record['device_id'] = device_id record['device_type'] = self.device_type record['device_version'] = self.device_version record['day'] = day record['hour'] = hour # Determine if hour is "complete" record['complete'] = False d = datetime.utcfromtimestamp(self.current_time_secs()) cur_day = int(d.strftime('%Y%m%d')) cur_hour = d.hour + 1 if cur_day > int(day) \ or (cur_day == int(day) and cur_hour > int(hour)): record['complete'] = True # Optional prefix for each param, usually sensor-box type e.g. "ase_" # if self.data_param_prefix: # for data_elm in data: # keys = data_elm.keys() # # https://stackoverflow.com/questions/4406501/change-the-name-of-a-key-in-dictionary # for key in keys: # data_elm[self.data_param_prefix + key] = data_elm.pop(key) # Add JSON text blob record['data'] = json.dumps({ 'id': device_id, 'date': day, 'hour': hour, 'timeseries': data }) return record
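# Illustration only: the whole-hour arithmetic used by date_str_to_whole_hour_nanos(),
# assuming date_str_to_ts_nanos() yields epoch seconds at this point (as the code above
# implies) and NANOS_FACTOR is nanoseconds per second.
NANOS_FACTOR = 1000 * 1000 * 1000
timestamp_secs = 1510917825                                 # 2017-11-17T11:23:45Z
whole_hour_secs = timestamp_secs - (timestamp_secs % 3600)  # 1510916400 == 11:00:00Z
whole_hour_nanos = whole_hour_secs * NANOS_FACTOR           # value used in the InfluxDB time-range query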
def init(self):
    InfluxDbInput.init(self)
    postgis_cfg = {
        'host': self.pg_host,
        'port': self.pg_port,
        'database': self.pg_database,
        'user': self.pg_user,
        'password': self.pg_password,
        'schema': self.pg_schema
    }
    self.db = PostGIS(postgis_cfg)
    self.db.connect()

    # One time: get all measurements and related info and store in structure
    self.measurements = self.query_db('SHOW MEASUREMENTS')
    for measurement in self.measurements:
        measurement_name = measurement['name']
        date_start_s = self.query_db(
            'SELECT FIRST(calibrated), time FROM %s' % measurement_name)[0]['time']
        start_ts = self.date_str_to_ts_nanos(date_start_s)
        date_end_s = self.query_db(
            'SELECT LAST(calibrated), time FROM %s' % measurement_name)[0]['time']
        end_ts = self.date_str_to_ts_nanos(date_end_s)
        device_id = measurement_name
        if self.meas_name_to_device_id:
            if measurement_name not in self.meas_name_to_device_id:
                log.error('No device_id mapped for measurement (table) %s' % measurement_name)
                raise Exception
            device_id = self.meas_name_to_device_id[measurement_name]

        # Shift time for current_ts from progress table if already in progress
        # otherwise use start time of measurement.
        current_ts = start_ts
        row_count = self.db.execute(self.progress_query + device_id)
        if row_count > 0:
            progress_rec = self.db.cursor.fetchone()
            ymd_last = str(progress_rec[4])
            year_last = ymd_last[0:4]
            month_last = ymd_last[4:6]
            day_last = ymd_last[6:]
            hour_last = progress_rec[5]
            # e.g. 2017-11-17T11:00:00.411Z
            date_str = '%s-%s-%sT%d:00:00.0Z' % (year_last, month_last, day_last, hour_last)
            current_ts = self.date_str_to_ts_nanos(date_str)
            # skip to next hour
            current_ts += (3600 * NANOS_FACTOR)

        # Store all info per device (measurement table) in list of dict
        self.measurements_info.append({
            'name': measurement_name,
            'date_start_s': date_start_s,
            'start_ts': start_ts,
            'date_end_s': date_end_s,
            'end_ts': end_ts,
            'current_ts': current_ts,
            'device_id': device_id
        })

    print(str(self.measurements_info))
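# Illustration only: how the method above resumes from a hypothetical progress
# record where progress_rec[4] = 20171117 (yyyymmdd) and progress_rec[5] = 11 (hour).
ymd_last, hour_last = '20171117', 11
date_str = '%s-%s-%sT%d:00:00.0Z' % (ymd_last[0:4], ymd_last[4:6], ymd_last[6:], hour_last)
# -> '2017-11-17T11:00:00.0Z'; converted to nanos and shifted by 3600 * NANOS_FACTOR
# so harvesting continues at the next whole hour after the last stored one.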
class RawSensorTimeseriesInput(RawSensorAPIInput): """ Raw Sensor REST API (CityGIS and Intemo servers) TimeSeries (History) fetcher/formatter. Fetching all timeseries data via the Raw Sensor API (RSA) from CityGIS server and putting these unaltered into Postgres DB. This is a continuus process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algoritm: - fetch all (sensor) devices from RSA - for each device: - if device is not in progress-table insert and set day,hour to 0 - if in progress-table fetch entry (day, hour) - get timeseries (hours) available for that day - fetch and store each, starting with the last hour perviously stored - ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) - stored entry: device_id, day, hour, last_flag, json blob - finish: when all done or when max_proc_time_secs passed """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass def __init__(self, configdict, section, produces=FORMAT.record_array): RawSensorAPIInput.__init__(self, configdict, section, produces) # keep track of root base REST URL self.url = None self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.days = [] self.days_idx = -1 self.day = -1 self.day_last = -1 self.hours = [] self.hours_idx = -1 self.hour = -1 self.hour_last = -1 self.db = None self.progress_query = "SELECT * from %s where device_id=" % self.progress_table def init(self): self.db = PostGIS(self.cfg.get_dict()) self.db.connect() # One time: get all device ids self.fetch_devices() # Pick a first device id # self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) def all_done(self): if self.device_ids_idx < 0 and self.days_idx < 0 and self.hours_idx < 0: return True return False def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def fetch_ts_days(self): self.days_idx = -1 self.days = [] self.day = -1 if self.device_id < 0: return ts_days_url = self.base_url + '/devices/%d/timeseries' % self.device_id log.info('Init: fetching timeseries days list from URL: "%s" ...' 
% ts_days_url) json_str = self.read_from_url(ts_days_url) json_obj = self.parse_json_str(json_str) # Typical entry is: "/sensors/v1/devices/8/timeseries/20160404" # cut of last days_raw = json_obj['days'] row_count = self.db.execute(self.progress_query + str(self.device_id)) self.day_last = -1 self.hour_last = -1 if row_count > 0: progress_rec = self.db.cursor.fetchone() self.day_last = progress_rec[4] self.hour_last = progress_rec[5] # Take a subset of all days: namely those still to be processed # Always include the last/current day as it may not be complete for d in days_raw: day = int(d.split('/')[-1]) if day >= self.day_last: self.days.append(day) if len(self.days) > 0: self.days_idx = 0 log.info('Device: %d, raw days: %d, days=%d, day_last=%d, hour_last=%d' % (self.device_id, len(days_raw), len(self.days), self.day_last, self.hour_last)) def fetch_ts_hours(self): self.hours_idx = -1 self.hours = [] self.hour = None if self.device_id == -1 or self.day == -1: return # 2016-10-30 08:12:09,921 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:12:09,922 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:12:10,789 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) # 2016-10-30 08:26:59,172 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:26:59,172 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:26:59,807 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:26:59,808 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030/8 # 2016-10-30 10:37:30,010 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:37:30,170 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:37:30,170 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:37:30,525 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:37:30,525 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-10 (it is still sampling current hour 9) # 2016-10-30 10:47:17,095 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:47:17,095 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:47:17,511 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:47:17,511 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/10 # 2016-10-30 10:57:12,325 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:57:12,524 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=10 # 2016-10-30 10:57:12,524 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... 
# 2016-10-30 10:57:12,952 RawSensorAPI INFO 0 processable hours for device 71 day 20161030 # 2016-10-30 12:29:11,534 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/11 cur_day=20161030 cur_hour=11 # 2016-10-30 12:29:13,177 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-12 (it is still sampling current hour 11) ts_hours_url = self.base_url + '/devices/%d/timeseries/%d' % (self.device_id, self.day) log.info('Init: fetching timeseries hours list from URL: "%s" ...' % ts_hours_url) # Set the next "last values" URL for device and increment to next json_str = self.read_from_url(ts_hours_url) json_obj = self.parse_json_str(json_str) hours_all = json_obj['hours'] # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() for h in hours_all: hour = int(h) if self.day > self.day_last or (self.day == self.day_last and hour > self.hour_last): if self.day_last == current_day and hour - 1 >= current_hour: # never append the last hour of today log.info('Skip current hour from %d to %d for device %d on day %d' % (hour-1, hour, self.device_id, self.day)) else: self.hours.append(hour) if len(self.hours) > 0: self.hours_idx = 0 log.info('processable hours for device %d day %d: %s' % (self.device_id, self.day, str(self.hours))) def next_day(self): # All days for current device done? Try next device if self.day == -1: self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) # If not yet all devices done fetch days current device if self.device_id > -1: self.fetch_ts_days() self.day, self.days_idx = self.next_entry(self.days, self.days_idx) def next_hour(self): # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) while self.hour < 0: # Pick a next day entry self.day, self.days_idx = self.next_entry(self.days, self.days_idx) if self.day < 0: self.next_day() if self.day > -1: self.fetch_ts_hours() if self.device_id < 0: log.info('Processing all devices done') break # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) def get_current_day_hour(self): # Get the current day and hour in UTC current_time = time.gmtime() current_day = int(time.strftime('%Y%m%d', current_time)) current_hour = int(time.strftime('%H',current_time)) return current_day, current_hour def before_invoke(self, packet): """ Called just before Component invoke. """ # Try to fill in: should point to next hour timeseries REST URL self.url = None if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False self.next_hour() # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() # Skip harvesting the current hour as it will not yet be complete, so try the next device, hour # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) skips = 0 while self.day == current_day and (self.hour - 1) == current_hour and not self.all_done(): skips += 1 log.info('Skip #%d: device-day-hour: %d-%d-%d (still sampling current hour %d)' % (skips, self.device_id, self.day, self.hour, current_hour)) # Force to skip to next device, sometimes we have an even later hour self.next_hour() # 30.okt.16: Fix for #24 #25 gaps in data: because next_hour() may jump to next device and unconditionally fetch current hour... 
# so fix is to use while loop until a valid hour available or we are all done # Still hours? if self.hour > 0: # The base method read() will fetch self.url until it is set to None # <base_url>/devices/14/timeseries/20160603/18 self.url = self.base_url + '/devices/%d/timeseries/%d/%d' % (self.device_id, self.day, self.hour) log.info('self.url = %s cur_day=%d cur_hour=%d' % (self.url, current_day, current_hour)) if self.device_id < 0: log.info('Processing all devices done') return True # ASSERT : still device(s) to be done get next hour to process return True # Create a data record for timeseries of current device/day/hour def format_data(self, data): # # -- Map this to # CREATE TABLE smartem_raw.timeseries ( # gid serial, # unique_id character varying (16), # insert_time timestamp with time zone default current_timestamp, # device_id integer, # day integer, # hour integer, # data json, # complete boolean default false, # PRIMARY KEY (gid) # ); # Create record with JSON text blob with metadata record = dict() record['unique_id'] = '%d-%d-%d' % (self.device_id, self.day, self.hour) # Timestamp of sample record['device_id'] = self.device_id record['day'] = self.day record['hour'] = self.hour # Add JSON text blob record['data'] = data return record
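# Illustration only: the record produced by format_data() above for the example
# device/day/hour shown in the URL comment (device 14, day 20160603, hour 18).
example_record = {
    'unique_id': '14-20160603-18',  # '%d-%d-%d' % (device_id, day, hour)
    'device_id': 14,
    'day': 20160603,
    'hour': 18,
    'data': '{"...": "raw timeseries JSON blob from the Raw Sensor API"}',
}
# Maps one-to-one onto the smartem_raw.timeseries columns listed in the comment block;
# gid, insert_time and complete are filled in by the database defaults.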
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting single record into Postgres database.
    Input is a record (Python dict structure) or a Python list of dicts (records).
    Creates an INSERT for Postgres to insert each single record.

    consumes=FORMAT.record
    """

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.db = None
        self.key = self.cfg.get('key')

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(['%s' % k for k in record]),
            ",".join(["%s", ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create query once
        if self.query is None:
            self.query = self.create_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.db.execute(self.query, record.values())
            self.db.commit(close=False)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.db.execute(self.query, rec.values())
                self.db.commit(close=False)
            log.info('committed %d records' % len(record))

        return packet
class Josene(Device): def __init__(self): Device.__init__(self, 'jose') self.model_query = "SELECT id,parameters,model from calibration_models WHERE predicts = '%s' AND invalid = FALSE ORDER BY timestamp DESC LIMIT 1" self.state_query = "SELECT state from calibration_state WHERE process = '%s' AND model_id = %d ORDER BY timestamp DESC LIMIT 1" self.state_insert = "INSERT INTO calibration_state (process, model_id, state) VALUES ('%s', %d, '%s')" self.sensor_model_names = { 'co': 'carbon_monoxide__air_', 'no2': 'nitrogen_dioxide__air_', 'o3': 'ozone__air_' } self.config_dict = None def init(self, config_dict): self.config_dict = config_dict self.process_name = config_dict['process_name'] self.db = PostGIS(config_dict) self.db.connect() ids = dict() parameters = dict() models = dict() state = dict() # Query ANN Calibration Model and its State from DB for each calibrated sensor. if self.model_query is not None and len(self.sensor_model_names) > 0: log.info('Getting calibration models and state from database') for k in self.sensor_model_names: v = self.sensor_model_names[k] id, param, model = self.query_model(v) ids[k] = id parameters[k] = param models[k] = model model_state = self.query_state(id) state[k] = model_state else: log.info('No query for fetching calibration models given or no ' 'mapping for calibration models to gas components given.') # Put Model and State info in the Device definitions. for k in ids: SENSOR_DEFS[k]['converter_model']['model_id'] = ids[k] for k in parameters: SENSOR_DEFS[k]['converter_model']['running_mean_weights'] = parameters[k] for k in models: SENSOR_DEFS[k]['converter_model']['mlp_regressor'] = models[k] for k, v in state.iteritems(): for device_id, device_state in v.iteritems(): for gas, state in device_state.iteritems(): v[device_id][gas] = RunningMean.from_dict(state) SENSOR_DEFS[k]['converter_model']['state'] = v def exit(self): # Save the calibration state. 
for k in self.sensor_model_names: model = SENSOR_DEFS[k]['converter_model'] self.save_state(model['model_id'], json.dumps(model['state'])) self.db.commit(close=False) def get_sensor_defs(self): return SENSOR_DEFS def raw_query(self, query_str): self.db.execute(query_str) db_records = self.db.cursor.fetchall() log.info('read recs: %d' % len(db_records)) return db_records def query_model(self, name): query = self.model_query % name log.info('Getting calibration model with query: %s' % query) ret = self.raw_query(query) if len(ret) > 0: id, parameters, model = ret[0] return id, parameters, pickle.loads(model) else: log.warn("No model found for %s" % name) return None, {}, {} def query_state(self, model_id): query = self.state_query % (self.process_name, model_id) log.info('Getting calibration model state with query: %s' % query) ret = self.raw_query(query) if len(ret) > 0: return ret[0][0] else: log.warn("No state found for model_id=%d" % model_id) return {} def save_state(self, model_id, state): insert_query = self.state_insert % (self.process_name, model_id, state) log.info('Inserting calibration model state for process %s model_id=%d' % (self.process_name, model_id)) ret = self.db.execute(insert_query) if ret != 1: log.warn('Cannot save state for process %s model_id=%d' % (self.process_name, model_id)) # Get raw sensor value or list of values def get_raw_value(self, name, val_dict): val = None if type(name) is list: name = name[0] return self.get_raw_value(name, val_dict) # name is list of names # for n in name: # if n in val_dict: # if val is None: # val = [] # val.append(val_dict[n]) else: # name is single name if name in val_dict: val = val_dict[name] if 'audio' in name: # We may have audio encoded in 3 bands bands = [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)] val = bands[0] return val, name # Check for valid sensor value def check_value(self, name, val_dict, value=None): val = None if type(name) is list: # name is list of names for n in name: result, reason = self.check_value(n, val_dict, value) if result is False: return result, reason else: # name is single name if name not in val_dict and value is None: return False, '%s not present' % name else: if value is not None: val = value else: val = val_dict[name] if val is None: return False, '%s is None' % name if name not in SENSOR_DEFS: return False, '%s not in SENSOR_DEFS' % name name_def = SENSOR_DEFS[name] # Audio inputs: need to unpack 3 bands and check for decibel vals if 'audio' in name: bands = [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)] # determine validity of these 3 bands dbMin = name_def['min'] dbMax = name_def['max'] err_cnt = 0 msg = '' for i in range(0, len(bands)): band_val = bands[i] # accumulate outliers if band_val < dbMin: err_cnt +=1 msg += '%s: val(%s) < min(%s)\n' % (name, str(band_val), str(name_def['min'])) elif band_val > dbMax: err_cnt +=1 msg += '%s: val(%s) > max(%s)\n' % (name, str(band_val), str(name_def['max'])) # Only invalid if all bands outside range if err_cnt >= len(bands): return False, msg return True, '%s OK' % name if 'min' in name_def and val < name_def['min']: return False, '%s: val(%s) < min(%s)' % (name, str(val), str(name_def['min'])) if 'max' in name_def and val > name_def['max']: return False, '%s: val(%s) > max(%s)' % (name, str(val), str(name_def['max'])) return True, '%s OK' % name # Get location as lon, lat def get_lon_lat(self, val_dict): result = (None, None) if 's_longitude' in val_dict and 's_latitude' in val_dict: lon = 
SENSOR_DEFS['longitude']['converter'](val_dict['s_longitude'])
            lat = SENSOR_DEFS['latitude']['converter'](val_dict['s_latitude'])

            valid, reason = self.check_value('latitude', val_dict, value=lat)
            if not valid:
                return result

            valid, reason = self.check_value('longitude', val_dict, value=lon)
            if not valid:
                return result

            result = (lon, lat)

        return result
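The audio handling in get_raw_value() and check_value() unpacks one packed integer into three band levels, one byte per band. A minimal standalone sketch of that decoding (the helper name unpack_audio_bands is illustrative, not part of the Device API):

def unpack_audio_bands(packed_val):
    # One byte per band, as in get_raw_value()/check_value() above:
    # bits 0-7, 8-15 and 16-23.
    return [float(packed_val & 255),
            float((packed_val >> 8) & 255),
            float((packed_val >> 16) & 255)]

# Example: 0x2F3C48 decodes to [72.0, 60.0, 47.0]
print(unpack_audio_bands(0x2F3C48))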
def delete_features(self):
    log.info('deleting ALL features in DB')
    db = PostGIS(self.cfg.get_dict())
    db.tx_execute("TRUNCATE gml_objects")
class ProgressTracker(Filter): """" Filter to track progress of a stream of processed records. Stores progress (last id, last timestamp etc) in Postgres table. """ @Config(ptype=str, required=False, default='localhost') def host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=False, default='postgres') def user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def schema(self): """ The postgres schema name, defaults to 'public' """ pass @Config(ptype=str, required=False, default='progress') def table(self): """ Table name, defaults to 'progress'. """ pass @Config(ptype=str, required=True) def progress_update_query(self): """ Query to update progress Required: True Default: "" """ pass @Config(ptype=str, required=True) def id_key(self): """ Key to select id from record array Required: True """ @Config(ptype=str, default=None, required=False) def name_key(self): """ Key to select name from record array Required: True """ def __init__(self, config_dict, section): Filter.__init__(self, config_dict, section, consumes=[FORMAT.record_array, FORMAT.record], produces=[FORMAT.record_array, FORMAT.record]) self.last_ids = None self.db = None def init(self): self.db = PostGIS(self.cfg.get_dict()) self.db.connect() def invoke(self, packet): self.last_ids = dict() if packet.data is None or packet.is_end_of_doc() or packet.is_end_of_stream(): log.info("No packet data or end of doc/stream") return packet record_in = packet.data if type(record_in) is not list: record_in = [record_in] for record in record_in: if self.name_key is not None: name = record[self.name_key] else: name = "all" if len(record) > 0: new = record[self.id_key] self.last_ids[name] = max(self.last_ids.get(name, -1), new) log.info("Last ids are: %s", str(self.last_ids)) return packet def after_chain_invoke(self, packet): """ Called right after entire Component Chain invoke. Used to update last id of processed file record. """ for name in self.last_ids: param_tuple = (self.last_ids[name], name) log.info('Updating progress table with (id=%d, name=%s)' % param_tuple) self.db.execute(self.progress_update_query % param_tuple) self.db.commit(close=False) log.info('Update progress table ok') else: log.info('No update for progress table') return True
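The filter only remembers the highest id seen per name and writes it back after the chain has run. A hedged illustration of that bookkeeping; the progress table layout, the UPDATE text and the id_key/name_key values are examples, not prescribed by the component (the real statement comes from progress_update_query in the ETL config):

# Hypothetical config value for a progress table with columns (name, last_id):
progress_update_query = "UPDATE progress SET last_id = %d WHERE name = '%s'"

records = [
    {'device': 'station_1', 'gid': 10},
    {'device': 'station_2', 'gid': 7},
    {'device': 'station_1', 'gid': 42},
]

# Same bookkeeping as invoke(): track the max id per name (id_key='gid', name_key='device')
last_ids = {}
for rec in records:
    last_ids[rec['device']] = max(last_ids.get(rec['device'], -1), rec['gid'])

# after_chain_invoke() then issues one UPDATE per name
for name, last_id in last_ids.items():
    print(progress_update_query % (last_id, name))
# UPDATE progress SET last_id = 42 WHERE name = 'station_1'
# UPDATE progress SET last_id = 7 WHERE name = 'station_2'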
class PostgresDbInput(SqlDbInput): """ Input by querying records from a Postgres database. Input is a query, like SELECT * from mytable. Output is zero or more records as record array (array of dict) or single record (dict). produces=FORMAT.record_array (default) or FORMAT.record """ # Start attribute config meta @Config(ptype=str, required=False, default='localhost') def host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def port(self): """ port for host, defaults to `'5432' """ pass @Config(ptype=str, required=False, default='postgres') def user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def schema(self): """ The postgres schema name, defaults to 'public' """ pass # End attribute config meta def __init__(self, configdict, section): SqlDbInput.__init__(self, configdict, section, produces=[FORMAT.record_array, FORMAT.record]) self.db = None def init(self): # Connect only once to DB log.info('Init: connect to DB') self.db = PostGIS(self.cfg.get_dict()) self.db.connect() # If no explicit column names given, get from DB meta info self.columns = self.column_names if self.column_names is None: self.columns = self.db.get_column_names(self.cfg.get('table'), self.cfg.get('schema')) def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.db.disconnect() def raw_query(self, query_str): self.db.execute(query_str) db_records = self.db.cursor.fetchall() log.info('read recs: %d' % len(db_records)) return db_records
class RawSensorTimeseriesInput(RawSensorAPIInput): """ Raw Sensor REST API (CityGIS) to fetch (harvest) all timeseries for all devices. """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass """ Raw Sensor REST API (CityGIS) TimeSeries (History) fetcher/formatter. Fetching all timeseries data via the Raw Sensor API (RSA) from CityGIS server and putting these unaltered into Postgres DB. This is a continuus process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algoritm: - fetch all (sensor) devices from RSA - for each device: - if device is not in progress-table insert and set day,hour to 0 - if in progress-table fetch entry (day, hour) - get timeseries (hours) available for that day - fetch and store each, starting with the last hour perviously stored - ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) - stored entry: device_id, day, hour, last_flag, json blob - finish: when all done or when max_proc_time_secs passed """ def __init__(self, configdict, section, produces=FORMAT.record_array): RawSensorAPIInput.__init__(self, configdict, section, produces) # keep track of root base REST URL self.url = None self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.days = [] self.days_idx = -1 self.day = -1 self.day_last = -1 self.hours = [] self.hours_idx = -1 self.hour = -1 self.hour_last = -1 self.db = None self.progress_query = "SELECT * from %s where device_id=" % self.progress_table def init(self): self.db = PostGIS(self.cfg.get_dict()) self.db.connect() # One time: get all device ids self.fetch_devices() # Pick a first device id # self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) def all_done(self): if self.device_ids_idx < 0 and self.days_idx < 0 and self.hours_idx < 0: return True return False def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def fetch_ts_days(self): self.days_idx = -1 self.days = [] self.day = -1 if self.device_id < 0: return ts_days_url = self.base_url + '/devices/%d/timeseries' % self.device_id log.info('Init: fetching timeseries days list from URL: "%s" ...' 
% ts_days_url) json_str = self.read_from_url(ts_days_url) json_obj = self.parse_json_str(json_str) # Typical entry is: "/sensors/v1/devices/8/timeseries/20160404" # cut of last days_raw = json_obj['days'] row_count = self.db.execute(self.progress_query + str(self.device_id)) self.day_last = -1 self.hour_last = -1 if row_count > 0: progress_rec = self.db.cursor.fetchone() self.day_last = progress_rec[4] self.hour_last = progress_rec[5] # Take a subset of all days: namely those still to be processed # Always include the last/current day as it may not be complete for d in days_raw: day = int(d.split('/')[-1]) if day >= self.day_last: self.days.append(day) if len(self.days) > 0: self.days_idx = 0 log.info('Device: %d, raw days: %d, days=%d, day_last=%d, hour_last=%d' % (self.device_id, len(days_raw), len(self.days), self.day_last, self.hour_last)) def fetch_ts_hours(self): self.hours_idx = -1 self.hours = [] self.hour = None if self.device_id == -1 or self.day == -1: return # 2016-10-30 08:12:09,921 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:12:09,922 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:12:10,789 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) # 2016-10-30 08:26:59,172 RawSensorAPI INFO Device: 55, raw days: 5, days=1, day_last=20161030, hour_last=7 # 2016-10-30 08:26:59,172 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030" ... # 2016-10-30 08:26:59,807 RawSensorAPI INFO 1 processable hours for device 55 day 20161030 # 2016-10-30 08:26:59,808 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/55/timeseries/20161030/8 # 2016-10-30 10:37:30,010 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:37:30,170 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:37:30,170 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:37:30,525 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:37:30,525 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-10 (it is still sampling current hour 9) # 2016-10-30 10:47:17,095 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=9 # 2016-10-30 10:47:17,095 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... # 2016-10-30 10:47:17,511 RawSensorAPI INFO 1 processable hours for device 71 day 20161030 # 2016-10-30 10:47:17,511 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/10 # 2016-10-30 10:57:12,325 RawSensorAPI INFO Init: fetching timeseries days list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries" ... # 2016-10-30 10:57:12,524 RawSensorAPI INFO Device: 71, raw days: 7, days=1, day_last=20161030, hour_last=10 # 2016-10-30 10:57:12,524 RawSensorAPI INFO Init: fetching timeseries hours list from URL: "http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030" ... 
# 2016-10-30 10:57:12,952 RawSensorAPI INFO 0 processable hours for device 71 day 20161030 # 2016-10-30 12:29:11,534 RawSensorAPI INFO self.url = http://whale.citygis.nl/sensors/v1/devices/71/timeseries/20161030/11 cur_day=20161030 cur_hour=11 # 2016-10-30 12:29:13,177 RawSensorAPI INFO Skipped device-day-hour: 71-20161030-12 (it is still sampling current hour 11) ts_hours_url = self.base_url + '/devices/%d/timeseries/%d' % (self.device_id, self.day) log.info('Init: fetching timeseries hours list from URL: "%s" ...' % ts_hours_url) # Set the next "last values" URL for device and increment to next json_str = self.read_from_url(ts_hours_url) json_obj = self.parse_json_str(json_str) hours_all = json_obj['hours'] # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() for h in hours_all: hour = int(h) if self.day > self.day_last or (self.day == self.day_last and hour > self.hour_last): if self.day_last == current_day and hour - 1 >= current_hour: # never append the last hour of today log.info('Skip current hour from %d to %d for device %d on day %d' % (hour-1, hour, self.device_id, self.day)) else: self.hours.append(hour) if len(self.hours) > 0: self.hours_idx = 0 log.info('processable hours for device %d day %d: %s' % (self.device_id, self.day, str(self.hours))) def next_day(self): # All days for current device done? Try next device if self.day == -1: self.device_id, self.device_ids_idx = self.next_entry(self.device_ids, self.device_ids_idx) # If not yet all devices done fetch days current device if self.device_id > -1: self.fetch_ts_days() self.day, self.days_idx = self.next_entry(self.days, self.days_idx) def next_hour(self): # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) while self.hour < 0: # Pick a next day entry self.day, self.days_idx = self.next_entry(self.days, self.days_idx) if self.day < 0: self.next_day() if self.day > -1: self.fetch_ts_hours() if self.device_id < 0: log.info('Processing all devices done') break # Pick an hour entry self.hour, self.hours_idx = self.next_entry(self.hours, self.hours_idx) def get_current_day_hour(self): # Get the current day and hour in UTC current_time = time.gmtime() current_day = int(time.strftime('%Y%m%d', current_time)) current_hour = int(time.strftime('%H',current_time)) return current_day, current_hour def before_invoke(self, packet): """ Called just before Component invoke. """ # Try to fill in: should point to next hour timeseries REST URL self.url = None if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False self.next_hour() # Get the current day and hour in UTC current_day, current_hour = self.get_current_day_hour() # Skip harvesting the current hour as it will not yet be complete, so try the next device, hour # 2016-10-30 08:12:10,789 RawSensorAPI INFO Skipped device-day-hour: 55-20161030-8 (it is still sampling current hour 7) skips = 0 while self.day == current_day and (self.hour - 1) == current_hour and not self.all_done(): skips += 1 log.info('Skip #%d: device-day-hour: %d-%d-%d (still sampling current hour %d)' % (skips, self.device_id, self.day, self.hour, current_hour)) # Force to skip to next device, sometimes we have an even later hour self.next_hour() # 30.okt.16: Fix for #24 #25 gaps in data: because next_hour() may jump to next device and unconditionally fetch current hour... 
        # so the fix is to use a while loop until a valid hour is available or we are all done

        # Still hours?
        if self.hour > 0:
            # The base method read() will fetch self.url until it is set to None
            # <base_url>/devices/14/timeseries/20160603/18
            self.url = self.base_url + '/devices/%d/timeseries/%d/%d' % (self.device_id, self.day, self.hour)
            log.info('self.url = %s cur_day=%d cur_hour=%d' % (self.url, current_day, current_hour))

        if self.device_id < 0:
            log.info('Processing all devices done')
            return True

        # ASSERT: still device(s) to be done, get next hour to process
        return True

    # Create a data record for timeseries of current device/day/hour
    def format_data(self, data):
        #
        # -- Map this to
        # CREATE TABLE smartem_raw.timeseries (
        #   gid serial,
        #   unique_id character varying (16),
        #   insert_time timestamp with time zone default current_timestamp,
        #   device_id integer,
        #   day integer,
        #   hour integer,
        #   data json,
        #   complete boolean default false,
        #   PRIMARY KEY (gid)
        # );

        # Create record with JSON text blob with metadata
        record = dict()
        record['unique_id'] = '%d-%d-%d' % (self.device_id, self.day, self.hour)

        # Timestamp of sample
        record['device_id'] = self.device_id
        record['day'] = self.day
        record['hour'] = self.hour

        # Add JSON text blob
        record['data'] = data

        return record
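The day/hour iteration above leans on next_entry(), which is inherited and not shown here. The sketch below captures the behaviour the loops assume (hand out the next list element plus its index, or (-1, -1) once the list is exhausted); this is an assumption about the base class, not its actual code:

def next_entry(entry_list, entry_idx):
    # Assumed contract: advance the index; signal exhaustion with (-1, -1)
    # so callers can test 'entry < 0' as done in next_day()/next_hour().
    entry_idx += 1
    if entry_idx >= len(entry_list):
        return -1, -1
    return entry_list[entry_idx], entry_idx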
class PostgresDbInput(SqlDbInput): """ Input by querying records from a Postgres database. Input is a query, like SELECT * from mytable. Output is zero or more records as record array (array of dict) or single record (dict). produces=FORMAT.record_array (default) or FORMAT.record """ # Start attribute config meta @Config(ptype=str, required=False, default='localhost') def host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=False, default='postgres') def user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def schema(self): """ The postgres schema name, defaults to 'public' """ pass # End attribute config meta def __init__(self, configdict, section): SqlDbInput.__init__(self, configdict, section) self.db = None def init_columns(self): if self.columns is not None: # Already initialized, reset columns_names to re-init return if self.column_names is None: # If no explicit column names given, get all columns from DB meta info self.columns = self.db.get_column_names(self.cfg.get('table'), self.cfg.get('schema')) else: # Columns provided: make list self.columns = self.column_names.split(',') def init(self): # Connect only once to DB log.info('Init: connect to DB') self.db = PostGIS(self.cfg.get_dict()) self.db.connect() self.init_columns() def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.db.disconnect() def raw_query(self, query_str): self.init_columns() self.db.execute(query_str) db_records = self.db.cursor.fetchall() log.info('read recs: %d' % len(db_records)) return db_records
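raw_query() returns bare tuples; the column list resolved by init_columns() is what lets the base class turn them into the record dicts this Input produces. A hedged sketch of that combination with example values (the actual assembly lives in SqlDbInput, which is not shown here):

columns = ['gid', 'device_id', 'day', 'hour']          # as resolved by init_columns()
rows = [(1, 55, 20161030, 7), (2, 55, 20161030, 8)]    # as returned by raw_query()

records = [dict(zip(columns, row)) for row in rows]
# [{'gid': 1, 'device_id': 55, 'day': 20161030, 'hour': 7}, ...]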
def init(self):
    # Connect only once to DB
    log.info('Init: connect to DB')
    self.db = PostGIS(self.cfg.get_dict())
    self.db.connect()
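For reference, a minimal sketch of the kind of settings dict cfg.get_dict() is expected to hand to PostGIS here, using the defaults documented on the Config attributes in this file; the database name is an example value, not a default:

postgis_cfg = {
    'host': 'localhost',
    'port': '5432',
    'database': 'gis',        # example value
    'user': 'postgres',
    'password': 'postgres',
    'schema': 'public',
}
# db = PostGIS(postgis_cfg)
# db.connect()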
class WeewxDbInput(SqliteDbInput): """ Reads weewx raw archive records from SQLite. """ def __init__(self, configdict, section): SqliteDbInput.__init__(self, configdict, section) self.progress_query = self.cfg.get('progress_query') self.progress_update = self.cfg.get('progress_update') # Connect only once to DB log.info('Init: connect to Postgres DB') self.progress_db = PostGIS(self.cfg.get_dict()) self.progress_db.connect() def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.progress_db.disconnect() def after_chain_invoke(self, packet): """ Called right after entire Component Chain invoke. Used to update last id of processed file record. """ # last_datetime.datetime.fromtimestamp(self.last_id).strftime('%Y-%m-%d %H:%M:%S') ts_local = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime(self.last_id)) log.info('Updating progress table ts_unix=%d ts_local=%s' % (self.last_id, ts_local)) self.progress_db.execute(self.progress_update % (self.last_id, ts_local)) self.progress_db.commit(close=False) log.info('Update progress table ok') return True def read(self, packet): # Get last processed id of archive table self.progress_db.execute(self.progress_query) progress_rec = self.progress_db.cursor.fetchone() self.last_id = progress_rec[3] log.info('progress record: %s' % str(progress_rec)) # Fetch next batch of archive records archive_recs = self.do_query(self.query % self.last_id) log.info('read archive_recs: %d' % len(archive_recs)) # No more records to process? if len(archive_recs) == 0: packet.set_end_of_stream() log.info('Nothing to do. All file_records done') return packet # Remember last id processed for next query self.last_id = archive_recs[len(archive_recs)-1].get('dateTime') packet.data = archive_recs # Always stop after batch, otherwise we would continue forever packet.set_end_of_stream() return packet
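The progress bookkeeping above expects two statements from the ETL config: a SELECT whose fourth column (index 3) is the last processed weewx dateTime, and an UPDATE taking (ts_unix, ts_local). The statements below are hypothetical examples of that shape, not values shipped with the component:

progress_query = "SELECT id, source, notes, last_time FROM weewx_progress WHERE source = 'weewx'"
progress_update = "UPDATE weewx_progress SET last_time = %d, last_time_str = '%s' WHERE source = 'weewx'"
query = "SELECT * FROM archive WHERE dateTime > %d ORDER BY dateTime LIMIT 1000"

# read() and after_chain_invoke() use them as:
#   progress_rec[3]                                  -> self.last_id
#   self.query % self.last_id                        -> next batch of archive records
#   self.progress_update % (self.last_id, ts_local)  -> progress table update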
class PostgresInsertOutput(PostgresDbOutput): """ Output by inserting a single record in a Postgres database table. Input is a Stetl record (Python dict structure) or a list of records. Creates an INSERT for Postgres to insert each single record. When the "replace" parameter is True, any existing record keyed by "key" is attempted to be UPDATEd first. NB a constraint is that the first and each subsequent each record needs to contain all values as an INSERT and UPDATE query template is built once for the columns in the first record. consumes=[FORMAT.record_array, FORMAT.record] """ # Start attribute config meta @Config(ptype=str, required=False, default='public') def table(self): """ Table for inserts. """ pass @Config(ptype=bool, required=False, default=False) def replace(self): """ Replace record if exists? """ pass @Config(ptype=str, required=False, default=None) def key(self): """ The key column name of the table, required when replacing records. """ pass # End attribute config meta def __init__(self, configdict, section, consumes=FORMAT.record): DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record]) self.query = None self.update_query = None self.db = None def init(self): # Connect only once to DB log.info('Init: connect to DB') self.db = PostGIS(self.cfg.get_dict()) self.db.connect() def exit(self): # Disconnect from DB when done log.info('Exit: disconnect from DB') self.db.disconnect() def create_query(self, record): # We assume that all records do the same INSERT key/values # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s) query = "INSERT INTO %s (%s) VALUES (%s)" % ( self.cfg.get('table'), ",".join(['%s' % k for k in record]), ",".join(["%s", ] * len(record.keys()))) log.info('query is %s', query) return query def create_update_query(self, record): # We assume that all records do the same UPDATE key/values # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838 # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3; query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % ( self.cfg.get('table'), ",".join(['%s ' % k for k in record]), ",".join(["%s", ] * len(record.keys())), self.key, "%s") log.info('update query is %s', query) return query def insert(self, record): res = 0 if self.replace and self.key and self.key in record: # Replace option: try UPDATE if existing # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838 values = record.values() values.append(record[self.key]) res = self.db.execute(self.update_query, values) # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key]) # res = self.db.execute(del_query) if res < 1: # Do insert with values from the record dict # only if we did not do an UPDATE (res==0) on existing record. 
            self.db.execute(self.query, record.values())

        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)
        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)
            # log.info('committed record key=%s' % record[self.key])
        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
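Following create_query() and create_update_query() above, this is what the two templates come out as for a small sample record; the table name 'lml_files' and key 'file_name' are example values:

record = {'file_name': 'x.xml', 'file_data': '<gml/>'}

insert_tmpl = "INSERT INTO %s (%s) VALUES (%s)" % (
    'lml_files',
    ",".join(['%s' % k for k in record]),
    ",".join(["%s"] * len(record)))
# -> INSERT INTO lml_files (file_name,file_data) VALUES (%s,%s)

update_tmpl = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
    'lml_files',
    ",".join(['%s ' % k for k in record]),
    ",".join(["%s"] * len(record)),
    'file_name', "%s")
# -> UPDATE lml_files SET (file_name ,file_data ) = (%s,%s) WHERE file_name = %s

# insert() then passes record.values() (plus the key value for the UPDATE)
# as query parameters, so the DB driver handles quoting.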
class HarvesterInfluxDbInput(InfluxDbInput): """ InfluxDB TimeSeries (History) fetcher/formatter. Fetching all timeseries data from InfluxDB and putting these unaltered into Postgres DB. This is a continuus process. Strategy is to use checkpointing: keep track of each sensor/timeseries how far we are in harvesting. Algoritm: - fetch all Measurements (table names) - for each Measurement: - if Measurement (name) is not in progress-table insert and set day,hour to 0 - if in progress-table fetch entry (day, hour) - get timeseries (hours) available for that day - fetch and store each, starting with the last hour previously stored - ignore timeseries for current day/hour, as the hour will not be yet filled (and Refiner may else already process) - stored entry: measurement, day, hour, json blob - finish: when all done or when max_proc_time_secs passed """ @Config(ptype=int, default=None, required=True) def max_proc_time_secs(self): """ The maximum time in seconds we should continue processing input. Required: True Default: None """ pass @Config(ptype=str, default=None, required=False) def data_param_prefix(self): """ The prefix string to place before each parameter name in data, e.g. 'ase_'. Required: False Default: None """ pass @Config(ptype=dict, default=None, required=False) def meas_name_to_device_id(self): """ How to map InfluxDB Measurement names to SE device id's. e.g. {'Geonovum1' : '1181001', 'RIVM2' : '1181002'} Required: False Default: None """ pass @Config(ptype=str, default=None, required=True) def progress_table(self): """ The Postgres table tracking all last processed days/hours for each device. Required: True Default: None """ pass @Config(ptype=str, required=False, default='localhost') def pg_host(self): """ host name or host IP-address, defaults to 'localhost' """ pass @Config(ptype=str, required=False, default='5432') def pg_port(self): """ port for host, defaults to '5432' """ pass @Config(ptype=str, required=True) def pg_database(self): """ database name """ pass @Config(ptype=str, required=False, default='postgres') def pg_user(self): """ User name, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='postgres') def pg_password(self): """ User password, defaults to 'postgres' """ pass @Config(ptype=str, required=False, default='public') def pg_schema(self): """ The postgres schema name, defaults to 'public' """ pass def __init__(self, configdict, section): InfluxDbInput.__init__(self, configdict, section) self.current_time_secs = lambda: int(round(time.time())) self.start_time_secs = self.current_time_secs() self.progress_query = "SELECT * from %s where device_id=" % self.progress_table self.measurements = None self.measurements_info = [] self.index_m = -1 self.query = "SELECT * FROM %s WHERE time >= %d AND time < %d + 1h" def init(self): InfluxDbInput.init(self) postgis_cfg = { 'host': self.pg_host, 'port': self.pg_port, 'database': self.pg_database, 'user': self.pg_user, 'password': self.pg_password, 'schema': self.pg_schema } self.db = PostGIS(postgis_cfg) self.db.connect() # One time: get all measurements and related info and store in structure self.measurements = self.query_db('SHOW MEASUREMENTS') for measurement in self.measurements: measurement_name = measurement['name'] date_start_s = self.query_db( 'SELECT FIRST(calibrated), time FROM %s' % measurement_name)[0]['time'] start_ts = self.date_str_to_ts_nanos(date_start_s) date_end_s = self.query_db( 'SELECT LAST(calibrated), time FROM %s' % measurement_name)[0]['time'] end_ts = 
self.date_str_to_ts_nanos(date_end_s) device_id = measurement_name if self.meas_name_to_device_id: if measurement_name not in self.meas_name_to_device_id: log.error( 'No device_id mapped for measurement (table) %s' % measurement_name) raise Exception device_id = self.meas_name_to_device_id[measurement_name] # Shift time for current_ts from progress table if already in progress # otherwise use start time of measurement. current_ts = start_ts row_count = self.db.execute(self.progress_query + device_id) if row_count > 0: progress_rec = self.db.cursor.fetchone() ymd_last = str(progress_rec[4]) year_last = ymd_last[0:4] month_last = ymd_last[4:6] day_last = ymd_last[6:] hour_last = progress_rec[5] # e.g. 2017-11-17T11:00:00.411Z date_str = '%s-%s-%sT%d:00:00.0Z' % (year_last, month_last, day_last, hour_last) current_ts = self.date_str_to_ts_nanos(date_str) # skip to next hour current_ts += (3600 * NANOS_FACTOR) # Store all info per device (measurement table) in list of dict self.measurements_info.append({ 'name': measurement_name, 'date_start_s': date_start_s, 'start_ts': start_ts, 'date_end_s': date_end_s, 'end_ts': end_ts, 'current_ts': current_ts, 'device_id': device_id }) print(str(self.measurements_info)) def all_done(self): return len(self.measurements_info) == 0 def has_expired(self): if (self.current_time_secs() - self.start_time_secs) > self.max_proc_time_secs: return True return False def next_measurement_info(self): self.index_m += 1 return self.measurements_info[self.index_m % len(self.measurements_info)] def del_measurement_info(self): if not self.all_done(): del self.measurements_info[self.index_m % len(self.measurements_info)] def before_invoke(self, packet): if self.has_expired() or self.all_done(): # All devices read or timer expiry log.info('Processing halted: expired or all done') packet.set_end_of_stream() return False def date_str_to_ts_nanos(self, date_str): # See https://aboutsimon.com/blog/2013/06/06/Datetime-hell-Time-zone-aware-to-UNIX-timestamp.html # e.g. 
2017-11-17T11:00:00.411Z timestamp = timegm( time.strptime(date_str.replace('Z', 'GMT'), '%Y-%m-%dT%H:%M:%S.%f%Z')) # print(timestamp) # Shift timestamp to next whole hour timestamp = (timestamp - (timestamp % 3600) + 3600) * NANOS_FACTOR # d = datetime.utcfromtimestamp(timestamp) # print('-> %s' % d.isoformat()) return timestamp # def next_whole_hour_from_date(self, date): # date_s = self.query_db('SELECT FIRST(calibrated), time FROM %s' % measurement)[0]['time'] # return parser.parse(date_s) def read(self, packet): measurement_info = self.next_measurement_info() current_ts_nanos = measurement_info['current_ts'] current_ts_secs = current_ts_nanos / NANOS_FACTOR query = self.query % (measurement_info['name'], current_ts_nanos, current_ts_nanos) data = self.query_db(query) if len(data) >= 1: d = datetime.utcfromtimestamp(current_ts_secs) day = '%d%d%d' % (d.year, d.month, d.day) hour = '%d' % (d.hour + 1) # DEBUG: store only first and last of hour-series # data_first = data[0] # data_last = data[len(data)-1] data_o = data data = [] for i in range(0, 24): data.append(data_o[i]) # data.append(data_first) # data.append(data_last) packet.data = self.format_data(measurement_info['device_id'], day, hour, data) # Shift time an hour for this device current_ts_nanos = (current_ts_secs + 3600) * NANOS_FACTOR if current_ts_nanos > measurement_info['end_ts']: # all done for current measurement/device self.del_measurement_info() else: # Shift to next hour for this measurement measurement_info['current_ts'] = current_ts_nanos return packet # Create a data record for timeseries of current device/day/hour def format_data(self, device_id, day, hour, data): # # -- Map this to # CREATE TABLE smartem_raw.timeseries ( # gid serial, # unique_id character varying (16), # insert_time timestamp with time zone default current_timestamp, # device_id integer, # day integer, # hour integer, # data json, # complete boolean default false, # PRIMARY KEY (gid) # ); # Create record with JSON text blob with metadata record = dict() record['unique_id'] = '%s-%s-%s' % (device_id, day, hour) # Timestamp of sample record['device_id'] = device_id record['day'] = day record['hour'] = hour # Optional prefix for each param, usually sensor-box type e.g. "ase_" if self.data_param_prefix: for data_elm in data: keys = data_elm.keys() # https://stackoverflow.com/questions/4406501/change-the-name-of-a-key-in-dictionary for key in keys: data_elm[self.data_param_prefix + key] = data_elm.pop(key) # Add JSON text blob record['data'] = json.dumps({ 'id': device_id, 'date': day, 'hour': hour, 'timeseries': data }) return record
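As a worked example of date_str_to_ts_nanos() above: the InfluxDB UTC string is parsed with timegm/strptime and then shifted up to the next whole hour before scaling to nanoseconds. NANOS_FACTOR is assumed to be 10**9; it is defined elsewhere in the module:

from calendar import timegm
import time

NANOS_FACTOR = 10 ** 9  # assumed value

def to_next_hour_nanos(date_str):
    # Same steps as date_str_to_ts_nanos(): parse the UTC string,
    # round up to the next whole hour, convert to nanoseconds.
    timestamp = timegm(time.strptime(date_str.replace('Z', 'GMT'),
                                     '%Y-%m-%dT%H:%M:%S.%f%Z'))
    return (timestamp - (timestamp % 3600) + 3600) * NANOS_FACTOR

# '2017-11-17T11:00:00.411Z' -> epoch of 2017-11-17T12:00:00Z in nanoseconds
print(to_next_hour_nanos('2017-11-17T11:00:00.411Z'))  # 1510920000000000000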