Example #1
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key" is
    attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent each record needs to contain
    all values as an INSERT and UPDATE query template is built once for the columns
    in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'), ",".join(['%s' % k for k in record]), ",".join(["%s", ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE  %s = %s" % (
            self.cfg.get('table'), ",".join(['%s ' % k for k in record]), ",".join(["%s", ] * len(record.keys())), self.key, "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        res = 0
        if self.replace and self.key and self.key in record:

            # Replace option: try UPDATE if existing
            # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
            values = record.values()
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)
            # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key])
            # res = self.db.execute(del_query)

        if res < 1:
            # Do insert with values from the record dict
            # only if we did not do an UPDATE (res==0) on existing record.
            self.db.execute(self.query, record.values())
        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)

            # log.info('committed record key=%s' % record[self.key])

        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)

            log.info('committed %d records' % len(record))

        return packet
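
For reference, this is the shape of the templates that create_query() and create_update_query() build above; a minimal standalone sketch with a hypothetical two-column record, table and key (the %s placeholders are bound later by db.execute):

# Minimal standalone sketch of the template building above; record, table and
# key are hypothetical, values are bound later via db.execute().
record = {'file_name': 'day1.xml', 'file_data': '<gml/>'}
table = 'lml_files'
key = 'file_name'

insert_query = "INSERT INTO %s (%s) VALUES (%s)" % (
    table, ",".join(record.keys()), ",".join(["%s"] * len(record)))
update_query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
    table, ",".join(record.keys()), ",".join(["%s"] * len(record)), key, "%s")

print(insert_query)
# INSERT INTO lml_files (file_name,file_data) VALUES (%s,%s)
print(update_query)
# UPDATE lml_files SET (file_name,file_data) = (%s,%s) WHERE file_name = %s
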
Example #2
class Josene(Device):

    def __init__(self):
        Device.__init__(self, 'jose')
        self.model_query = "SELECT id,parameters,model from calibration_models WHERE predicts = '%s' AND invalid = FALSE ORDER BY timestamp DESC LIMIT 1"
        self.state_query = "SELECT state from calibration_state WHERE process = '%s' AND model_id = %d ORDER BY timestamp DESC LIMIT 1"
        self.state_insert = "INSERT INTO calibration_state (process, model_id, state) VALUES ('%s', %d, '%s')"
        self.sensor_model_names = {
            'co': 'carbon_monoxide__air_',
            'no2': 'nitrogen_dioxide__air_',
            'o3': 'ozone__air_'
        }
        self.config_dict = None

    def init(self, config_dict):

        self.config_dict = config_dict
        self.process_name = config_dict['process_name']
        self.db = PostGIS(config_dict)
        self.db.connect()

        ids = dict()
        parameters = dict()
        models = dict()
        state = dict()

        # Query ANN Calibration Model and its State from DB for each calibrated sensor.
        if self.model_query is not None and len(self.sensor_model_names) > 0:
            log.info('Getting calibration models and state from database')
            for k in self.sensor_model_names:
                v = self.sensor_model_names[k]
                id, param, model = self.query_model(v)
                ids[k] = id
                parameters[k] = param
                models[k] = model

                model_state = self.query_state(id)
                state[k] = model_state

        else:
            log.info('No query for fetching calibration models given or no '
                     'mapping for calibration models to gas components given.')

        # Put Model and State info in the Device definitions.
        for k in ids:
            SENSOR_DEFS[k]['converter_model']['model_id'] = ids[k]
        for k in parameters:
            SENSOR_DEFS[k]['converter_model']['running_mean_weights'] = parameters[k]
        for k in models:
            SENSOR_DEFS[k]['converter_model']['mlp_regressor'] = models[k]
        for k, v in state.iteritems():
            for device_id, device_state in v.iteritems():
                for gas, gas_state in device_state.iteritems():
                    v[device_id][gas] = RunningMean.from_dict(gas_state)
            SENSOR_DEFS[k]['converter_model']['state'] = v

    def exit(self):
        # Save the calibration state.
        for k in self.sensor_model_names:
            model = SENSOR_DEFS[k]['converter_model']
            self.save_state(model['model_id'], json.dumps(model['state']))

        self.db.commit(close=False)

    def get_sensor_defs(self):
        return SENSOR_DEFS

    def raw_query(self, query_str):
        self.db.execute(query_str)

        db_records = self.db.cursor.fetchall()
        log.info('read recs: %d' % len(db_records))

        return db_records

    def query_model(self, name):
        query = self.model_query % name
        log.info('Getting calibration model with query: %s' % query)
        ret = self.raw_query(query)
        if len(ret) > 0:
            id, parameters, model = ret[0]
            return id, parameters, pickle.loads(model)
        else:
            log.warn("No model found for %s" % name)
            return None, {}, {}

    def query_state(self, model_id):
        query = self.state_query % (self.process_name, model_id)
        log.info('Getting calibration model state with query: %s' % query)
        ret = self.raw_query(query)
        if len(ret) > 0:
            return ret[0][0]
        else:
            log.warn("No state found for model_id=%d" % model_id)
            return {}

    def save_state(self, model_id, state):
        insert_query = self.state_insert % (self.process_name, model_id, state)
        log.info('Inserting calibration model state for process %s model_id=%d' % (self.process_name, model_id))

        ret = self.db.execute(insert_query)
        if ret != 1:
            log.warn('Cannot save state for process %s model_id=%d' % (self.process_name, model_id))

    # Get raw sensor value or list of values
    def get_raw_value(self, name, val_dict):
        val = None
        if type(name) is list:
            # name is a list of names: use only the first one
            name = name[0]
            return self.get_raw_value(name, val_dict)
            # Alternative (unused): collect values for all names
            # for n in name:
            #     if n in val_dict:
            #         if val is None:
            #             val = []
            #         val.append(val_dict[n])
        else:
            # name is single name
            if name in val_dict:
                val = val_dict[name]

        if 'audio' in name:
            # We may have audio encoded in 3 bands
            bands = [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)]
            val = bands[0]

        return val, name

    # Check for valid sensor value
    def check_value(self, name, val_dict, value=None):
        val = None
        if type(name) is list:
            # name is list of names
            for n in name:
                result, reason = self.check_value(n, val_dict, value)
                if result is False:
                    return result, reason
        else:
            # name is single name
            if name not in val_dict and value is None:
                return False, '%s not present' % name
            else:
                if value is not None:
                    val = value
                else:
                    val = val_dict[name]

                if val is None:
                    return False, '%s is None' % name

                if name not in SENSOR_DEFS:
                    return False, '%s not in SENSOR_DEFS' % name

                name_def = SENSOR_DEFS[name]

                # Audio inputs: need to unpack 3 bands and check for decibel vals
                if 'audio' in name:
                    bands = [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)]

                    # determine validity of these 3 bands
                    dbMin = name_def['min']
                    dbMax = name_def['max']
                    err_cnt = 0
                    msg = ''
                    for i in range(0, len(bands)):
                        band_val = bands[i]
                        # accumulate outliers
                        if band_val < dbMin:
                            err_cnt +=1
                            msg += '%s: val(%s) < min(%s)\n' % (name, str(band_val), str(name_def['min']))
                        elif band_val > dbMax:
                            err_cnt +=1
                            msg += '%s: val(%s) > max(%s)\n' % (name, str(band_val), str(name_def['max']))

                    # Only invalid if all bands outside range
                    if err_cnt >= len(bands):
                        return False, msg

                    return True, '%s OK' % name

                if 'min' in name_def and val < name_def['min']:
                    return False, '%s: val(%s) < min(%s)' % (name, str(val), str(name_def['min']))

                if 'max' in name_def and val > name_def['max']:
                    return False, '%s: val(%s) > max(%s)' % (name, str(val), str(name_def['max']))

        return True, '%s OK' % name

    # Get location as lon, lat
    def get_lon_lat(self, val_dict):
        result = (None, None)
        if 's_longitude' in val_dict and 's_latitude' in val_dict:
            lon = SENSOR_DEFS['longitude']['converter'](val_dict['s_longitude'])
            lat = SENSOR_DEFS['latitude']['converter'](val_dict['s_latitude'])

            valid, reason = self.check_value('latitude', val_dict, value=lat)
            if not valid:
                return result

            valid, reason = self.check_value('longitude', val_dict, value=lon)
            if not valid:
                return result

            result = (lon, lat)

        return result
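
get_raw_value() and check_value() assume that a raw audio value packs three octave bands into one integer, 8 bits per band. A small standalone sketch of that packing and unpacking (band values invented):

# Standalone sketch of the 3-band audio packing assumed by the code above:
# one raw integer carries three dB values, 8 bits each.
def pack_bands(band0, band1, band2):
    return (int(band2) << 16) | (int(band1) << 8) | int(band0)

def unpack_bands(val):
    return [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)]

raw = pack_bands(55, 61, 48)   # hypothetical dB values
print(unpack_bands(raw))       # [55.0, 61.0, 48.0]
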
Example #3
    def write(self, packet):
        if packet.data is None:
            return packet

        gml_doc = packet.data
        log.info('inserting features in DB')
        db = PostGIS(self.cfg.get_dict())
        db.connect()
        #        print self.to_string(gml_doc, False, False)
        #        NS = {'base': 'urn:x-inspire:specification:gmlas:BaseTypes:3.2', 'gml': 'http://www.opengis.net/gml/3.2'}
        #        featureMembers = gml_doc.xpath('//base:member/*', namespaces=NS)
        featureMembers = gml_doc.xpath("//*[local-name() = '%s']/*" % self.feature_member_tag)
        count = 0
        gml_ns = None
        for childNode in featureMembers:
            if gml_ns is None:
                if childNode.nsmap.has_key('gml'):
                    gml_ns = childNode.nsmap['gml']
                else:
                    if childNode.nsmap.has_key('GML'):
                        gml_ns = childNode.nsmap['GML']

            gml_id = childNode.get('{%s}id' % gml_ns)

            feature_type_id = self.feature_type_ids[childNode.tag]

            # Find a GML geometry in the GML NS
            ogrGeomWKT = None
            #            gmlMembers = childNode.xpath(".//gml:Point|.//gml:Curve|.//gml:Surface|.//gml:MultiSurface", namespaces=NS)
            gmlMembers = childNode.xpath(
                ".//*[local-name() = 'Point']|.//*[local-name() = 'Polygon']|.//*[local-name() = 'Curve']|.//*[local-name() = 'Surface']|.//*[local-name() = 'MultiSurface']")
            geom_str = None
            for gmlMember in gmlMembers:
                if geom_str is None:
                    geom_str = etree.tostring(gmlMember)
                #                   no need for GDAL Python bindings for now, maybe when we'll optimize with COPY iso INSERT
            #                    ogrGeom = ogr.CreateGeometryFromGML(str(gmlStr))
            #                    if ogrGeom is not None:
            #                        ogrGeomWKT = ogrGeom.ExportToWkt()
            #                        if ogrGeomWKT is not None:
            #                            break

            blob = etree.tostring(childNode, pretty_print=False, xml_declaration=False, encoding='UTF-8')

            if geom_str is None:
                sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object) VALUES (%s, %s, %s)"
                parameters = (gml_id, feature_type_id, db.make_bytea(blob))
            else:
                # ST_SetSRID(ST_GeomFromGML(%s)),-1)
                sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object, gml_bounded_by) VALUES (%s, %s, %s, ST_SetSRID( ST_GeomFromGML(%s),%s) )"
                parameters = (gml_id, feature_type_id, db.make_bytea(blob), geom_str, self.srid)

            if db.execute(sql, parameters) == -1:
                log.error("feat num# = %d error inserting feature blob=%s (but continuing)" % (count, blob))

                # will fail but we will close connection also
                db.commit()

                # proceed...
                log.info('retrying to proceed with remaining features...')
                db = PostGIS(self.cfg.get_dict())
                db.connect()
                count = 0

            count += 1

        exception = db.commit()
        if exception is not None:
            log.error("error in commit")

        log.info("inserted %s features" % count)
        return packet
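
The XPath expressions above match elements by local-name(), so the queries work whatever namespace prefixes the source GML happens to use. A small standalone lxml sketch of that trick (document contents invented):

# Standalone sketch of the local-name() matching used in write() above.
from lxml import etree

gml_doc = etree.fromstring(
    '<wfs:member xmlns:wfs="http://www.opengis.net/wfs/2.0" '
    'xmlns:gml="http://www.opengis.net/gml/3.2">'
    '<gml:Point gml:id="p1"><gml:pos>5.1 52.0</gml:pos></gml:Point>'
    '</wfs:member>')

# Matches the children of the feature member tag regardless of its prefix.
feature_members = gml_doc.xpath("//*[local-name() = '%s']/*" % 'member')
print([node.tag for node in feature_members])
# ['{http://www.opengis.net/gml/3.2}Point']
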
Example #4
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.
    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key" is
    attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent each record needs to contain
    all values as an INSERT and UPDATE query template is built once for the columns
    in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self,
                          configdict,
                          section,
                          consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'), ",".join(['%s' % k
                                             for k in record]), ",".join([
                                                 "%s",
                                             ] * len(record.keys())))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        # We assume that all records do the same UPDATE key/values
        # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
        # e.g. UPDATE table SET field='C', field2='Z' WHERE id=3;
        query = "UPDATE %s SET (%s) = (%s) WHERE  %s = %s" % (self.cfg.get(
            'table'), ",".join(['%s ' % k for k in record]), ",".join([
                "%s",
            ] * len(record.keys())), self.key, "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        res = 0
        if self.replace and self.key and self.key in record:

            # Replace option: try UPDATE if existing
            # https://stackoverflow.com/questions/1109061/insert-on-duplicate-update-in-postgresql/6527838#6527838
            values = record.values()
            values.append(record[self.key])
            res = self.db.execute(self.update_query, values)
            # del_query = "DELETE FROM %s WHERE %s = '%s'" % (self.cfg.get('table'), self.key, record[self.key])
            # res = self.db.execute(del_query)

        if res < 1:
            # Do insert with values from the record dict
            # only if we did not do an UPDATE (res==0) on existing record.
            self.db.execute(self.query, record.values())
        self.db.commit(close=False)

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.insert(record)

            # log.info('committed record key=%s' % record[self.key])

        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.insert(rec)

            log.info('committed %d records' % len(record))

        return packet
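
The insert() method above implements the upsert as an UPDATE attempt followed by a plain INSERT, per the linked Stack Overflow discussion. For comparison only (this is not what the class does), PostgreSQL 9.5+ can express the same intent in one statement with ON CONFLICT; a hedged sketch with hypothetical table and columns:

# Hypothetical single-statement alternative (PostgreSQL 9.5+), shown only for
# comparison with the UPDATE-then-INSERT flow of insert() above.
upsert_query = (
    "INSERT INTO lml_files (file_name, file_data) VALUES (%s, %s) "
    "ON CONFLICT (file_name) DO UPDATE SET file_data = EXCLUDED.file_data"
)
# self.db.execute(upsert_query, (record['file_name'], record['file_data']))
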
Example #5
class DeegreeBlobstoreInput(Input):
    """
    Read features from deegree Blobstore DB into an etree doc.

    produces=FORMAT.etree_doc
    """

    # Start attribute config meta

    @Config(ptype=int, required=False, default=10000)
    def max_features_per_doc(self):
        """
        Max features to read from input feature GML stream per internal document.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def start_container(self):
        """
        Tag that starts container.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def end_container(self):
        """
        Tag that ends container.
        """
        pass

    @Config(ptype=str, required=False, default=False)
    def start_feature_tag(self):
        """
        XML tag that starts Feature.
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def end_feature_tag(self):
        """
        XML tag that ends Feature.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section):
        Input.__init__(self, configdict, section, produces=FORMAT.etree_doc)
        self.cur_feature_blob = None
        self.rowcount = 0

        # http://www.mkyong.com/regular-expressions/how-to-extract-html-links-with-regular-expression/
        self.regex_xlink_href = re.compile(
            "\\s*(?i)xlink:href\\s*=\\s*(\"#([^\"]*\")|'#[^']*'|(#[^'\">\\s]+))"
        )

        self.db = None
        self.xlink_db = None
        self.buffer = None
        self.feature_count = 0
        # Reusable XML parser
        self.xml_parser = etree.XMLParser(remove_blank_text=True)

    def init(self):
        pass

    def read(self, packet):
        if packet.is_end_of_stream():
            return packet

        if self.db is None:
            # First time read
            log.info("reading records from blobstore..")
            self.db = PostGIS(self.cfg.get_dict())
            self.db.connect()
            sql = self.cfg.get('sql')
            self.rowcount = self.db.execute(sql)
            self.cur = self.db.cursor
            log.info("Read records rowcount=%d" % self.rowcount)

            # Init separate connection to fetch objects referenced by xlink:href
            self.xlink_db = PostGIS(self.cfg.get_dict())
            self.xlink_db.connect()

        # Query active
        while self.cur is not None:
            if self.buffer is None:
                self.buffer = self.init_buf()
                self.buffer.write(self.start_container)

            # Get next blob record
            record = self.cur.fetchone()

            # End of all records
            if record is None:
                # End of records: start closing
                self.buffer.write(self.end_container)
                self.cur = None
                self.db.commit()

                # Only create doc if there are features in the buffer
                if self.feature_count > 0:
                    self.buffer_to_doc(packet)
                packet.set_end_of_doc()
                break
            else:
                # New record: embed feature blob in feature tags and write to buffer
                feature_blob = self.write_feature(record)

                # If we have local xlinks: fetch the related features as well from the DB and
                # output them within the same document (local href resolvable)
                # TODO: in some cases we may need to be recursive (xlinks in xlinked features...)

                # First construct a single query for all xlinks
                xlink_sql = None
                for xlink in self.regex_xlink_href.finditer(feature_blob):
                    gml_id = xlink.group(1).strip('"').strip('#')
                    # We don't want multiple occurences of the same xlinked feature
                    if gml_id in self.xlink_ids:
                        continue

                    self.xlink_ids.add(gml_id)
                    if xlink_sql is None:
                        xlink_sql = "SELECT binary_object from gml_objects where gml_id = '%s'" % gml_id
                    else:
                        xlink_sql += "OR gml_id = '%s'" % gml_id

                # Should we retrieve and write xlinked features?
                if xlink_sql is not None:
                    # Fetch from DB
                    self.xlink_db.execute(xlink_sql)
                    while True:
                        # Get next blob record
                        xlink_record = self.xlink_db.cursor.fetchone()
                        if xlink_record is None:
                            break
                        self.write_feature(xlink_record)

                # Should we output a doc
                if self.feature_count >= self.max_features_per_doc:
                    # End of records: create XML doc
                    self.buffer.write(self.end_container)
                    self.buffer_to_doc(packet)
                    break

        if self.cur is None:
            # All records handled: close off
            packet.set_end_of_stream()
            # log.info("[%s]" % packet.data)

        return packet

    def write_feature(self, record):
        feature_blob = str(record[0])

        # Write start-tag, blob element, end-tag
        self.buffer.write(self.start_feature_tag)
        self.buffer.write(feature_blob)
        self.buffer.write(self.end_feature_tag)
        self.feature_count += 1
        return feature_blob

    def init_buf(self):
        buffer = StringIO()
        buffer = codecs.getwriter("utf8")(buffer)
        self.feature_count = 0
        self.xlink_ids = set()
        return buffer

    def buffer_to_doc(self, packet):
        # Process/transform data in buffer
        self.buffer.seek(0)
        try:
            packet.data = etree.parse(self.buffer, self.xml_parser)
        except Exception as e:
            bufStr = self.buffer.getvalue()
            if not bufStr:
                log.info("parse buffer empty: content=[%s]" % bufStr)
            else:
                log.error("error in buffer parsing %s" % str(e))
                raise
        self.buffer.close()
        self.buffer = None
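
The xlink handling above pulls every local "#id" reference out of the raw feature blob with regex_xlink_href and batches the lookups into a single SELECT. A standalone sketch of the extraction step (blob contents invented; the case-insensitive flag is passed explicitly here because a mid-pattern (?i) is rejected by recent Python versions):

import re

# Same pattern as regex_xlink_href above, with the flag passed explicitly.
regex_xlink_href = re.compile(
    "\\s*xlink:href\\s*=\\s*(\"#([^\"]*\")|'#[^']*'|(#[^'\">\\s]+))",
    re.IGNORECASE)

feature_blob = '<Building xlink:href="#b12"/><Road xlink:href="#r7"/>'  # hypothetical
gml_ids = set()
for xlink in regex_xlink_href.finditer(feature_blob):
    gml_ids.add(xlink.group(1).strip('"').strip('#'))
print(sorted(gml_ids))   # ['b12', 'r7']
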
Example #6
class DeegreeBlobstoreInput(Input):
    """
    Read features from deegree Blobstore DB into an etree doc.

    produces=FORMAT.etree_doc
    """

    # Start attribute config meta

    @Config(ptype=int, required=False, default=10000)
    def max_features_per_doc(self):
        """
        Max features to read from input feature GML stream per internal document.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def start_container(self):
        """
        Tag that starts container.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def end_container(self):
        """
        Tag that ends container.
        """
        pass

    @Config(ptype=str, required=False, default=False)
    def start_feature_tag(self):
        """
        XML tag that starts Feature.
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def end_feature_tag(self):
        """
        XML tag that ends Feature.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section):
        Input.__init__(self, configdict, section, produces=FORMAT.etree_doc)
        self.cur_feature_blob = None
        self.rowcount = 0

        # http://www.mkyong.com/regular-expressions/how-to-extract-html-links-with-regular-expression/
        self.regex_xlink_href = re.compile("\\s*(?i)xlink:href\\s*=\\s*(\"#([^\"]*\")|'#[^']*'|(#[^'\">\\s]+))")

        self.db = None
        self.xlink_db = None
        self.buffer = None
        self.feature_count = 0
        # Reusable XML parser
        self.xml_parser = etree.XMLParser(remove_blank_text=True)

    def init(self):
        pass

    def read(self, packet):
        if packet.is_end_of_stream():
            return packet

        if self.db is None:
            # First time read
            log.info("reading records from blobstore..")
            self.db = PostGIS(self.cfg.get_dict())
            self.db.connect()
            sql = self.cfg.get('sql')
            self.rowcount = self.db.execute(sql)
            self.cur = self.db.cursor
            log.info("Read records rowcount=%d" % self.rowcount)

            # Init separate connection to fetch objects referenced by xlink:href
            self.xlink_db = PostGIS(self.cfg.get_dict())
            self.xlink_db.connect()

        # Query active
        while self.cur is not None:
            if self.buffer is None:
                self.buffer = self.init_buf()
                self.buffer.write(self.start_container)

            # Get next blob record
            record = self.cur.fetchone()

            # End of all records
            if record is None:
                # End of records: start closing
                self.buffer.write(self.end_container)
                self.cur = None
                self.db.commit()

                # Only create doc if there are features in the buffer
                if self.feature_count > 0:
                    self.buffer_to_doc(packet)
                packet.set_end_of_doc()
                break
            else:
                # New record: embed feature blob in feature tags and write to buffer
                feature_blob = self.write_feature(record)

                # If we have local xlinks: fetch the related features as well from the DB and
                # output them within the same document (local href resolvable)
                # TODO: in some cases we may need to be recursive (xlinks in xlinked features...)

                # First construct a single query for all xlinks
                xlink_sql = None
                for xlink in self.regex_xlink_href.finditer(feature_blob):
                    gml_id = xlink.group(1).strip('"').strip('#')
                    # We don't want multiple occurences of the same xlinked feature
                    if gml_id in self.xlink_ids:
                        continue

                    self.xlink_ids.add(gml_id)
                    if xlink_sql is None:
                        xlink_sql = "SELECT binary_object from gml_objects where gml_id = '%s'" % gml_id
                    else:
                        xlink_sql += "OR gml_id = '%s'" % gml_id

                # Should we retrieve and write xlinked features?
                if xlink_sql is not None:
                    # Fetch from DB
                    self.xlink_db.execute(xlink_sql)
                    while True:
                        # Get next blob record
                        xlink_record = self.xlink_db.cursor.fetchone()
                        if xlink_record is None:
                            break
                        self.write_feature(xlink_record)

                # Should we output a doc
                if self.feature_count >= self.max_features_per_doc:
                    # End of records: create XML doc
                    self.buffer.write(self.end_container)
                    self.buffer_to_doc(packet)
                    break

        if self.cur is None:
            # All records handled: close off
            packet.set_end_of_stream()
            # log.info("[%s]" % packet.data)

        return packet

    def write_feature(self, record):
        feature_blob = str(record[0])

        # Write start-tag, blob element, end-tag
        self.buffer.write(self.start_feature_tag)
        self.buffer.write(feature_blob)
        self.buffer.write(self.end_feature_tag)
        self.feature_count += 1
        return feature_blob

    def init_buf(self):
        buffer = StringIO()
        buffer = codecs.getwriter("utf8")(buffer)
        self.feature_count = 0
        self.xlink_ids = set()
        return buffer

    def buffer_to_doc(self, packet):
        # Process/transform data in buffer
        self.buffer.seek(0)
        try:
            packet.data = etree.parse(self.buffer, self.xml_parser)
        except Exception as e:
            bufStr = self.buffer.getvalue()
            if not bufStr:
                log.info("parse buffer empty: content=[%s]" % bufStr)
            else:
                log.error("error in buffer parsing %s" % str(e))
                raise
        self.buffer.close()
        self.buffer = None
Example #7
    def write(self, packet):
        if packet.data is None:
            return packet

        gml_doc = packet.data
        log.info('inserting features in DB')
        db = PostGIS(self.cfg.get_dict())
        db.connect()
        #        print self.to_string(gml_doc, False, False)
        #        NS = {'base': 'urn:x-inspire:specification:gmlas:BaseTypes:3.2', 'gml': 'http://www.opengis.net/gml/3.2'}
        #        featureMembers = gml_doc.xpath('//base:member/*', namespaces=NS)
        featureMembers = gml_doc.xpath("//*[local-name() = '%s']/*" %
                                       self.feature_member_tag)
        count = 0
        gml_ns = None
        for childNode in featureMembers:
            if gml_ns is None:
                if childNode.nsmap.has_key('gml'):
                    gml_ns = childNode.nsmap['gml']
                else:
                    if childNode.nsmap.has_key('GML'):
                        gml_ns = childNode.nsmap['GML']

            gml_id = childNode.get('{%s}id' % gml_ns)

            feature_type_id = self.feature_type_ids[childNode.tag]

            # Find a GML geometry in the GML NS
            ogrGeomWKT = None
            #            gmlMembers = childNode.xpath(".//gml:Point|.//gml:Curve|.//gml:Surface|.//gml:MultiSurface", namespaces=NS)
            gmlMembers = childNode.xpath(
                ".//*[local-name() = 'Point']|.//*[local-name() = 'Polygon']|.//*[local-name() = 'Curve']|.//*[local-name() = 'Surface']|.//*[local-name() = 'MultiSurface']"
            )
            geom_str = None
            for gmlMember in gmlMembers:
                if geom_str is None:
                    geom_str = etree.tostring(gmlMember)
                #                   no need for GDAL Python bindings for now, maybe when we'll optimize with COPY iso INSERT
            #                    ogrGeom = ogr.CreateGeometryFromGML(str(gmlStr))
            #                    if ogrGeom is not None:
            #                        ogrGeomWKT = ogrGeom.ExportToWkt()
            #                        if ogrGeomWKT is not None:
            #                            break

            blob = etree.tostring(childNode,
                                  pretty_print=False,
                                  xml_declaration=False,
                                  encoding='UTF-8')

            if geom_str is None:
                sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object) VALUES (%s, %s, %s)"
                parameters = (gml_id, feature_type_id, db.make_bytea(blob))
            else:
                # ST_SetSRID(ST_GeomFromGML(%s)),-1)
                sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object, gml_bounded_by) VALUES (%s, %s, %s, ST_SetSRID( ST_GeomFromGML(%s),%s) )"
                parameters = (gml_id, feature_type_id, db.make_bytea(blob),
                              geom_str, self.srid)

            if db.execute(sql, parameters) == -1:
                log.error(
                    "feat num# = %d error inserting feature blob=%s (but continuing)"
                    % (count, blob))

                # will fail but we will close connection also
                db.commit()

                # proceed...
                log.info('retrying to proceed with remaining features...')
                db = PostGIS(self.cfg.get_dict())
                db.connect()
                count = 0

            count += 1

        exception = db.commit()
        if exception is not None:
            log.error("error in commit")

        log.info("inserted %s features" % count)
        return packet
Example #8
class WeewxDbInput(SqliteDbInput):
    """
    Reads weewx raw archive records from SQLite.
    """
    def __init__(self, configdict, section):
        SqliteDbInput.__init__(self, configdict, section)
        self.progress_query = self.cfg.get('progress_query')
        self.progress_update = self.cfg.get('progress_update')

        # Connect only once to DB
        log.info('Init: connect to Postgres DB')
        self.progress_db = PostGIS(self.cfg.get_dict())
        self.progress_db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.progress_db.disconnect()

    def after_chain_invoke(self, packet):
        """
        Called right after entire Component Chain invoke.
        Used to update last id of processed file record.
        """
        # last_datetime.datetime.fromtimestamp(self.last_id).strftime('%Y-%m-%d %H:%M:%S')
        ts_local = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime(self.last_id))

        log.info('Updating progress table ts_unix=%d ts_local=%s' % (self.last_id, ts_local))
        self.progress_db.execute(self.progress_update % (self.last_id, ts_local))
        self.progress_db.commit(close=False)
        log.info('Update progress table ok')
        return True

    def read(self, packet):

        # Get last processed id of archive table
        self.progress_db.execute(self.progress_query)
        progress_rec = self.progress_db.cursor.fetchone()
        self.last_id = progress_rec[3]
        log.info('progress record: %s' % str(progress_rec))

        # Fetch next batch of archive records
        archive_recs = self.do_query(self.query % self.last_id)

        log.info('read archive_recs: %d' % len(archive_recs))

        # No more records to process?
        if len(archive_recs) == 0:
            packet.set_end_of_stream()
            log.info('Nothing to do. All file_records done')
            return packet

        # Remember last id processed for next query
        self.last_id = archive_recs[-1].get('dateTime')

        packet.data = archive_recs

        # Always stop after batch, otherwise we would continue forever
        packet.set_end_of_stream()

        return packet
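
WeewxDbInput keeps its read position in a separate Postgres progress table: progress_query must return the last processed dateTime in column index 3 of the first row, and progress_update is filled with (last_id, local timestamp) after every chain invoke. A purely hypothetical pair of configuration queries illustrating that contract:

# Hypothetical queries matching the contract used by read() and
# after_chain_invoke() above: progress_rec[3] holds the last processed id,
# and progress_update takes (last_id, ts_local) in that order.
progress_query = "SELECT id, source, rows_done, last_id FROM weewx_progress WHERE source = 'archive'"
progress_update = "UPDATE weewx_progress SET last_id = %d, last_time = '%s' WHERE source = 'archive'"
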
Example #9
class ProgressTracker(Filter):
    """"
    Filter to track progress of a stream of processed records.
    Stores progress (last id, last timestamp etc) in Postgres table.
    """

    @Config(ptype=str, required=False, default='localhost')
    def host(self):
        """
        host name or host IP-address, defaults to 'localhost'
        """
        pass

    @Config(ptype=str, required=False, default='5432')
    def port(self):
        """
        port for host, defaults to '5432'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def user(self):
        """
        User name, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def password(self):
        """
        User password, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='public')
    def schema(self):
        """
        The postgres schema name, defaults to 'public'
        """
        pass

    @Config(ptype=str, required=False, default='progress')
    def table(self):
        """
        Table name, defaults to 'progress'.
        """
        pass

    @Config(ptype=str, required=True)
    def progress_update_query(self):
        """
        Query to update progress

        Required: True

        Default: ""
        """
        pass

    @Config(ptype=str, required=True)
    def id_key(self):
        """
        Key to select id from record array

        Required: True
        """

    @Config(ptype=str, default=None, required=False)
    def name_key(self):
        """
        Key to select name from record array

        Required: False
        """

    def __init__(self, config_dict, section):
        Filter.__init__(self, config_dict, section,
                        consumes=[FORMAT.record_array, FORMAT.record],
                        produces=[FORMAT.record_array, FORMAT.record])
        self.last_ids = None
        self.db = None
        
    def init(self):
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def invoke(self, packet):
        self.last_ids = dict()

        if packet.data is None or packet.is_end_of_doc() or packet.is_end_of_stream():
            log.info("No packet data or end of doc/stream")
            return packet

        record_in = packet.data
        if type(record_in) is not list:
            record_in = [record_in]

        for record in record_in:
            if self.name_key is not None:
                name = record[self.name_key]
            else:
                name = "all"
            if len(record) > 0:
                new = record[self.id_key]
                self.last_ids[name] = max(self.last_ids.get(name, -1), new)

        log.info("Last ids are: %s", str(self.last_ids))

        return packet

    def after_chain_invoke(self, packet):
        """
        Called right after entire Component Chain invoke.
        Used to update last id of processed file record.
        """
        if not self.last_ids:
            log.info('No update for progress table')

        for name in self.last_ids:
            param_tuple = (self.last_ids[name], name)
            log.info('Updating progress table with (id=%d, name=%s)' % param_tuple)
            self.db.execute(self.progress_update_query % param_tuple)
            self.db.commit(close=False)
            log.info('Update progress table ok')
        
        return True
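
invoke() only records, per name, the highest id seen in the packet; the actual write happens in after_chain_invoke(), where progress_update_query receives (id, name) substitutions in that order. A standalone sketch of that bookkeeping with invented records and query:

# Standalone sketch of the per-name max-id bookkeeping done by invoke() and
# the (id, name) substitution done by after_chain_invoke(); names are invented.
records = [
    {'device_name': 'station_1', 'unique_id': 17},
    {'device_name': 'station_2', 'unique_id': 5},
    {'device_name': 'station_1', 'unique_id': 23},
]
id_key, name_key = 'unique_id', 'device_name'

last_ids = {}
for record in records:
    name = record[name_key]
    last_ids[name] = max(last_ids.get(name, -1), record[id_key])
print(last_ids)   # {'station_1': 23, 'station_2': 5}

progress_update_query = "UPDATE progress SET last_id = %d WHERE name = '%s'"  # hypothetical
for name, last_id in last_ids.items():
    print(progress_update_query % (last_id, name))
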
Example #10
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record into a Postgres database table.
    Input is a record (Python dict structure) or a Python list of dicts (records).
    Creates an INSERT for Postgres to insert each single record.

    consumes=FORMAT.record
    """

    def __init__(self, configdict, section, consumes=FORMAT.record):
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.db = None
        self.key = self.cfg.get('key')

    def init(self):
        # Connect only once to DB
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        # Disconnect from DB when done
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        # We assume that all records do the same INSERT key/values
        # See http://grokbase.com/t/postgresql/psycopg/12735bvkmv/insert-into-with-a-dictionary-or-generally-with-a-variable-number-of-columns
        # e.g. INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)
        query = "INSERT INTO %s (%s) VALUES (%s)" % (self.cfg.get('table'), ",".join(['%s' % k for k in record]), ",".join(["%s",]*len(record.keys())))
        log.info('query is %s', query)
        return query

    def write(self, packet):
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present

        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data

        # Generate INSERT query template once
        first_record = record
        if type(record) is list and len(record) > 0:
            first_record = record[0]

        # Create query once
        if self.query is None:
            self.query = self.create_query(first_record)

        # Check if record is single (dict) or array (list of dict)
        if type(record) is dict:
            # Do insert with values from the single record
            self.db.execute(self.query, record.values())
            self.db.commit(close=False)

            # log.info('committed record key=%s' % record[self.key])

        elif type(record) is list:
            # Multiple records in list
            for rec in record:
                # Do insert with values from the record
                self.db.execute(self.query, rec.values())
                self.db.commit(close=False)

            log.info('committed %d records' % len(record))

        return packet