def transform_input_gfs(self, formatted_xslt):
        """Apply an XSLT stylesheet to the XML document at ``self.input_gfs``.

        :param formatted_xslt: the complete stylesheet, as text or as bytes
        :return: the transformation result converted with ``str()``
        """
        # lxml refuses unicode text that carries an XML encoding declaration,
        # so text is encoded first; input that is already bytes is used as-is.
        if not isinstance(formatted_xslt, bytes):
            formatted_xslt = formatted_xslt.encode('utf8')

        xslt_doc = etree.fromstring(formatted_xslt)
        xslt_obj = etree.XSLT(xslt_doc)
        xml_doc = etree.parse(self.input_gfs)
        result = xslt_obj(xml_doc)

        return str(result)
    def transform_input_gfs(self, formatted_xslt):
        """Apply an XSLT stylesheet to the XML document at ``self.input_gfs``.

        :param formatted_xslt: the complete stylesheet, as text or as bytes
        :return: the transformation result converted with ``str()``
        """
        # The stylesheet is parsed from bytes. UTF-8 is used rather than ASCII
        # so non-ASCII characters in the stylesheet no longer raise
        # UnicodeEncodeError; ASCII-only input yields the same bytes as before.
        if not isinstance(formatted_xslt, bytes):
            formatted_xslt = formatted_xslt.encode('utf8')

        xslt_doc = etree.fromstring(formatted_xslt)
        xslt_obj = etree.XSLT(xslt_doc)
        xml_doc = etree.parse(self.input_gfs)
        result = xslt_obj(xml_doc)

        return str(result)
Example #3
0
    def flush_elements(self, packet):
        """Wrap the buffered elements in a freshly parsed container document.

        The packet is always marked end-of-doc. When elements are buffered,
        ``self.container_doc`` is parsed into a new tree, every buffered
        element is appended to the first node matched by
        ``self.element_container_xpath``, the tree is stored in
        ``packet.data`` and the buffer is emptied.

        :param packet: packet that receives the assembled document
        :return: the same packet; ``packet.data`` is left untouched when
            nothing is buffered, when the container document cannot be
            parsed, or when the container xpath matches no node
        """
        packet.set_end_of_doc()

        if not self.element_arr:
            return packet

        # Start new doc (TODO clone)
        try:
            etree_doc = etree.fromstring(self.container_doc, self.xml_parser)
        except Exception as e:
            log.error('new container doc not OK: %s' % str(e))
            return packet

        matches = etree_doc.xpath(self.element_container_xpath)
        if len(matches) == 0:
            # Nowhere to attach the elements: report it instead of silently
            # emitting an empty container, and drop the batch so the buffer
            # cannot grow without bound.
            log.error('no element found for container xpath: %s, dropping %d elements' %
                      (self.element_container_xpath, len(self.element_arr)))
            self.element_arr = []
            return packet

        parent_element = matches[0]
        for element in self.element_arr:
            parent_element.append(element)

        log.info('xmldoc ready: elms=%d total_elms=%d' %
                 (len(self.element_arr), self.total_element_count))
        packet.data = etree_doc
        self.element_arr = []
        return packet
    def transform_input_gfs(self, formatted_xslt):
        """Run an XSLT stylesheet over the XML document at ``self.input_gfs``.

        :param formatted_xslt: the complete stylesheet as text
        :return: the transformation result converted with ``str()``
        """
        # The stylesheet is handed to the parser as UTF-8 bytes
        stylesheet_bytes = bytes(formatted_xslt, encoding='utf8')
        transform = etree.XSLT(etree.fromstring(stylesheet_bytes))
        source_tree = etree.parse(self.input_gfs)
        return str(transform(source_tree))
Example #5
0
    def flush_elements(self, packet):
        """Parse a fresh container document once elements are buffered.

        Returns the packet unchanged when ``self.element_arr`` is empty or
        when ``self.container_doc`` cannot be parsed.

        NOTE(review): the visible block ends right after the parse step, so
        the parsed document is not used further here -- confirm against the
        complete source.

        :param packet: the packet being processed
        """
        if not self.element_arr:
            return packet

        # Start new doc (TODO clone)
        try:
            etree_doc = etree.fromstring(self.container_doc, self.xml_parser)
        except Exception as e:
            # "as" is valid on Python 2.6+ and 3; the old comma form is a
            # syntax error on Python 3. The cause is logged as well.
            log.error("new container doc not OK: %s" % str(e))
            return packet
Example #6
0
    def flush_elements(self, packet):
        """Parse a new container document when buffered elements exist.

        The packet is returned as-is when ``self.element_arr`` holds nothing
        or when parsing ``self.container_doc`` fails.

        NOTE(review): nothing follows the parse step in the visible block, so
        the parsed tree is unused here -- confirm against the complete source.

        :param packet: the packet being processed
        """
        if not self.element_arr:
            return packet

        # Start new doc (TODO clone)
        try:
            etree_doc = etree.fromstring(self.container_doc, self.xml_parser)
        except Exception as e:
            # The comma form of "except" only parses on Python 2; "as" works
            # on both. The failure reason is included in the log line.
            log.error("new container doc not OK: %s" % str(e))
            return packet
Example #7
0
    def gdal_vsi_path2etree_doc(packet, converter_args=None):
        """Replace a GDAL VSI file path in ``packet.data`` by the parsed XML.

        The file is read completely through GDAL's VSI file functions, any
        ``<?xml ...?>`` header is stripped and the remaining text is parsed
        with ``etree.fromstring``.

        :param packet: packet whose ``data`` holds the VSI path, for example
            /vsizip/{/vsizip/{BAGGEM0221L-15022021.zip}/GEM-WPL-RELATIE-15022021.zip}/GEM-WPL-RELATIE-15022021-000001.xml
        :param converter_args: not used by this converter
        :return: the same packet with ``data`` set to the parsed document
        """
        from stetl.util import gdal
        import re

        vsi_file_path = packet.data
        vsi_file = gdal.VSIFOpenL(vsi_file_path, 'rb')
        try:
            # Seek to the end to learn the size, then rewind for the read
            gdal.VSIFSeekL(vsi_file, 0, 2)
            vsileng = gdal.VSIFTellL(vsi_file)
            gdal.VSIFSeekL(vsi_file, 0, 0)

            # read the XML as string (or bytearray)
            xml_str = gdal.VSIFReadL(1, vsileng, vsi_file)
        finally:
            # Release the VSI handle even when seeking or reading fails
            if vsi_file is not None:
                gdal.VSIFCloseL(vsi_file)

        # Type is GDAL-version dependent, may be bytes-like
        if isinstance(xml_str, (bytearray, bytes)):
            xml_str = xml_str.decode('utf-8')

        # Need to strip the XML header to avoid XML parse error
        xml_str = re.sub(r'<\?xml.*?\?>', '', xml_str)
        packet.data = etree.fromstring(xml_str)

        return packet
Example #8
0
    def flush_elements(self, packet):
        """Assemble the buffered elements into a new container document.

        The packet is always flagged end-of-doc. With elements in the buffer,
        ``self.container_doc`` is parsed into a new tree, each buffered
        element is appended to the first node that
        ``self.element_container_xpath`` selects, ``packet.data`` is set to
        the tree and the buffer is reset.

        :param packet: packet that receives the assembled document
        :return: the same packet; ``packet.data`` stays as it was when the
            buffer is empty, the container document does not parse, or the
            container xpath selects no node
        """
        packet.set_end_of_doc()

        if not self.element_arr:
            return packet

        # Start new doc (TODO clone)
        try:
            etree_doc = etree.fromstring(self.container_doc, self.xml_parser)
        except Exception as e:
            log.error('new container doc not OK: %s' % str(e))
            return packet

        matches = etree_doc.xpath(self.element_container_xpath)
        if len(matches) == 0:
            # No attachment point: make the failure visible rather than
            # passing on an empty container, and discard the batch so the
            # buffer does not keep growing.
            log.error('no element found for container xpath: %s, dropping %d elements' %
                      (self.element_container_xpath, len(self.element_arr)))
            self.element_arr = []
            return packet

        parent_element = matches[0]
        for element in self.element_arr:
            parent_element.append(element)

        log.info('xmldoc ready: elms=%d total_elms=%d' % (len(self.element_arr), self.total_element_count))
        packet.data = etree_doc
        self.element_arr = []
        return packet
Example #9
0
 def string2etree_doc(packet):
     """Parse the XML string in ``packet.data`` and store the result there.

     :param packet: packet whose ``data`` holds XML source
     :return: the same packet, now carrying the parsed document
     """
     xml_source = packet.data
     packet.data = etree.fromstring(xml_source)
     return packet
Example #10
0
 def string2etree_doc(packet):
     """Turn the XML text carried by the packet into a parsed document.

     :param packet: packet whose ``data`` holds XML source
     :return: the same packet with ``data`` replaced by the parse result
     """
     parsed_doc = etree.fromstring(packet.data)
     packet.data = parsed_doc
     return packet
Example #11
0
    def read(self, packet):
        """Turn the next batch of lml_files records into measurement records.

        Runs ``self.progress_query`` to find the last processed id, fetches
        the following file records with ``self.query`` and parses the XML in
        each record's ``file_data``. The ``file_format`` config value selects
        the layout: 'rivm_xml' or 'rivm_sos'. Files without data or with
        unparsable XML are logged and skipped. ``self.last_id`` is advanced to
        the ``gid`` of every file record visited.

        NOTE(review): the visible block ends once ``record_list`` is filled;
        how the records are handed on is not shown here -- confirm against
        the complete source.

        :param packet: packet that is flagged end-of-stream when no file
            records remain
        :return: the packet when there are no more file records
        """
        # Get last processed id of lml_files table
        self.db.execute(self.progress_query)
        progress_rec = self.db.cursor.fetchone()
        self.last_id = progress_rec[3]
        log.info('progress record: %s' % str(progress_rec))

        # Fetch next batch of lml_files records
        lml_file_recs = self.do_query(self.query % self.last_id)
        log.info('read lml_file_recs: %d' % len(lml_file_recs))

        # No more records to process?
        if len(lml_file_recs) == 0:
            packet.set_end_of_stream()
            log.info('Nothing to do. All file_records done')
            return packet

        # Process lml_files records and create recordlist
        record_list = []
        file_format = self.cfg.get('file_format')
        for file_rec in lml_file_recs:
            gid = file_rec.get('gid')
            file_name = file_rec.get('file_name')

            log.info('process: file_rec gid=%d file=%s' % (gid, file_name))

            # Remember last id processed for next query
            self.last_id = gid

            # Parse file data and create a record from XML DOM
            file_data = file_rec.get('file_data')
            if file_data is None or len(file_data) == 0:
                log.warn("cannot process file '%s' error: %s, skipping..." % (file_name, 'no data in file'))
                continue

            try:
                xml_doc = etree.fromstring(file_data)
            except Exception as e:
                # "as" form: valid on Python 2.6+ and on Python 3
                log.warn("cannot parse file '%s' error: %s, skipping..." % (file_name, str(e)))
                continue

            # There are two broad file formats from RIVM: 'xml' and 'sos'. The data is the same
            # only the XML format and thus handling is different
            if file_format == 'rivm_xml':
                measurements = xml_doc.xpath('/message/body/*')
                for measurement in measurements:
                    record = dict()
                    # Measurement data XML structure
                    #   <meting>
                    #    <datum>27/05/2014</datum>
                    #    <tijd>14</tijd>
                    #    <station>549</station>
                    #    <component>PM10</component>
                    #    <eenheid>ug/m3</eenheid>
                    #    <waarde>10</waarde>
                    #    <gevalideerd>0</gevalideerd>
                    #   </meting>

                    record['file_name'] = file_name

                    # station_id variants: '318' or  'NL01485'  or 'NL49551'
                    # always take last three digits?
                    record['station_id'] = measurement.xpath("station/text()")[0][-3:]
                    record['component'] = measurement.xpath("component/text()")[0]
                    record['validated'] = measurement.xpath("gevalideerd/text()")[0]
                    record['sample_value'] = measurement.xpath("waarde/text()")[0]

                    # 27/05/2014
                    datum = measurement.xpath("datum/text()")[0]
                    # 14
                    tijd = measurement.xpath("tijd/text()")[0]
                    dt_str = datum + '-' + tijd
                    dt = datetime.strptime(dt_str, '%d/%m/%Y-%H')
                    record['sample_time'] = dt

                    # Create a unique id for the sample station-component-time
                    record['sample_id'] = record['station_id'] + '-' + record['component'] + '-' + dt_str
                    record_list.append(record)

            elif file_format == 'rivm_sos':
                measurements = xml_doc.xpath('/ROWSET/*')

                # Extract date-time string and component name once from file-name
                # 2014070708-PM10.xml becomes date_str=2014070708 component=PM10
                date_str, _, component = file_name[:-4].rpartition('-')

                for measurement in measurements:

                    # first check if there is a valid sample value
                    # for benzeen and e.g. PM2.5 most values are less than zero
                    sample_value = measurement.xpath("MWAA_WAARDE/text()")[0]
                    if sample_value[0:1] == '-':
                        continue

                    record = dict()
                    # <?xml version="1.0"?>
                    # <ROWSET>
                    #  <ROW>
                    #   <OPST_OPDR_ORGA_CODE>DCMR</OPST_OPDR_ORGA_CODE>
                    #   <STAT_NUMMER>NL01483</STAT_NUMMER>
                    #   <STAT_NAAM>Botlek-Spoortunnel</STAT_NAAM>
                    #   <MCLA_CODE>stad verkeer</MCLA_CODE>
                    #   <MWAA_WAARDE>60.3</MWAA_WAARDE>
                    #   <MWAA_BEGINDATUMTIJD>20140707070000</MWAA_BEGINDATUMTIJD>
                    #   <MWAA_EINDDATUMTIJD>20140707080000</MWAA_EINDDATUMTIJD>
                    #  </ROW>
                    #  ... further <ROW> elements of the same shape

                    record['file_name'] = file_name

                    # station_id variants: '318' or  'NL01485'  or 'NL49551'
                    # always take last three digits?
                    record['station_id'] = measurement.xpath("STAT_NUMMER/text()")[0][-3:]
                    record['component'] = component
                    record['validated'] = 0
                    record['sample_value'] = sample_value

                    dt = datetime.strptime(date_str, '%Y%m%d%H')
                    record['sample_time'] = dt

                    # Create a unique id for the sample station-component-time
                    record['sample_id'] = record['station_id'] + '-' + record['component'] + '-' + date_str
                    record_list.append(record)
Example #12
0
    def read(self, packet):
        """Build measurement records from the next batch of lml_files records.

        The last processed id is looked up with ``self.progress_query``; the
        file records after it are fetched with ``self.query``. The XML in each
        record's ``file_data`` is parsed and handled according to the
        ``file_format`` config value ('rivm_xml' or 'rivm_sos'). Files with no
        data or with XML that does not parse are logged and skipped.
        ``self.last_id`` follows the ``gid`` of each file record visited.

        NOTE(review): the visible block stops after ``record_list`` has been
        filled; what is done with the records is not shown here -- confirm
        against the complete source.

        :param packet: packet that is flagged end-of-stream when no file
            records remain
        :return: the packet when there are no more file records
        """
        # Get last processed id of lml_files table
        self.db.execute(self.progress_query)
        progress_rec = self.db.cursor.fetchone()
        self.last_id = progress_rec[3]
        log.info('progress record: %s' % str(progress_rec))

        # Fetch next batch of lml_files records
        lml_file_recs = self.do_query(self.query % self.last_id)
        log.info('read lml_file_recs: %d' % len(lml_file_recs))

        # No more records to process?
        if len(lml_file_recs) == 0:
            packet.set_end_of_stream()
            log.info('Nothing to do. All file_records done')
            return packet

        # Process lml_files records and create recordlist
        record_list = []
        file_format = self.cfg.get('file_format')
        for file_rec in lml_file_recs:
            gid = file_rec.get('gid')
            file_name = file_rec.get('file_name')

            log.info('process: file_rec gid=%d file=%s' % (gid, file_name))

            # Remember last id processed for next query
            self.last_id = gid

            # Parse file data and create a record from XML DOM
            file_data = file_rec.get('file_data')
            if file_data is None or len(file_data) == 0:
                log.warn("cannot process file '%s' error: %s, skipping..." %
                         (file_name, 'no data in file'))
                continue

            try:
                xml_doc = etree.fromstring(file_data)
            except Exception as e:
                # The comma form of "except" is a syntax error on Python 3
                log.warn("cannot parse file '%s' error: %s, skipping..." %
                         (file_name, str(e)))
                continue

            # There are two broad file formats from RIVM: 'xml' and 'sos'. The data is the same
            # only the XML format and thus handling is different
            if file_format == 'rivm_xml':
                measurements = xml_doc.xpath('/message/body/*')
                for measurement in measurements:
                    record = dict()
                    # Measurement data XML structure
                    #   <meting>
                    #    <datum>27/05/2014</datum>
                    #    <tijd>14</tijd>
                    #    <station>549</station>
                    #    <component>PM10</component>
                    #    <eenheid>ug/m3</eenheid>
                    #    <waarde>10</waarde>
                    #    <gevalideerd>0</gevalideerd>
                    #   </meting>

                    record['file_name'] = file_name

                    # station_id variants: '318' or  'NL01485'  or 'NL49551'
                    # always take last three digits?
                    record['station_id'] = measurement.xpath(
                        "station/text()")[0][-3:]
                    record['component'] = measurement.xpath(
                        "component/text()")[0]
                    record['validated'] = measurement.xpath(
                        "gevalideerd/text()")[0]
                    record['sample_value'] = measurement.xpath(
                        "waarde/text()")[0]

                    # 27/05/2014
                    datum = measurement.xpath("datum/text()")[0]
                    # 14
                    tijd = measurement.xpath("tijd/text()")[0]
                    dt_str = datum + '-' + tijd
                    dt = datetime.strptime(dt_str, '%d/%m/%Y-%H')
                    record['sample_time'] = dt

                    # Create a unique id for the sample station-component-time
                    record['sample_id'] = record['station_id'] + '-' + record[
                        'component'] + '-' + dt_str
                    record_list.append(record)

            elif file_format == 'rivm_sos':
                measurements = xml_doc.xpath('/ROWSET/*')

                # Extract date-time string and component name once from file-name
                # 2014070708-PM10.xml becomes date_str=2014070708 component=PM10
                date_str, _, component = file_name[:-4].rpartition('-')

                for measurement in measurements:

                    # first check if there is a valid sample value
                    # for benzeen and e.g. PM2.5 most values are less than zero
                    sample_value = measurement.xpath("MWAA_WAARDE/text()")[0]
                    if sample_value[0:1] == '-':
                        continue

                    record = dict()
                    # <?xml version="1.0"?>
                    # <ROWSET>
                    #  <ROW>
                    #   <OPST_OPDR_ORGA_CODE>DCMR</OPST_OPDR_ORGA_CODE>
                    #   <STAT_NUMMER>NL01483</STAT_NUMMER>
                    #   <STAT_NAAM>Botlek-Spoortunnel</STAT_NAAM>
                    #   <MCLA_CODE>stad verkeer</MCLA_CODE>
                    #   <MWAA_WAARDE>60.3</MWAA_WAARDE>
                    #   <MWAA_BEGINDATUMTIJD>20140707070000</MWAA_BEGINDATUMTIJD>
                    #   <MWAA_EINDDATUMTIJD>20140707080000</MWAA_EINDDATUMTIJD>
                    #  </ROW>
                    #  ... further <ROW> elements of the same shape

                    record['file_name'] = file_name

                    # station_id variants: '318' or  'NL01485'  or 'NL49551'
                    # always take last three digits?
                    record['station_id'] = measurement.xpath(
                        "STAT_NUMMER/text()")[0][-3:]
                    record['component'] = component
                    record['validated'] = 0
                    record['sample_value'] = sample_value

                    dt = datetime.strptime(date_str, '%Y%m%d%H')
                    record['sample_time'] = dt

                    # Create a unique id for the sample station-component-time
                    record['sample_id'] = record['station_id'] + '-' + record[
                        'component'] + '-' + date_str
                    record_list.append(record)