Ejemplo n.º 1
0
    def init(self):
        """
        Open the configured OGR data source read-only and prepare layer iteration.

        Enables GDAL exceptions, installs an error handler that forwards every
        GDAL message to the log, applies ``self.source_options`` as GDAL config
        options and opens ``self.data_source`` (through the driver named by
        ``self.source_format`` when that is set). On success ``self.layer``,
        ``self.layer_count`` and ``self.layer_idx`` are initialized.

        :raises Exception: when the data source cannot be opened
        """
        self.ogr = ogr
        # http://trac.osgeo.org/gdal/wiki/PythonGotchas
        self.gdal = gdal
        self.gdal.UseExceptions()
        log.info("Using GDAL/OGR version: %d" % int(gdal.VersionInfo('VERSION_NUM')))

        # GDAL error handler function
        # http://pcjericks.github.io/py-gdalogr-cookbook/gdal_general.html
        def gdal_error_handler(err_class, err_num, err_msg):
            err_type = {
                gdal.CE_None: 'None',
                gdal.CE_Debug: 'Debug',
                gdal.CE_Warning: 'Warning',
                gdal.CE_Failure: 'Failure',
                gdal.CE_Fatal: 'Fatal'
            }
            # Keep every GDAL message on a single log line
            err_msg = err_msg.replace('\n', ' ')
            err_class = err_type.get(err_class, 'None')
            log.error('Error Number: %s, Type: %s, Msg: %s' % (err_num, err_class, err_msg))

        # install error handler
        self.gdal.PushErrorHandler(gdal_error_handler)

        # Raise a dummy error for testing
        # self.gdal.Error(1, 2, 'test error')

        if self.source_options:
            for k in self.source_options:
                self.gdal.SetConfigOption(k, self.source_options[k])

        # Open OGR data source in read-only mode.
        if self.source_format:
            driver = ogr.GetDriverByName(self.source_format)
            if driver is None:
                # Unknown driver name: report it and fall through to the common
                # failure path below instead of crashing on None.Open()
                log.error("No OGR driver found for source format: %s" % self.source_format)
                self.data_source_p = None
            else:
                self.data_source_p = driver.Open(self.data_source, 0)
        else:
            self.data_source_p = self.ogr.Open(self.data_source, 0)

        # Report failure if failed
        if self.data_source_p is None:
            source_name = Util.safe_string_value(self.data_source)
            log.error("Cannot open OGR datasource: %s with the following drivers." % source_name)

            for iDriver in range(self.ogr.GetDriverCount()):
                log.info("  ->  " + self.ogr.GetDriver(iDriver).GetName())

            raise Exception("Cannot open OGR datasource: %s" % source_name)

        # Open ok: initialize
        self.layer = None

        if self.sql:
            # A single result layer is obtained via the SQL query; -1 marks "query not yet executed"
            self.layer_count = 1
            self.layer_idx = -1
        else:
            self.layer_count = self.data_source_p.GetLayerCount()
            self.layer_idx = 0

        log.info("Opened OGR source ok: %s layer count=%d" % (Util.safe_string_value(self.data_source), self.layer_count))
Ejemplo n.º 2
0
    def execute_cmd(self, cmd):
        """
        Run a shell command with the configured extra environment variables.

        The variables parsed from ``self.env_args`` (using ``self.env_separator``)
        are added to a copy of the current environment that is handed to the
        child process only; the environment of this process is left untouched.

        :param cmd: the command line, executed through the shell
        """
        env_vars = Util.string_to_dict(self.env_args, self.env_separator)

        # Build a private environment for the child instead of patching os.environ
        # and rebinding it afterwards: rebinding replaces the special os.environ
        # mapping by a plain dict and leaves the added variables set in the real
        # process environment.
        child_env = os.environ.copy()
        child_env.update(env_vars)

        log.info("executing cmd=%s" % Util.safe_string_value(cmd))
        subprocess.call(cmd, shell=True, env=child_env)
        log.info("execute done")
Ejemplo n.º 3
0
    def read(self, packet):
        """
        Read the next chunk of data from the OGR data source into the packet.

        Layers are handled one at a time. With output format
        ``FORMAT.ogr_feature_array`` one call delivers all features of a layer as
        a list and marks the packet end-of-doc. Otherwise each call delivers one
        feature; the call that finds the layer exhausted only marks end-of-doc.
        When no layers are left the packet is marked end-of-stream and the data
        source reference is dropped.

        :param packet: the Packet to fill
        :return: the same packet
        :raises Exception: when a layer cannot be fetched from the data source
        """
        if not self.data_source_p:
            log.info("End reading from: %s" %
                     Util.safe_string_value(self.data_source))
            return packet

        if self.layer is None:
            if self.sql and self.layer_idx == -1:
                # PostgreSQL: Layer is gotten via Query
                # http://trac.osgeo.org/postgis/wiki/UsersWikiOGR
                # NOTE(review): layer_idx becomes 0 here, so with layer_count >= 1 the
                # branch below will later also fetch layer 0 of the data source once
                # the query result is exhausted - confirm this is intended.
                self.layer = self.data_source_p.ExecuteSQL(self.sql)
                self.layer_idx = 0
            elif self.layer_idx < self.layer_count:
                # Remember the index being fetched: it is reported on failure
                fetch_idx = self.layer_idx
                self.layer = self.data_source_p.GetLayer(fetch_idx)
                self.layer_idx += 1
                if self.layer is None:
                    msg = "Could not fetch layer %d" % fetch_idx
                    log.error(msg)
                    raise Exception(msg)
                log.info("Start reading from OGR Source: %s, Layer: %s" %
                         (Util.safe_string_value(
                             self.data_source), self.layer.GetName()))
            else:
                # No more Layers left: cleanup
                packet.set_end_of_stream()
                log.info("Closing OGR source: %s" %
                         Util.safe_string_value(self.data_source))
                # Destroy not required anymore: http://trac.osgeo.org/gdal/wiki/PythonGotchas
                self.data_source_p = None
                return packet

        # Return all features from Layer (ogr_feature_array) or next feature (ogr_feature)
        if self.output_format == FORMAT.ogr_feature_array:
            # Assemble all features
            features = list()
            for feature in self.layer:
                features.append(feature)

            packet.data = features
            log.info("End reading all features from Layer: %s count=%d" %
                     (self.layer.GetName(), len(features)))
            packet.set_end_of_doc()
            self.layer = None
        else:
            # Next feature
            feature = self.layer.GetNextFeature()
            if feature:
                packet.data = feature
            else:
                log.info("End reading from Layer: %s" % self.layer.GetName())
                packet.set_end_of_doc()
                self.layer = None

        return packet
Ejemplo n.º 4
0
    def execute_cmd(self, cmd):
        """
        Execute a shell command, extended with the configured environment variables.

        ``self.env_args`` is converted to a dict (split on ``self.env_separator``)
        and merged into a copy of the current environment. Only the child process
        receives that copy; ``os.environ`` of this process is not modified.

        :param cmd: the command line, executed through the shell
        """
        extra_env = Util.string_to_dict(self.env_args, self.env_separator)

        # Do not update os.environ and assign a saved copy back afterwards: the
        # assignment swaps the special os.environ mapping for a plain dict and does
        # not remove the variables from the real process environment again.
        cmd_env = os.environ.copy()
        cmd_env.update(extra_env)

        log.info("executing cmd=%s" % Util.safe_string_value(cmd))
        subprocess.call(cmd, shell=True, env=cmd_env)
        log.info("execute done")
Ejemplo n.º 5
0
    def read(self, packet):
        """
        Fill the packet with the next data read from the OGR data source.

        One layer is processed at a time. For ``FORMAT.ogr_feature_array`` a call
        collects all features of the current layer into a list and marks the
        packet end-of-doc. For any other output format a call delivers a single
        feature, and the call that finds the layer exhausted only marks end-of-doc.
        Once all layers are done the packet is marked end-of-stream and the data
        source reference is released.

        :param packet: the Packet to fill
        :return: the same packet
        :raises Exception: when a layer cannot be fetched from the data source
        """
        if not self.data_source_p:
            log.info("End reading from: %s" % Util.safe_string_value(self.data_source))
            return packet

        if self.layer is None:
            if self.sql and self.layer_idx == -1:
                # PostgreSQL: Layer is gotten via Query
                # http://trac.osgeo.org/postgis/wiki/UsersWikiOGR
                # NOTE(review): layer_idx is set to 0 here, so with layer_count >= 1
                # the branch below will afterwards also fetch layer 0 of the data
                # source - confirm this is intended.
                self.layer = self.data_source_p.ExecuteSQL(self.sql)
                self.layer_idx = 0
            elif self.layer_idx < self.layer_count:
                # Keep the requested index so a failure reports the right layer
                requested_idx = self.layer_idx
                self.layer = self.data_source_p.GetLayer(requested_idx)
                self.layer_idx += 1
                if self.layer is None:
                    msg = "Could not fetch layer %d" % requested_idx
                    log.error(msg)
                    raise Exception(msg)
                log.info("Start reading from OGR Source: %s, Layer: %s" % (Util.safe_string_value(self.data_source), self.layer.GetName()))
            else:
                # No more Layers left: cleanup
                packet.set_end_of_stream()
                log.info("Closing OGR source: %s" % Util.safe_string_value(self.data_source))
                # Destroy not required anymore: http://trac.osgeo.org/gdal/wiki/PythonGotchas
                self.data_source_p = None
                return packet

        # Return all features from Layer (ogr_feature_array) or next feature (ogr_feature)
        if self.output_format == FORMAT.ogr_feature_array:
            # Assemble all features
            features = list()
            for feature in self.layer:
                features.append(feature)

            packet.data = features
            log.info("End reading all features from Layer: %s count=%d" % (self.layer.GetName(), len(features)))
            packet.set_end_of_doc()
            self.layer = None
        else:
            # Next feature
            feature = self.layer.GetNextFeature()
            if feature:
                packet.data = feature
            else:
                log.info("End reading from Layer: %s" % self.layer.GetName())
                packet.set_end_of_doc()
                self.layer = None

        return packet
Ejemplo n.º 6
0
 def write_end(self, packet):
     """
     Finish writing: log it and drop the references to the destination and its layer.

     :param packet: the Packet being processed
     :return: the same packet, unchanged
     """
     # An explicit Destroy() is not required anymore, see
     # http://trac.osgeo.org/gdal/wiki/PythonGotchas
     destination = Util.safe_string_value(self.dest_data_source)
     log.info("End writing to: %s" % destination)
     self.dest_fd = self.layer = None
     return packet
Ejemplo n.º 7
0
    def write(self, packet):
        """
        Write the packet's data to the OGR Layer output.

        A list is written feature by feature, after which writing is ended. Any
        other value is written as a single feature, unless the packet is flagged
        end-of-stream or end-of-doc, in which case writing is ended instead.

        :param packet: the Packet whose ``data`` is written
        :return: the same packet
        """
        payload = packet.data

        # Nothing to write or no destination: we are all done
        if payload is None or self.dest_fd is None:
            self.write_end(packet)
            return packet

        if self.layer is None:
            log.info("No Layer, end writing to: %s" %
                     Util.safe_string_value(self.dest_data_source))
            return packet

        if type(payload) is not list:
            # Single feature: either the end marker or the feature to write
            if packet.end_of_stream or packet.end_of_doc:
                self.write_end(packet)
            else:
                self.write_feature(payload)
            return packet

        # Feature collection: write every feature, then close
        for item in payload:
            self.write_feature(item)
        self.write_end(packet)
        return packet
Ejemplo n.º 8
0
    def write(self, packet):
        """
        Write the data of a packet to the OGR Layer output.

        List data is written one feature at a time and writing is then ended.
        Non-list data is written as one feature, except when the packet carries
        the end-of-stream or end-of-doc flag: then writing is ended.

        :param packet: the Packet whose ``data`` is written
        :return: the same packet
        """
        data = packet.data

        # Are we all done?
        if data is None or self.dest_fd is None:
            self.write_end(packet)
            return packet

        if self.layer is None:
            log.info("No Layer, end writing to: %s" % Util.safe_string_value(self.dest_data_source))
            return packet

        if type(data) is not list:
            # A single feature, or the end marker of a feature stream
            if packet.end_of_stream or packet.end_of_doc:
                self.write_end(packet)
            else:
                self.write_feature(data)
            return packet

        # A feature collection: write all features and close
        for single_feature in data:
            self.write_feature(single_feature)
        self.write_end(packet)
        return packet
Ejemplo n.º 9
0
 def write_end(self, packet):
     """
     End writing: log the destination and release the destination and layer references.

     :param packet: the Packet being processed
     :return: the same packet, unchanged
     """
     # No explicit Destroy() needed anymore, see
     # http://trac.osgeo.org/gdal/wiki/PythonGotchas
     dest_name = Util.safe_string_value(self.dest_data_source)
     log.info("End writing to: %s" % dest_name)
     self.dest_fd = self.layer = None
     return packet
Ejemplo n.º 10
0
    def test_make_file_list_depth_search(self):
        """
        Check ``Util.make_file_list`` with depth_search enabled and disabled.

        The ``data/depth_search_test`` directory next to this test file is
        searched for ``dummy.gml``: two files are expected with depth_search
        enabled, one with depth_search disabled.
        """
        file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'data/depth_search_test')
        filename_pattern = 'dummy.gml'

        # (depth_search, expected number of files found)
        for depth_search, expected_count in ((True, 2), (False, 1)):
            file_list = Util.make_file_list(file_path, None, filename_pattern,
                                            depth_search)
            self.assertEqual(len(file_list), expected_count)
Ejemplo n.º 11
0
    def __init__(self, configdict, section):
        """
        Set up a string-to-string filter and parse its format arguments.

        :param configdict: handed on unchanged to ``StringFilter.__init__``
        :param section: handed on unchanged to ``StringFilter.__init__``
        """
        StringFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string)

        # Convert string to dict: http://stackoverflow.com/a/1248990
        raw_args, separator = self.format_args, self.separator
        self.format_args_dict = Util.string_to_dict(raw_args, separator)
Ejemplo n.º 12
0
    def __init__(self, configdict, section, produces):
        """
        Set up the input and build the list of files to read.

        :param configdict: handed on unchanged to ``Input.__init__``
        :param section: handed on unchanged to ``Input.__init__``
        :param produces: handed on unchanged to ``Input.__init__``
        :raises Exception: when no input files were found
        """
        Input.__init__(self, configdict, section, produces)

        # Create the list of files to be used as input
        self.file_list = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
        log.info("file_list=%s" % str(self.file_list))

        # Truthiness check: also reports a missing (None) list as empty instead
        # of failing with an unrelated TypeError in len()
        if not self.file_list:
            raise Exception('File list is empty!!')

        self.file_list_done = []
Ejemplo n.º 13
0
    def __init__(self, configdict, section):
        """
        Set up a file input that produces strings.

        When ``format_args`` is set, it is converted from a string of the form
        ``schema:test foo:bar`` into a dict. Those values are meant for optional
        Python ``str.format()`` substitution of placeholders such as ``{schema}``
        or ``{foo}`` in the input file content.

        :param configdict: handed on unchanged to ``FileInput.__init__``
        :param section: handed on unchanged to ``FileInput.__init__``
        """
        FileInput.__init__(self, configdict, section, produces=FORMAT.string)
        self.file = None

        # No formatting requested: nothing to convert
        if not self.format_args:
            return

        # Convert string to dict: http://stackoverflow.com/a/1248990
        self.format_args = Util.string_to_dict(self.format_args, ':')
Ejemplo n.º 14
0
    def __init__(self, configdict, section):
        """
        Set up a string-to-string filter that formats content via ``str.format()``.

        The ``format_args`` config value should have the form
        ``schema:test foo:bar ...``; it supplies the values for placeholders such
        as ``{schema}`` and ``{foo}`` in the string and is kept both as the raw
        value and as a dict.

        :param configdict: handed on unchanged to ``StringFilter.__init__``
        :param section: handed on unchanged to ``StringFilter.__init__``
        """
        StringFilter.__init__(
            self, configdict, section,
            consumes=FORMAT.string, produces=FORMAT.string)

        # Raw value as configured
        self.format_args = self.cfg.get('format_args')

        # Convert string to dict: http://stackoverflow.com/a/1248990
        self.format_args_dict = Util.string_to_dict(self.format_args, ':')
Ejemplo n.º 15
0
    def __init__(self, configdict, section):
        """
        Initialize a string-producing file input.

        An optional ``format_args`` value of the form ``schema:test foo:bar`` is
        turned into a dict, to be used for Python ``str.format()`` substitution of
        placeholders like ``{schema}`` and ``{foo}`` in the input file.

        :param configdict: handed on unchanged to ``FileInput.__init__``
        :param section: handed on unchanged to ``FileInput.__init__``
        """
        FileInput.__init__(self, configdict, section, produces=FORMAT.string)
        self.file = None

        # Without format_args there is nothing to convert
        if not self.format_args:
            return

        # Convert string to dict: http://stackoverflow.com/a/1248990
        self.format_args = Util.string_to_dict(self.format_args, ':')
Ejemplo n.º 16
0
    def exec_cmd(self):
        """
        Start ``self.cmd`` as a subprocess with piped stdout and stderr.

        The process object is stored in ``self.ogr_process``. Whatever
        ``self.readline_err()`` returns right after the start is logged as a
        warning when it is non-empty.
        """
        printable_cmd = Util.safe_string_value(repr(self.cmd))
        log.info("start ogr2ogr cmd = %s" % printable_cmd)

        # No shell involved; both output streams are captured through pipes
        popen_options = dict(shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self.ogr_process = subprocess.Popen(self.cmd, **popen_options)

        first_err = self.readline_err()
        if first_err:
            log.warning('ogr2ogr: %s ' % first_err)
Ejemplo n.º 17
0
    def exec_cmd(self):
        """
        Launch ``self.cmd`` as a subprocess, capturing stdout and stderr via pipes.

        The resulting process is kept in ``self.ogr_process``. The value returned
        by ``self.readline_err()`` directly after the launch is logged as a
        warning when it is non-empty.
        """
        cmd_text = Util.safe_string_value(repr(self.cmd))
        log.info("start ogr2ogr cmd = %s" % cmd_text)

        # Run without a shell and keep both output streams for later reading
        pipes = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self.ogr_process = subprocess.Popen(self.cmd, shell=False, **pipes)

        err_output = self.readline_err()
        if err_output:
            log.warning('ogr2ogr: %s ' % err_output)
Ejemplo n.º 18
0
    def __init__(self, configdict, section, produces):
        """
        Set up the input and build the list of files to read.

        :param configdict: handed on unchanged to ``Input.__init__``
        :param section: handed on unchanged to ``Input.__init__``
        :param produces: handed on unchanged to ``Input.__init__``
        :raises Exception: when no input files were found
        """
        Input.__init__(self, configdict, section, produces)

        # Create the list of files to be used as input
        self.file_list = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
        log.info("file_list=%s" % str(self.file_list))

        # Truthiness check: also reports a missing (None) list as empty instead
        # of failing with an unrelated TypeError in len()
        if not self.file_list:
            raise Exception('File list is empty!!')

        # No file is being read yet, none have been completed
        self.cur_file_path = None
        self.file_list_done = []
Ejemplo n.º 19
0
    def execute_cmd(self, cmd):
        """
        Run a shell command with the configured extra environment and return its output.

        The variables parsed from ``self.env_args`` (using ``self.env_separator``)
        are added to a copy of the current environment that only the child process
        receives; the environment of this process is left untouched.

        :param cmd: the command line, executed through the shell
        :return: the output of the command as returned by ``subprocess.check_output``
        :raises subprocess.CalledProcessError: when the command exits with a non-zero status
        """
        env_vars = Util.string_to_dict(self.env_args, self.env_separator)

        # Build a private environment for the child instead of patching os.environ
        # and rebinding it afterwards: rebinding replaces the special os.environ
        # mapping by a plain dict and leaves the added variables set in the real
        # process environment.
        child_env = os.environ.copy()
        child_env.update(env_vars)

        log.info("executing cmd=%s" % cmd)
        result = subprocess.check_output(cmd, shell=True, env=child_env)
        log.info("execute done")
        return result
Ejemplo n.º 20
0
    def execute_cmd(self, cmd):
        """
        Execute a shell command with extra environment variables and return its output.

        ``self.env_args`` is converted to a dict (split on ``self.env_separator``)
        and merged into a copy of the current environment. Only the child process
        gets that copy; ``os.environ`` of this process is not modified.

        :param cmd: the command line, executed through the shell
        :return: the output of the command as returned by ``subprocess.check_output``
        :raises subprocess.CalledProcessError: when the command exits with a non-zero status
        """
        extra_env = Util.string_to_dict(self.env_args, self.env_separator)

        # Do not update os.environ and assign a saved copy back afterwards: the
        # assignment swaps the special os.environ mapping for a plain dict and does
        # not remove the variables from the real process environment again.
        cmd_env = os.environ.copy()
        cmd_env.update(extra_env)

        log.info("executing cmd=%s" % cmd)
        result = subprocess.check_output(cmd, shell=True, env=cmd_env)
        log.info("execute done")
        return result
Ejemplo n.º 21
0
def parse_args(args_list):
    """
    Parse the Stetl command line arguments.

    Every ``-a``/``--args`` value is either the path of an existing file, which
    is read via ``Util.propsfile_to_dict``, or a string that is converted via
    ``Util.string_to_dict``. All resulting dicts are combined, in command line
    order, with ``Util.merge_two_dicts``.

    :param args_list: list of argument strings, passed to ``ArgumentParser.parse_args``
    :return: the parsed arguments; ``config_args`` is a single merged dict when
             at least one ``-a`` option was given
    """
    log.info("Stetl version = %s" % __version__)

    argparser = argparse.ArgumentParser(description='Invoke Stetl')

    # Short option strings must not end in a space: '-c ' is only matched through
    # argparse prefix abbreviation and is rendered as '-c , --config' in the help.
    argparser.add_argument('-c', '--config', type=str, help='ETL config file in .ini format', dest='config_file',
                           required=False)

    argparser.add_argument('-s', '--section', type=str, help='Section in the config file to execute, default is [etl]',
                           dest='config_section', required=False)

    argparser.add_argument('-a', '--args', type=str,
                           help='Arguments or .properties files to be substituted for symbolic {argN}s in Stetl config file,\
                                as -a "arg1=foo arg2=bar" and/or -a args.properties, multiple -a options are possible',
                           dest='config_args', required=False, action='append')

    argparser.add_argument('-d', '--doc', type=str,
                           help='Get component documentation like its configuration parameters, e.g. stetl doc stetl.inputs.fileinput.FileInput',
                           dest='doc_args', required=False)

    argparser.add_argument('-v', '--version',
                           action='store_true',
                           help='Show current version of stetl and exit',
                           required=False)

    args = argparser.parse_args(args_list)

    if args.config_args:
        args_total = {}
        for arg in args.config_args:
            if os.path.isfile(arg):
                log.info('Found args file at: %s' % arg)
                arg_dict = Util.propsfile_to_dict(arg)
            else:
                # Convert string to dict: http://stackoverflow.com/a/1248990
                arg_dict = Util.string_to_dict(arg)

            args_total = Util.merge_two_dicts(args_total, arg_dict)

        args.config_args = args_total

    return args
Ejemplo n.º 22
0
 def etree_elem2struct(packet, strip_space=True, strip_ns=True, sub=False, attr_prefix='', gml2ogr=True, ogr2json=True):
     """
     Replace the packet's data by the result of ``Util.elem_to_dict``.

     All options are passed on unchanged, and in this order, to
     ``Util.elem_to_dict``; see that function for their meaning.

     :param packet: the Packet whose ``data`` is converted (presumably an etree element, going by the name - TODO confirm)
     :param strip_space: passed on to ``Util.elem_to_dict``
     :param strip_ns: passed on to ``Util.elem_to_dict``
     :param sub: passed on to ``Util.elem_to_dict``
     :param attr_prefix: passed on to ``Util.elem_to_dict``
     :param gml2ogr: passed on to ``Util.elem_to_dict``
     :param ogr2json: passed on to ``Util.elem_to_dict``
     :return: the same packet, with its ``data`` replaced
     """
     options = (strip_space, strip_ns, sub, attr_prefix, gml2ogr, ogr2json)
     packet.data = Util.elem_to_dict(packet.data, *options)
     return packet
Ejemplo n.º 23
0
 def etree_elem2struct(packet, strip_space=True, strip_ns=True, sub=False, attr_prefix='', gml2ogr=True, ogr2json=True):
     """
     Convert the packet's data with ``Util.elem_to_dict`` and store the result back.

     The conversion options are forwarded as-is, in the order of this signature.

     :param packet: the Packet whose ``data`` is converted (presumably an etree element, going by the name - TODO confirm)
     :param strip_space: forwarded to ``Util.elem_to_dict``
     :param strip_ns: forwarded to ``Util.elem_to_dict``
     :param sub: forwarded to ``Util.elem_to_dict``
     :param attr_prefix: forwarded to ``Util.elem_to_dict``
     :param gml2ogr: forwarded to ``Util.elem_to_dict``
     :param ogr2json: forwarded to ``Util.elem_to_dict``
     :return: the same packet, with its ``data`` replaced
     """
     converted = Util.elem_to_dict(
         packet.data, strip_space, strip_ns, sub, attr_prefix, gml2ogr, ogr2json)
     packet.data = converted
     return packet
Ejemplo n.º 24
0
    def process_xml(self, packet):
        """
        Pull (event, element) pairs from ``self.context`` and hand packets to the next component.

        For every "end" event of an element whose (namespace-less) tag is in
        ``self.element_tags`` the element becomes the packet data, is counted and
        is removed from ``self.root``. After each parsed event the packet is passed
        to ``self.next`` when such a component is set. When the context is
        exhausted, or raises an XML syntax error, the packet is marked end-of-doc
        and returned.

        :param packet: the Packet to fill
        :return: the packet, marked end-of-doc when parsing has finished
        """
        while self.context is not None:
            try:
                # Built-in next() works on Python 2.6+ and Python 3; the
                # iterator.next() method exists on Python 2 only
                event, elem = next(self.context)
            except (etree.XMLSyntaxError, StopIteration):
                # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
                self.context = None

            if self.context is None:
                # Always end of doc
                # TODO: is this still useful for a non-input component?
                packet.set_end_of_doc()
                log.info("End of doc: %s elem_count=%d" %
                         (self.cur_file_path, self.elem_count))

                return packet

            # Filter out Namespace from the tag
            # this is the easiest way to go for now
            tag = elem.tag.split('}')
            if len(tag) == 2:
                # Namespaced tag: 2nd is tag
                tag = tag[1]
            else:
                # Non-namespaced tag: first
                tag = tag[0]

            # Only a completely parsed ("end" event) element of interest is delivered
            if tag in self.element_tags and event == "end":
                packet.data = elem
                self.elem_count += 1

                # Delete the element from the tree
                self.root.remove(elem)

                if self.strip_namespaces:
                    packet.data = Util.stripNamespaces(elem).getroot()

            # If there is a next component, let it process
            if self.next:
                # Hand-over data (line, doc whatever) to the next component
                packet.format = self._output_format
                packet = self.next.process(packet)

        return packet
Ejemplo n.º 25
0
    def __init__(self, configdict, section):
        """
        Set up a string-to-string filter that formats content via ``str.format()``.

        The ``format_args`` config value should have the form
        ``schema:test foo:bar ...``; it provides the values for placeholders such
        as ``{schema}`` and ``{foo}`` in the string and is stored both as the raw
        value and as a dict.

        :param configdict: handed on unchanged to ``StringFilter.__init__``
        :param section: handed on unchanged to ``StringFilter.__init__``
        """
        StringFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string)

        # Raw value as configured
        self.format_args = self.cfg.get('format_args')

        # Convert string to dict: http://stackoverflow.com/a/1248990
        self.format_args_dict = Util.string_to_dict(self.format_args, ':')
Ejemplo n.º 26
0
    def __init__(self, configdict, section, produces):
        """
        Read the file-related configuration and build the list of input files.

        Config values read: ``file_path`` (a dir, a file or multiple,
        comma-separated), ``filename_pattern`` (a ``glob.glob`` pattern, default
        ``*.[gxGX][mM][lL]``) and ``depth_search`` (recurse into directories,
        default False).

        :param configdict: handed on unchanged to ``Input.__init__``
        :param section: handed on unchanged to ``Input.__init__``
        :param produces: handed on unchanged to ``Input.__init__``
        """
        Input.__init__(self, configdict, section, produces)

        cfg = self.cfg
        self.file_path = cfg.get('file_path')
        self.filename_pattern = cfg.get('filename_pattern', '*.[gxGX][mM][lL]')
        self.depth_search = cfg.get_bool('depth_search', False)

        # Create the list of files to be used as input
        found_files = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
        self.file_list = found_files
        log.info("file_list=%s" % str(self.file_list))
Ejemplo n.º 27
0
    def process_xml(self, packet):
        """
        Pull (event, element) pairs from ``self.context`` and hand packets to the next component.

        For every "end" event of an element whose (namespace-less) tag is in
        ``self.element_tags`` a copy of the element becomes the packet data, the
        element is counted and then cleared. After each parsed event the packet
        is passed to ``self.next`` when such a component is set. When the context
        is exhausted, or raises an XML syntax error, the packet is marked
        end-of-doc and returned.

        :param packet: the Packet to fill
        :return: the packet, marked end-of-doc when parsing has finished
        """
        while self.context is not None:
            try:
                event, elem = next(self.context)
            except (etree.XMLSyntaxError, StopIteration):
                # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
                self.context = None

                # Nothing more to parse: always end of doc
                # TODO: is this still useful for a non-input component?
                packet.set_end_of_doc()
                log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))
                return packet

            # Strip the namespace from the tag: '{ns}name' splits in two parts,
            # a plain 'name' yields a single part
            tag_parts = elem.tag.split('}')
            local_name = tag_parts[1] if len(tag_parts) == 2 else tag_parts[0]

            # Only a completely parsed ("end" event) element of interest is delivered
            if local_name in self.element_tags and event == "end":
                packet.data = deepcopy(elem)
                self.elem_count += 1

                if self.strip_namespaces:
                    packet.data = Util.stripNamespaces(elem).getroot()

                # Clear the element which has been read. Don't clear the root document,
                # since the last element hasn't been processed yet.
                elem.clear()

            # Hand-over data (line, doc whatever) to the next component, if any
            if self.next:
                packet.format = self._output_format
                packet = self.next.process(packet)

        return packet
Ejemplo n.º 28
0
    def __init__(self, configdict, section, produces):
        """
        Read the file-related configuration and assemble the list of input files.

        Config values read: ``file_path`` (a dir, a file or multiple,
        comma-separated), ``filename_pattern`` (a ``glob.glob`` pattern, default
        ``*.[gxGX][mM][lL]``) and ``depth_search`` (recurse into directories,
        default False).

        :param configdict: handed on unchanged to ``Input.__init__``
        :param section: handed on unchanged to ``Input.__init__``
        :param produces: handed on unchanged to ``Input.__init__``
        """
        Input.__init__(self, configdict, section, produces)

        config = self.cfg
        self.file_path = config.get('file_path')
        self.filename_pattern = config.get('filename_pattern', '*.[gxGX][mM][lL]')
        self.depth_search = config.get_bool('depth_search', False)

        # Create the list of files to be used as input
        files = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
        self.file_list = files
        log.info("file_list=%s" % str(self.file_list))
Ejemplo n.º 29
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# POST data via WFS Transactional protocol (WFS-T).
#
# Author: Just van den Broecke
#
from stetl.component import Config
from stetl.output import Output
from stetl.util import Util
from stetl.packet import FORMAT
import httplib

log = Util.get_log('wfsoutput')


class WFSTOutput(Output):
    """
    Insert features via WFS-T (WFS Transaction) OGC protocol from an etree doc.

    consumes=FORMAT.etree_doc
    """

    # Start attribute config meta
    @Config(ptype=str, required=True, default=None)
    def wfs_host(self):
        """
        Hostname-part of URL e.g. geodata.ngr.nl.
        """
        # Intentionally empty body: presumably the Config decorator supplies the
        # configured value - TODO confirm against stetl.component.Config.
        pass
Ejemplo n.º 30
0
# -*- coding: utf-8 -*-
#
# Writes the payload of a packet as a string to a file.
# Based on outputs.fileoutput.FileOutput.
#
# Author: Frank Steggink
#
from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

import os

log = Util.get_log('packetwriter')


class PacketWriter(Filter):
    """
    Writes the payload of a packet as a string to a file.

    consumes=FORMAT.any, produces=FORMAT.string
    """

    # Start attribute config meta
    @Config(ptype=str, default=None, required=True)
    def file_path(self):
        """
        File path to write content to.
        """
        # Intentionally empty body: presumably the Config decorator supplies the
        # configured value - TODO confirm against stetl.component.Config.
        pass
Ejemplo n.º 31
0
#!/usr/bin/env python
#
# Extracts arrays of etree GML features from an GML etree document.
#
# Author: Just van den Broecke
#
from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log('gmlfeatureextractor')


class GmlFeatureExtractor(Filter):
    """
    Extract arrays of GML features etree elements from etree docs.

    consumes=FORMAT.etree_doc, produces=FORMAT.etree_feature_array
    """

    # XPATH Query base for extracting features by (non-namespaced thus local-name) tagname
    xpath_base = "//*[local-name() = '%s']"

    # Constructor
    def __init__(self, configdict, section='gml_feature_extractor'):
        Filter.__init__(self, configdict, section, consumes=FORMAT.etree_doc, produces=FORMAT.etree_feature_array)

        log.info("cfg = %s" % self.cfg.to_string())

        # Build the Xpath expresion from configures tagnames
        self.feature_tags = self.cfg.get('feature_tags').split(',')
Ejemplo n.º 32
0
# -*- coding: utf-8 -*-
#
# MeasurementsDbInput: Reads SmartEm raw AQ/LML file data from measurements table and converts to recordlist
#
# Author:Just van den Broecke

from stetl.util import Util, etree
from stetl.inputs.dbinput import PostgresDbInput
from stetl.packet import FORMAT
from stetl.postgis import PostGIS

from datetime import datetime

log = Util.get_log("MeasurementsDbInput")


class MeasurementsDbInput(PostgresDbInput):
    """
    Reads SmartEm raw AQ/LML file data from measurements table and converts to recordlist
    """
    def __init__(self, configdict, section):
        """
        Set up the Postgres input and read the progress-related config values.

        :param configdict: handed on unchanged to ``PostgresDbInput.__init__``
        :param section: handed on unchanged to ``PostgresDbInput.__init__``
        """
        PostgresDbInput.__init__(self, configdict, section)

        # Copy the 'progress_query' and 'progress_update' config values to attributes of the same name
        for option in ('progress_query', 'progress_update'):
            setattr(self, option, self.cfg.get(option))

        self.db = None

    def after_chain_invoke(self, packet):
        """
        Called right after entire Component Chain invoke.
        Used to update last id of processed file record.
        """
Ejemplo n.º 33
0
#!/usr/bin/env python
#
# Splits stream of GML lines into etree docs.
#
# Author: Just van den Broecke
#
import codecs
from deprecated.sphinx import deprecated
from stetl.util import Util, etree, StringIO
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log('gmlsplitter')


@deprecated(
    version='1.0.4',
    reason=
    'Use the more robust XmlElementStreamerFileInput + XmlAssembler instead!!!'
)
class GmlSplitter(Filter):
    """
    Split a stream of text XML lines into documents
    TODO phase out

    consumes=FORMAT.xml_line_stream, produces=FORMAT.etree_doc
    """
    def __init__(self, configdict, section='gml_splitter'):
        Filter.__init__(self,
                        configdict,
                        section,
Ejemplo n.º 34
0
# Output classes for ETL with SensorThings API.
#
# Author: Just van den Broecke
#

from os import path
import requests
import json
import base64

from stetl.util import Util
from stetl.packet import FORMAT
from stetl.component import Config
from stetl.outputs.httpoutput import HttpOutput

log = Util.get_log('staoutput')


class STAOutput(HttpOutput):
    """
    Output via SensorThings API (STA) over plain HTTP using the HttpOutput base class.
    See examples: http://www.sensorup.com/docs/?python

    consumes=FORMAT.record_array

    """
    @Config(ptype=str,
            default='application/json;charset=UTF-8',
            required=False)
    def content_type(self):
        """
Ejemplo n.º 35
0
# -*- coding: utf-8 -*-
#
# Output classes for ETL, databases.
#
# Author: Just van den Broecke
#
from stetl.output import Output
from stetl.util import Util
from stetl.packet import FORMAT
from stetl.component import Config
from stetl.postgis import PostGIS

log = Util.get_log('dboutput')


class DbOutput(Output):
    """
    Abstract base class for Outputs that write to a database.

    This class adds no behavior of its own: ``write`` hands the packet back
    untouched.
    """

    def __init__(self, configdict, section, consumes):
        """
        Initialize via ``Output.__init__``; all arguments are passed on unchanged.
        """
        Output.__init__(self, configdict, section, consumes)

    def write(self, packet):
        """
        Return the packet unchanged.

        :param packet: the Packet to write
        :return: the same packet
        """
        return packet


class PostgresDbOutput(DbOutput):
    """
    Output to PostgreSQL database.
    Input is an SQL string.
Ejemplo n.º 36
0
#!/usr/bin/env python
#
# Converts Stetl Packet FORMATs. This can be used to connect
# Stetl components with different output/input formats.
#
# Author:Just van den Broecke

import json
from stetl.component import Config
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("formatconverter")


class FormatConverter(Filter):
    """
    Converts (almost) any packet format (if converter available).

    consumes=FORMAT.any, produces=FORMAT.any but actual formats
    are changed at initialization based on the input to output format to
    be converted via the input_format and output_format config parameters.
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=dict, default=None, required=False)
    def converter_args(self):
Ejemplo n.º 37
0
# -*- coding: utf-8 -*-
#
# Reads an XML file and returns XML elements.
# Based on inputs.fileinput.XmlElementStreamFileInput.
#
# Author: Frank Steggink
#
from copy import deepcopy

from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util, etree
from stetl.packet import FORMAT

log = Util.get_log('xmlelementreader')


class XmlElementReader(Filter):
    """
    Extracts XML elements from a file, outputs each feature element in Packet.
    Parsing is streaming (no internal DOM buildup) so any file size can be handled.
    Use this class for your big GML files!

    consumes=FORMAT.string, produces=FORMAT.etree_element
    """

    # Start attribute config meta
    @Config(ptype=list, default=None, required=True)
    def element_tags(self):
        """
        Comma-separated string of XML (feature) element tag names of the elements that should be extracted
Ejemplo n.º 38
0
# Packet buffering.
#
# Author:Just van den Broecke

import copy
from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("packetbuffer")


class PacketBuffer(Filter):
    """
    Keeps a copy of every Packet that passes through; mainly meant for
    unit tests that inspect the Packets once the ETL run has finished.

    consumes=FORMAT.any, produces=FORMAT.any
    """

    def __init__(self, configdict, section):
        """
        Set up the filter with an empty buffer.

        :param configdict: configuration object, passed through to Filter
        :param section: name of the config section, passed through to Filter
        """
        Filter.__init__(self, configdict, section, consumes=FORMAT.any, produces=FORMAT.any)
        # Copies of the Packets seen so far, in arrival order.
        self.packet_list = []

    def invoke(self, packet):
        """
        Store a copy of the Packet and pass the original on.

        :param packet: the incoming Packet
        :return: the same Packet, unmodified
        """
        # A copy is stored because Packets may be cleared/reused downstream.
        # NOTE(review): the earlier comment asked for a deep copy, but
        # copy.copy() is shallow: the stored Packet still shares its
        # attribute objects with the live one. Confirm that is sufficient.
        snapshot = copy.copy(packet)
        self.packet_list.append(snapshot)
        return packet
Ejemplo n.º 39
0
# -*- coding: utf-8 -*-
#
# Input classes for ETL, Files.
#
# Author: Just van den Broecke
#
from stetl.input import Input
from stetl.util import Util, etree
from stetl.packet import FORMAT

log = Util.get_log('fileinput')

class FileInput(Input):
    """
    Abstract base class for the concrete file-based Inputs.

    Reads the file-selection settings from the component config and builds
    the list of input files once, at construction time.
    """

    def __init__(self, configdict, section, produces):
        """
        Read the file settings and build the input file list.

        :param configdict: configuration object, passed through to Input
        :param section: name of the config section, passed through to Input
        :param produces: Packet format this Input emits
        """
        Input.__init__(self, configdict, section, produces)

        settings = self.cfg

        # file_path: a dir, a file, or several of them comma-separated.
        self.file_path = settings.get('file_path')
        # filename_pattern: glob.glob-style pattern, default matches .gml/.xml in any case.
        self.filename_pattern = settings.get('filename_pattern', '*.[gxGX][mM][lL]')
        # depth_search: whether to recurse into directories (default: no).
        self.depth_search = settings.get_bool('depth_search', False)

        # Resolve the settings above into the list of files to be used as input.
        self.file_list = Util.make_file_list(
            self.file_path, None, self.filename_pattern, self.depth_search)
Ejemplo n.º 40
0
# -*- coding: utf-8 -*-
#
# Reads an XML file and returns XML elements.
# Based on inputs.fileinput.XmlElementStreamFileInput.
#
# Author: Frank Steggink
#
from copy import deepcopy

from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util, etree
from stetl.packet import FORMAT

log = Util.get_log('xmlelementreader')


class XmlElementReader(Filter):
    """
    Extracts XML elements from a file, outputs each feature element in Packet.
    Parsing is streaming (no internal DOM buildup) so any file size can be handled.
    Use this class for your big GML files!

    consumes=FORMAT.string, produces=FORMAT.etree_element
    """

    # Start attribute config meta
    @Config(ptype=list, default=None, required=True)
    def element_tags(self):
        """
        Comma-separated string of XML (feature) element tag names of the elements that should be extracted
Ejemplo n.º 41
0
#
# Author: Pieter Marsman - 2016

import sys
import traceback
from stetl.component import Config
from stetl.filter import Filter
from stetl.inputs.dbinput import PostgresDbInput
from stetl.packet import FORMAT
from stetl.util import Util

from dateutil import parser

from sensordefs import *

log = Util.get_log("Extractor")


class ExtractFilter(Filter):
    """
    Filter to consume single raw record with sensor (single hour) timeseries values and extract these for each component.
    Input is a single timeseries record for a single hour with all sensorvalues for a single device within that hour.
    """
    @Config(ptype=list, default=[], required=True)
    def sensor_names(self):
        """
        The output sensor names to extract.

        Required: True

        Default: []
Ejemplo n.º 42
0
# Transformation of any input using Python Templating as
# meant in: https://wiki.python.org/moin/Templating.
# A TemplatingFilter typically is configured with a template file.
# The input is typically the Template context, the variables to be substituted.
# The output is a string passed to the next Filter or Output.
#
# Author:Just van den Broecke

from stetl.util import Util, ogr, osr
from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from string import Template
import os

log = Util.get_log("templatingfilter")


class TemplatingFilter(Filter):
    """
    Abstract base class for specific template-based filters.
    See https://wiki.python.org/moin/Templating

    Subclasses implement a specific template language, such as Python
    string.Template, Mako, Genshi or Jinja2.

    consumes=FORMAT.any, produces=FORMAT.string
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.
Ejemplo n.º 43
0
    def __init__(self, configdict, section):
        """
        Set up a string-in/string-out filter and parse the format arguments.

        :param configdict: configuration object, passed through to StringFilter
        :param section: name of the config section, passed through to StringFilter
        """
        StringFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string)

        # Turn the configured format_args string into a dict, splitting on the
        # configured separator (approach: http://stackoverflow.com/a/1248990).
        raw_args = self.format_args
        arg_separator = self.separator
        self.format_args_dict = Util.string_to_dict(raw_args, arg_separator)
Ejemplo n.º 44
0
#
# Filter that deals with subfeatures in BGT GML files.
#
# Author: Frank Steggink

import os

from copy import deepcopy
# We need specifically lxml, because of the incremental XML generation
from lxml import etree
from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from stetl.util import Util

log = Util.get_log("subfeaturehandler")


class SubFeatureHandler(Filter):
    """
    This filter checks whether the data file contains the given parent features. If this is the case, the parent feature
    and subfeatures are split into different features.
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=str, default=None, required=True)
    def temp_file(self):
        """
Ejemplo n.º 45
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Output Components for deegree server storage (www.deegree.org).
#
# Author: Just van den Broecke
#
# NB deegree also supports WFS-T!
#
from stetl.postgis import PostGIS
from stetl.output import Output
from stetl.util import Util, etree
from stetl.packet import FORMAT
import os

log = Util.get_log('deegreeoutput')

class DeegreeBlobstoreOutput(Output):
    """
    Insert features into deegree Blobstore from an etree doc.

    consumes=FORMAT.etree_doc
    """
    def __init__(self, configdict, section):
        """
        Set up the Output for etree docs and read its settings.

        :param configdict: configuration object, passed through to Output
        :param section: name of the config section, passed through to Output
        """
        Output.__init__(self, configdict, section, consumes=FORMAT.etree_doc)

        settings = self.cfg
        self.overwrite = settings.get_bool('overwrite')
        # -1 is the fallback when no srid is configured.
        self.srid = settings.get_int('srid', -1)
        self.feature_member_tag = settings.get('feature_member_tag')
        # Starts out empty; filled elsewhere in the class.
        self.feature_type_ids = dict()

    def init(self):
Ejemplo n.º 46
0
# -*- coding: utf-8 -*-
#
# Filter that does nothing, just passes any data through.
#
# Author:Just van den Broecke

from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("nullfilter")


class NullFilter(Filter):
    """
    No-op Filter: every Packet is passed through untouched.
    Mainly used in Test Cases.
    """

    def __init__(self, configdict, section, consumes=FORMAT.any, produces=FORMAT.any):
        """
        Delegate construction to Filter.

        :param configdict: configuration object, passed through to Filter
        :param section: name of the config section, passed through to Filter
        :param consumes: accepted Packet format (default FORMAT.any)
        :param produces: emitted Packet format (default FORMAT.any)
        """
        Filter.__init__(self, configdict, section, consumes=consumes, produces=produces)

    def invoke(self, packet):
        """
        Return the incoming Packet as-is.

        :param packet: the incoming Packet
        :return: the same Packet
        """
        return packet
Ejemplo n.º 47
0
#!/usr/bin/env python
#
# Transformation of an etree doc with XSLT.
#
# Author:Just van den Broecke

from stetl.component import Config
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("xsltfilter")


class XsltFilter(Filter):
    """
    Invokes XSLT processor (via lxml) for given XSLT script on an etree doc.

    consumes=FORMAT.etree_doc, produces=FORMAT.etree_doc
    """
    @Config(ptype=str, required=True)
    def script(self):
        """
        Path to XSLT script file.
        """
        pass

    # Constructor
    def __init__(self, configdict, section):
        Filter.__init__(self,
                        configdict,
Ejemplo n.º 48
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Extracts data from a string using a regular expression and generates a record.
#
# Author: Frank Steggink

from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from stetl.util import Util
import re

log = Util.get_log("regexfilter")


class RegexFilter(Filter):
    """
    Extracts data from a string using a regular expression and returns the named groups as a record.

    consumes=FORMAT.string, produces=FORMAT.record
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    # Declared as a required string setting without a default.
    @Config(ptype=str, default=None, required=True)
    def pattern_string(self):
        """
        Regex pattern string. Should contain named groups.
        """
Ejemplo n.º 49
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Converts Stetl Packet FORMATs. This can be used to connect
# Stetl components with different output/input formats.
#
# Author:Just van den Broecke

from stetl.component import Config
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT
import json

log = Util.get_log("formatconverter")


class FormatConverter(Filter):
    """
    Converts (almost) any packet format (if converter available).

    consumes=FORMAT.any, produces=FORMAT.any but actual formats
    are changed at initialization based on the input to output format to
    be converted via the input_format and output_format config parameters.
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=dict, default=None, required=False)
Ejemplo n.º 50
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Input classes for ETL via GDAL OGR.
#
# Author: Just van den Broecke
#
import subprocess
from stetl.component import Config
from stetl.util import Util, gdal, ogr
from stetl.input import Input
from stetl.packet import FORMAT

log = Util.get_log('ogrinput')


class OgrInput(Input):
    """
    Direct GDAL OGR input via the Python OGR wrapper. Via the Python API (http://gdal.org/python)
    an OGR data source is accessed and the Features are read from each Layer.
    Each Layer corresponds to a "doc", so for multi-layer sources the 'end-of-doc' flag is
    set after a Layer has been read.

    This input can read almost any geospatial data format. One can use the features directly
    in a Stetl Filter or use a converter to e.g. convert to GeoJSON structures.

    produces=FORMAT.ogr_feature or FORMAT.ogr_feature_array (all features)
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
Ejemplo n.º 51
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Splits stream of XML elements into etree docs.
#
# Author: Just van den Broecke
#
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log('xmlassembler')


class XmlAssembler(Filter):
    """
    Groups a stream of etree DOM XML elements (usually Features) into etree DOM docs.
    Elements are consumed and buffered until max_elements is reached, then an etree doc is produced.

    consumes=FORMAT.etree_element_stream, produces=FORMAT.etree_doc
    """

    # XPath template selecting elements by local (namespace-less) tag name;
    # the '%s' placeholder takes the tag name.
    xpath_base = "//*[local-name() = '%s']"

    def __init__(self, configdict, section):
        """
        Set up the filter: element stream in, etree doc out.

        :param configdict: configuration object, passed through to Filter
        :param section: name of the config section, passed through to Filter
        """
        Filter.__init__(self, configdict, section,
                        consumes=FORMAT.etree_element_stream, produces=FORMAT.etree_doc)
Ejemplo n.º 52
0
# -*- coding: utf-8 -*-
#
# Output to File classes.
#
# Author: Just van den Broecke
#
from stetl.output import Output
from stetl.util import Util
from stetl.packet import FORMAT

import os

log = Util.get_log('fileoutput')

class FileOutput(Output):
    """
    Pretty print XML to file from an etree doc.

    consumes=FORMAT.etree_doc
    """

    def __init__(self, configdict, section):
        """
        Set up the Output for etree docs and log the current working directory.

        :param configdict: configuration object, passed through to Output
        :param section: name of the config section, passed through to Output
        """
        Output.__init__(self, configdict, section, consumes=FORMAT.etree_doc)
        log.info("working dir %s" % os.getcwd())

    def write(self, packet):
        """
        Write the Packet to the configured 'file_path'.

        A Packet without data is returned as-is and nothing is written.

        :param packet: the incoming Packet
        :return: the result of write_file(), or the Packet itself when it has no data
        """
        if packet.data is not None:
            # The target path is looked up in the config on every call.
            target_path = self.cfg.get('file_path')
            return self.write_file(packet, target_path)

        return packet
Ejemplo n.º 53
0
#!/usr/bin/env python
#
# Extracts data from a string using a regular expression and generates a record.
#
# Author: Frank Steggink
import re

from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from stetl.util import Util

log = Util.get_log("regexfilter")


class RegexFilter(Filter):
    """
    Extracts data from a string using a regular expression and returns the named groups as a record.

    consumes=FORMAT.string, produces=FORMAT.record
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    # Declared as a required string setting without a default.
    @Config(ptype=str, default=None, required=True)
    def pattern_string(self):
        """
        Regex pattern string. Should contain named groups.
        """
        pass
Ejemplo n.º 54
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Output classes for ETL.
#
# Author: Just van den Broecke
#
from os import sys, path
from stetl.outputs.httpoutput import HttpOutput
from stetl.util import Util
from stetl.packet import FORMAT
from stetl.component import Config
log = Util.get_log('sosoutput')

class SOSTOutput(HttpOutput):
    """
    Output via SOS-T protocol over plain HTTP.

    consumes=FORMAT.record_array
    """

    # Config property: the body is intentionally empty, the Config decorator
    # carries the declaration (string type, JSON/UTF-8 default).
    @Config(ptype=str, default='application/json;charset=UTF-8', required=True)
    def content_type(self):
        """
        The content type (for template).

        Required: True

        Default: application/json;charset=UTF-8
        """
        pass
Ejemplo n.º 55
0
# -*- coding: utf-8 -*-
#
# String filtering.
#
# Author:Just van den Broecke

from stetl.component import Config
from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("stringfilter")


class StringFilter(Filter):
    """
    Base class for any string filtering.

    Subclasses supply the actual work in filter_string(); invoke() only
    decides whether there is anything to filter.
    """

    def __init__(self, configdict, section, consumes, produces):
        """
        Delegate construction to Filter.

        :param configdict: configuration object, passed through to Filter
        :param section: name of the config section, passed through to Filter
        :param consumes: accepted Packet format
        :param produces: emitted Packet format
        """
        Filter.__init__(self, configdict, section, consumes=consumes, produces=produces)

    def invoke(self, packet):
        """
        Run filter_string() on Packets that carry data.

        :param packet: the incoming Packet
        :return: the Packet itself when it has no data, else the result of filter_string()
        """
        return packet if packet.data is None else self.filter_string(packet)

    def filter_string(self, packet):
        """
        Hook for the actual string filtering; this base version does nothing
        and returns None.

        :param packet: the Packet whose data should be filtered
        """
        pass
Ejemplo n.º 56
0
# -*- coding: utf-8 -*-
#
# Extracts a file from a ZIP file, and saves it as the given file name.
#
# Author: Frank Steggink
#
from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

log = Util.get_log('zipfileextractor')

BUFFER_SIZE = 1024 * 1024 * 1024


class ZipFileExtractor(Filter):
    """
    Extracts a file from a ZIP file, and saves it as the given file name.

    consumes=FORMAT.record, produces=FORMAT.string
    """

    # Start attribute config meta
    # Config property: the body is intentionally empty, the Config decorator
    # carries the declaration (required string, no default).
    @Config(ptype=str, default=None, required=True)
    def file_path(self):
        """
        File name to write the extracted file to.
        """
        pass
Ejemplo n.º 57
0
# -*- coding: utf-8 -*-
#
# Writes the payload of a packet as a string to a file.
# Based on outputs.fileoutput.FileOutput.
#
# Author: Frank Steggink
#
from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

import os

log = Util.get_log('packetwriter')


class PacketWriter(Filter):
    """
    Writes the payload of a packet as a string to a file.

    consumes=FORMAT.any, produces=FORMAT.string
    """

    # Start attribute config meta
    @Config(ptype=str, default=None, required=True)
    def file_path(self):
        """
        File path to write content to.

        Required: True