Example #1
0
    def __init__(self, table_name, args, dest_table):
        """Set up the dispatch handler for one table.

        table_name -- passed through to BaseHandler.__init__()
        args       -- dict of handler arguments; evaluated by get_config()
        dest_table -- destination table; replaced by args['table'] when
                      that key is present
        """

        # compat for dest-table
        dest_table = args.get('table', dest_table)

        BaseHandler.__init__(self, table_name, args, dest_table)

        # show args; formatting is left to the logger so that nothing is
        # rendered when debug output is switched off
        self.log.debug("dispatch.init: table_name=%r, args=%r",
                       table_name, args)
        # state that starts out unset
        self.batch_info = None
        self.dst_curs = None
        self.pkeys = None
        # config
        self.conf = self.get_config()
        # row handler class is selected by the configured row_mode
        hdlr_cls = ROW_HANDLERS[self.conf.row_mode]
        self.row_handler = hdlr_cls(self.log)
        # encoding validation is optional, enabled by the 'encoding' option
        if self.conf.encoding:
            self.encoding_validator = EncodingValidator(
                self.log, self.conf.encoding)
        else:
            self.encoding_validator = None
Example #2
0
    def __init__(self, table_name, args, dest_table):
        """Initialise the dispatch handler.

        table_name -- forwarded to BaseHandler.__init__()
        args       -- handler argument dict, evaluated by get_config()
        dest_table -- destination table, unless args carries a 'table' key
        """
        # compatibility: a 'table' argument, when given, takes the place
        # of dest_table
        BaseHandler.__init__(self, table_name, args,
                             args.get('table', dest_table))

        self.log.debug("dispatch.init: table_name=%r, args=%r", table_name, args)

        # nothing known about batch, cursor or primary keys yet
        self.batch_info = self.dst_curs = self.pkeys = None

        # parsed configuration drives the choice of row handler
        self.conf = self.get_config()
        self.row_handler = ROW_HANDLERS[self.conf.row_mode](self.log)

        # validator is created only when an encoding is configured
        encoding = self.conf.encoding
        self.encoding_validator = None
        if encoding:
            self.encoding_validator = EncodingValidator(self.log, encoding)
Example #3
0
class Dispatcher(BaseHandler):
    """Partitioned loader.
    Splits events into partitions, if requested.
    Then applies them without further processing.
    """
    handler_name = 'dispatch'

    def __init__(self, table_name, args, dest_table):
        """Set up the dispatch handler for one table.

        table_name -- passed through to BaseHandler.__init__()
        args       -- dict of handler arguments; evaluated by get_config()
        dest_table -- destination table; replaced by args['table'] when
                      that key is present
        """

        # compat for dest-table
        dest_table = args.get('table', dest_table)

        BaseHandler.__init__(self, table_name, args, dest_table)

        # show args; formatting is left to the logger so that nothing is
        # rendered when debug output is switched off
        self.log.debug("dispatch.init: table_name=%r, args=%r",
                       table_name, args)
        # state that starts out unset
        self.batch_info = None
        self.dst_curs = None
        self.pkeys = None
        # config
        self.conf = self.get_config()
        # row handler class is selected by the configured row_mode
        hdlr_cls = ROW_HANDLERS[self.conf.row_mode]
        self.row_handler = hdlr_cls(self.log)
        # encoding validation is optional, enabled by the 'encoding' option
        if self.conf.encoding:
            self.encoding_validator = EncodingValidator(
                self.log, self.conf.encoding)
        else:
            self.encoding_validator = None

    def get_config(self):
        """Build the handler configuration from the ``self.args`` dict.

        Enumerated options are read with get_arg(), which checks them
        against the matching module-level list of allowed values; the
        remaining options are taken from ``self.args`` directly.

        Returns a skytools.dbdict with the keys table_mode, analyze,
        row_mode, event_types, load_mode, method, skip_fields, field_map
        and encoding.  When table_mode is 'part' it also carries part_mode,
        part_field, period, part_name, part_template, pre_part, post_part
        and part_func.

        Raises Exception when part_mode 'date_field' is used without
        part_field, when event_types names an operation outside
        EVENT_TYPES, or when get_arg() rejects a value.
        """
        conf = skytools.dbdict()
        # set table mode
        conf.table_mode = self.get_arg('table_mode', TABLE_MODES)
        conf.analyze = self.get_arg('analyze', [0, 1])
        if conf.table_mode == 'part':
            conf.part_mode = self.get_arg('part_mode', PART_MODES)
            conf.part_field = self.args.get('part_field')
            if conf.part_mode == 'date_field' and not conf.part_field:
                raise Exception('part_mode date_field requires part_field!')
            conf.period = self.get_arg('period', PERIODS)
            conf.part_name = self.args.get('part_name')
            conf.part_template = self.args.get('part_template')
            conf.pre_part = self.args.get('pre_part')
            conf.post_part = self.args.get('post_part')
            conf.part_func = self.args.get('part_func', PART_FUNC_NEW)
        # set row mode and event types to process
        conf.row_mode = self.get_arg('row_mode', ROW_MODES)
        event_types = self.args.get('event_types', '*')
        if event_types == '*':
            event_types = EVENT_TYPES
        else:
            # tolerate blanks around the commas ("I, U")
            event_types = [evt.strip().upper()
                           for evt in event_types.split(',')]
            for evt in event_types:
                if evt not in EVENT_TYPES:
                    raise Exception('Unsupported operation: %s' % evt)
        conf.event_types = event_types
        # set load handler
        conf.load_mode = self.get_arg('load_mode', LOAD_MODES)
        conf.method = self.get_arg('method', METHODS)
        # fields to skip; ''.split(',') gives [''], so blank entries are
        # dropped to keep an unset option an empty (falsy) list
        skip_fields = [
            f.strip().lower()
            for f in self.args.get('skip_fields', '').split(',')
        ]
        conf.skip_fields = [f for f in skip_fields if f]
        # get fields map (obsolete, for compatibility reasons)
        # format: comma-separated 'src' or 'src:dst' items
        fields = self.args.get('fields', '*')
        if fields == "*":
            conf.field_map = None
        else:
            conf.field_map = {}
            for fval in fields.split(','):
                tmp = fval.split(':')
                if len(tmp) == 1:
                    # no rename given: field maps to itself
                    conf.field_map[tmp[0]] = tmp[0]
                else:
                    conf.field_map[tmp[0]] = tmp[1]
        # encoding validator
        conf.encoding = self.args.get('encoding')
        return conf

    def get_arg(self, name, value_list, default=None):
        """Return the validated value of handler argument ``name``.

        name       -- key looked up in ``self.args``
        value_list -- allowed values; its first item is the default when
                      ``default`` is not given
        default    -- value used when the argument is absent; it also
                      decides the type the raw value is converted to

        Raises Exception when the converted value is not in value_list.
        """
        # only a missing default falls back to value_list[0]; an explicit
        # falsy default (0, '') must be honoured
        if default is None:
            default = value_list[0]
        # args usually arrive as strings: coerce to the type of the default
        val = type(default)(self.args.get(name, default))
        if val not in value_list:
            raise Exception('Bad argument %s value %r' % (name, val))
        return val

    def reset(self):
        """Called before starting to process a batch.
        Should clean any pending data.

        This override adds nothing of its own: it only delegates to
        BaseHandler.reset().
        """
        BaseHandler.reset(self)

    def prepare_batch(self, batch_info, dst_curs):
        """Remember batch info and destination cursor for the current batch.

        Nothing is stored when table_mode is 'ignore'.
        """
        if self.conf.table_mode == 'ignore':
            return
        self.batch_info, self.dst_curs = batch_info, dst_curs
        # BaseHandler.prepare_batch() is not called from here.

    def filter_data(self, data):
        """Apply the skip list and the field map to a row dict.

        Skipped fields are removed first; then, if a field map is
        configured, the result holds exactly the mapped target names.
        The input dict is returned untouched when neither is configured.
        """
        skipped = self.conf.skip_fields
        mapping = self.conf.field_map
        if skipped:
            kept = {}
            for key, value in data.items():
                if key not in skipped:
                    kept[key] = value
            data = kept
        if mapping:
            # a mapped source field that is absent from the row ends up
            # as None (NULL) under its target name
            renamed = {}
            for src, dst in mapping.items():
                renamed[dst] = data.get(src)
            data = renamed
        return data

    def filter_pkeys(self, pkeys):
        """Apply the skip list and the field map to a list of key names.

        Skipped names are dropped; with a field map configured, names
        missing from the map are dropped too and the rest are renamed.
        """
        skipped = self.conf.skip_fields
        mapping = self.conf.field_map
        if skipped:
            kept = []
            for name in pkeys:
                if name not in skipped:
                    kept.append(name)
            pkeys = kept
        if mapping:
            renamed = []
            for name in pkeys:
                if name in mapping:
                    renamed.append(mapping[name])
            pkeys = renamed
        return pkeys

    def process_event(self, ev, sql_queue_func, arg):
        """Process one event by handing its row to the row handler.

        ev             -- event; its type must have the form
                          '<op>:<comma-separated pkeys>' with op one of
                          I, U, D
        sql_queue_func -- not used by this handler
        arg            -- not used by this handler

        Does nothing when table_mode is 'ignore' or when the operation is
        not among the configured event_types.

        Raises Exception on a malformed or unknown event type.

        NOTE(review): the event is read both through ev.type / ev.data and
        through ev.ev_type / ev.ev_data; presumably these are aliases on
        the event object -- confirm.
        """
        if self.conf.table_mode == 'ignore':
            return
        # get data
        data = skytools.db_urldecode(ev.data)
        if self.encoding_validator:
            data = self.encoding_validator.validate_dict(data, self.table_name)
        # second character must be the ':' separating op from pkey list
        if len(ev.ev_type) < 2 or ev.ev_type[1] != ':':
            raise Exception('Unsupported event type: %s/extra1=%s/data=%s' %
                            (ev.ev_type, ev.ev_extra1, ev.ev_data))
        op, pkeys = ev.type.split(':', 1)
        # op is exactly one character here, so the substring test is safe
        if op not in 'IUD':
            raise Exception('Unknown event type: %s' % ev.ev_type)
        # process only operations specified
        if op not in self.conf.event_types:
            return
        # runs once per event: let the logger format, so nothing is
        # rendered when debug output is off
        self.log.debug('dispatch.process_event: %s/%s',
                       ev.ev_type, ev.ev_data)
        # pkeys are taken from the first processed event and then reused
        if self.pkeys is None:
            self.pkeys = self.filter_pkeys(pkeys.split(','))
        data = self.filter_data(data)
        # prepare split table when needed
        if self.conf.table_mode == 'part':
            dst, part_time = self.split_format(ev, data)
            # a table unknown to the row handler may not exist yet
            if dst not in self.row_handler.table_map:
                self.check_part(dst, part_time)
        else:
            dst = self.dest_table

        if dst not in self.row_handler.table_map:
            self.row_handler.add_table(dst, LOADERS[self.conf.load_mode],
                                       self.pkeys, self.conf)
        self.row_handler.process(dst, op, data)
        # BaseHandler.process_event() is not called from here.

    def finish_batch(self, batch_info, dst_curs):
        """Flush the row handler through dst_curs at the end of a batch.

        Does nothing when table_mode is 'ignore'; batch_info is not used.
        """
        if self.conf.table_mode == 'ignore':
            return
        self.row_handler.flush(dst_curs)
        # BaseHandler.finish_batch() is not called from here.

    def get_part_name(self):
        """Return the %-style template used to name partition tables.

        A configured part_name wins.  Otherwise the template is 'parent'
        followed by the date parts from 'year' down to the configured
        period, joined with underscores.
        """
        custom = self.conf.part_name
        if custom:
            return custom
        units = ['year', 'month', 'day', 'hour']
        # number of date parts needed to reach the configured period
        depth = units.index(self.conf.period) + 1
        placeholders = ['%(parent)s']
        for unit in units[:depth]:
            placeholders.append('%%(%s)s' % unit)
        return '_'.join(placeholders)

    def split_format(self, ev, data):
        """Work out the partition table name for one event.

        The partition timestamp comes from the source selected by
        part_mode: the batch end, the event time, the current time or
        a date field of the row.

        Returns a tuple (partition table name, partition timestamp).
        Raises Exception when the date field is NULL and UsageError on
        an unknown part_mode.
        """
        mode = self.conf.part_mode
        if mode == 'date_field':
            field = self.conf.part_field
            raw = data[field]
            if raw is None:
                raise Exception('part_field(%s) is NULL: %s' %
                                (self.conf.part_field, ev))
            # only the first 19 characters (up to the seconds) are parsed
            dtm = datetime.datetime.strptime(raw[:19], "%Y-%m-%d %H:%M:%S")
        elif mode == 'current_time':
            dtm = datetime.datetime.now()
        elif mode == 'event_time':
            dtm = ev.ev_time
        elif mode == 'batch_time':
            dtm = self.batch_info['batch_end']
        else:
            raise UsageError('Bad value for part_mode: %s' % mode)

        # values available to the name template
        vals = {'parent': self.dest_table}
        for unit, fmt in (('year', "%04d"), ('month', "%02d"),
                          ('day', "%02d"), ('hour', "%02d")):
            vals[unit] = fmt % getattr(dtm, unit)
        template = self.get_part_name()
        return (template % vals, dtm)

    def check_part(self, dst, part_time):
        """Create the partition table ``dst`` unless it already exists.

        Creation strategy, first match wins:

        1. if part_template is configured, execute it;
        2. else if the configured part function exists in the database,
           call it (when the configured function is PART_FUNC_NEW and it
           is missing, PART_FUNC_OLD is tried as well);
        3. else clone the master table.

        The optional pre_part / post_part templates are executed before
        and after whichever strategy is used.  Templates are expanded with
        the ``%`` operator against the ``vals`` dict built below.

        dst       -- partition table name, not yet quoted
        part_time -- partition timestamp, exposed as 'part_time'
        """
        curs = self.dst_curs
        if skytools.exists_table(curs, dst):
            return
        dst = quote_fqident(dst)
        vals = {
            'dest': dst,
            'part': dst,
            'parent': self.fq_dest_table,
            'pkeys': ",".join(self.pkeys),  # quoting?
            # we do this to make sure that constraints for
            # tables who contain a schema will still work
            'schema_table': dst.replace(".", "__"),
            'part_field': self.conf.part_field,
            'part_time': part_time,
            'period': self.conf.period,
        }

        def exec_with_vals(tmpl):
            # expand and run a template; report whether one was given
            if tmpl:
                sql = tmpl % vals
                curs.execute(sql)
                return True
            return False

        exec_with_vals(self.conf.pre_part)

        if not exec_with_vals(self.conf.part_template):
            self.log.debug('part_template not provided, using part func')
            # if part func exists call it with val arguments
            pfargs = ', '.join('%%(%s)s' % arg for arg in PART_FUNC_ARGS)

            # set up configured function
            pfcall = 'select %s(%s)' % (self.conf.part_func, pfargs)
            have_func = skytools.exists_function(curs, self.conf.part_func,
                                                 len(PART_FUNC_ARGS))

            # backwards compat
            if not have_func and self.conf.part_func == PART_FUNC_NEW:
                pfcall = 'select %s(%s)' % (PART_FUNC_OLD, pfargs)
                have_func = skytools.exists_function(curs, PART_FUNC_OLD,
                                                     len(PART_FUNC_ARGS))

            if have_func:
                # log arguments are handed to the logger unformatted
                self.log.debug('check_part.exec: func:%s, args: %s',
                               pfcall, vals)
                # placeholders in pfcall are filled in by the cursor
                curs.execute(pfcall, vals)
            else:
                #
                # Otherwise create simple clone.
                #
                # FixMe: differences from create_partitions():
                # - check constraints
                # - inheritance
                #
                self.log.debug('part func %s not found, cloning table',
                               self.conf.part_func)
                struct = TableStruct(curs, self.dest_table)
                struct.create(curs, T_ALL, dst)

        exec_with_vals(self.conf.post_part)
        self.log.info("Created table: %s", dst)

    def real_copy(self, tablename, src_curs, dst_curs, column_list):
        """Copy the whole table and return what skytools.full_copy() returns.

        The source column list is reduced by skip_fields and, when a field
        map is configured, limited to mapped columns; destination columns
        are the mapped names.  With an encoding validator configured, the
        copied data is passed through its validate_copy() hook.
        """
        src_cols = column_list
        skipped = self.conf.skip_fields
        if skipped:
            src_cols = []
            for name in column_list:
                if name not in skipped:
                    src_cols.append(name)

        # without a field map, destination columns equal source columns
        dst_cols = src_cols
        fmap = self.conf.field_map
        if fmap:
            src_cols = [name for name in src_cols if name in fmap]
            dst_cols = []
            for name in src_cols:
                dst_cols.append(fmap[name])

        _write_hook = None
        if self.encoding_validator:

            def _write_hook(obj, data):
                return self.encoding_validator.validate_copy(
                    data, src_cols, tablename)

        # the empty string is the (unused) copy condition
        return skytools.full_copy(tablename, src_curs, dst_curs, src_cols, '',
                                  dst_tablename=self.dest_table,
                                  dst_column_list=dst_cols,
                                  write_hook=_write_hook)
Example #4
0
class Dispatcher(BaseHandler):
    """Partitioned loader.
    Splits events into partitions, if requested.
    Then applies them without further processing.
    """
    handler_name = 'dispatch'

    def __init__(self, table_name, args, dest_table):
        """Set up the dispatch handler for one table.

        table_name -- passed through to BaseHandler.__init__()
        args       -- dict of handler arguments; evaluated by get_config()
        dest_table -- destination table; replaced by args['table'] when
                      that key is present
        """

        # compat for dest-table
        dest_table = args.get('table', dest_table)

        BaseHandler.__init__(self, table_name, args, dest_table)

        # show args; formatting is left to the logger so that nothing is
        # rendered when debug output is switched off
        self.log.debug("dispatch.init: table_name=%r, args=%r",
                       table_name, args)
        # state that starts out unset
        self.batch_info = None
        self.dst_curs = None
        self.pkeys = None
        # config
        self.conf = self.get_config()
        # row handler class is selected by the configured row_mode
        hdlr_cls = ROW_HANDLERS[self.conf.row_mode]
        self.row_handler = hdlr_cls(self.log)
        # encoding validation is optional, enabled by the 'encoding' option
        if self.conf.encoding:
            self.encoding_validator = EncodingValidator(self.log,
                                                        self.conf.encoding)
        else:
            self.encoding_validator = None

    def _parse_args_from_doc(self):
        """Extract handler-argument descriptions from the module docstring.

        Scanning starts after the line '== HANDLER ARGUMENTS ==' and stops
        at the next line beginning with '=='.  A line of the form 'name:'
        opens a new entry; every following line is stripped and appended,
        newline-terminated, to the text of the entry opened last.  Lines
        seen before the first 'name:' line are skipped.

        Returns a list of (name, expr, text) tuples; expr is always the
        empty string here.
        """
        doc = __doc__
        params_descr = []
        params_found = False
        for line in doc.splitlines():
            ln = line.strip()
            if params_found:
                if ln.startswith("=="):
                    break
                # raw string: '\w' in a plain literal is an invalid escape
                m = re.match(r"^(\w+):$", ln)
                if m:
                    name = m.group(1)
                    expr = text = ""
                elif not params_descr:
                    continue
                else:
                    # continuation line: extend the most recent entry
                    name, expr, text = params_descr.pop()
                    text += ln + "\n"
                params_descr.append((name, expr, text))
            elif ln == "== HANDLER ARGUMENTS ==":
                params_found = True
        return params_descr

    def get_config(self):
        """Build the handler configuration from the ``self.args`` dict.

        Enumerated options are read with get_arg(), which checks them
        against the matching module-level list of allowed values; the
        remaining options are taken from ``self.args`` directly.

        Returns a skytools.dbdict with the keys table_mode, analyze,
        row_mode, event_types, load_mode, method, skip_fields, field_map
        and encoding.  When table_mode is 'part' it also carries part_mode,
        part_field, period, part_name, part_template, pre_part, post_part
        and part_func.

        Raises Exception when part_mode 'date_field' is used without
        part_field, when event_types names an operation outside
        EVENT_TYPES, or when get_arg() rejects a value.
        """
        conf = skytools.dbdict()
        # set table mode
        conf.table_mode = self.get_arg('table_mode', TABLE_MODES)
        conf.analyze = self.get_arg('analyze', [0, 1])
        if conf.table_mode == 'part':
            conf.part_mode = self.get_arg('part_mode', PART_MODES)
            conf.part_field = self.args.get('part_field')
            if conf.part_mode == 'date_field' and not conf.part_field:
                raise Exception('part_mode date_field requires part_field!')
            conf.period = self.get_arg('period', PERIODS)
            conf.part_name = self.args.get('part_name')
            conf.part_template = self.args.get('part_template')
            conf.pre_part = self.args.get('pre_part')
            conf.post_part = self.args.get('post_part')
            conf.part_func = self.args.get('part_func', PART_FUNC_NEW)
        # set row mode and event types to process
        conf.row_mode = self.get_arg('row_mode', ROW_MODES)
        event_types = self.args.get('event_types', '*')
        if event_types == '*':
            event_types = EVENT_TYPES
        else:
            # tolerate blanks around the commas ("I, U")
            event_types = [evt.strip().upper()
                           for evt in event_types.split(',')]
            for evt in event_types:
                if evt not in EVENT_TYPES:
                    raise Exception('Unsupported operation: %s' % evt)
        conf.event_types = event_types
        # set load handler
        conf.load_mode = self.get_arg('load_mode', LOAD_MODES)
        conf.method = self.get_arg('method', METHODS)
        # fields to skip; ''.split(',') gives [''], so blank entries are
        # dropped to keep an unset option an empty (falsy) list
        skip_fields = [f.strip().lower()
                       for f in self.args.get('skip_fields', '').split(',')]
        conf.skip_fields = [f for f in skip_fields if f]
        # get fields map (obsolete, for compatibility reasons)
        # format: comma-separated 'src' or 'src:dst' items
        fields = self.args.get('fields', '*')
        if fields == "*":
            conf.field_map = None
        else:
            conf.field_map = {}
            for fval in fields.split(','):
                tmp = fval.split(':')
                if len(tmp) == 1:
                    # no rename given: field maps to itself
                    conf.field_map[tmp[0]] = tmp[0]
                else:
                    conf.field_map[tmp[0]] = tmp[1]
        # encoding validator
        conf.encoding = self.args.get('encoding')
        return conf

    def get_arg(self, name, value_list, default=None):
        """Return the validated value of handler argument ``name``.

        name       -- key looked up in ``self.args``
        value_list -- allowed values; its first item is the default when
                      ``default`` is not given
        default    -- value used when the argument is absent; it also
                      decides the type the raw value is converted to

        Raises Exception when the converted value is not in value_list.
        """
        # only a missing default falls back to value_list[0]; an explicit
        # falsy default (0, '') must be honoured
        if default is None:
            default = value_list[0]
        # args usually arrive as strings: coerce to the type of the default
        val = type(default)(self.args.get(name, default))
        if val not in value_list:
            raise Exception('Bad argument %s value %r' % (name, val))
        return val

    def reset(self):
        """Called before starting to process a batch.
        Should clean any pending data.

        This override adds nothing of its own: it only delegates to
        BaseHandler.reset().
        """
        BaseHandler.reset(self)

    def prepare_batch(self, batch_info, dst_curs):
        """Remember batch info and destination cursor for the current batch.

        Nothing is stored when table_mode is 'ignore'.
        """
        if self.conf.table_mode == 'ignore':
            return
        self.batch_info, self.dst_curs = batch_info, dst_curs
        # BaseHandler.prepare_batch() is not called from here.

    def filter_data(self, data):
        """Apply the skip list and the field map to a row dict.

        Skipped fields are removed first; then, if a field map is
        configured, the result holds exactly the mapped target names.
        The input dict is returned untouched when neither is configured.
        """
        skipped = self.conf.skip_fields
        mapping = self.conf.field_map
        if skipped:
            kept = {}
            for key, value in data.items():
                if key not in skipped:
                    kept[key] = value
            data = kept
        if mapping:
            # a mapped source field that is absent from the row ends up
            # as None (NULL) under its target name
            renamed = {}
            for src, dst in mapping.items():
                renamed[dst] = data.get(src)
            data = renamed
        return data

    def filter_pkeys(self, pkeys):
        """Apply the skip list and the field map to a list of key names.

        Skipped names are dropped; with a field map configured, names
        missing from the map are dropped too and the rest are renamed.
        """
        skipped = self.conf.skip_fields
        mapping = self.conf.field_map
        if skipped:
            kept = []
            for name in pkeys:
                if name not in skipped:
                    kept.append(name)
            pkeys = kept
        if mapping:
            renamed = []
            for name in pkeys:
                if name in mapping:
                    renamed.append(mapping[name])
            pkeys = renamed
        return pkeys

    def process_event(self, ev, sql_queue_func, arg):
        """Process one event by handing its row to the row handler.

        ev             -- event; its type must have the form
                          '<op>:<comma-separated pkeys>' with op one of
                          I, U, D
        sql_queue_func -- not used by this handler
        arg            -- not used by this handler

        Does nothing when table_mode is 'ignore' or when the operation is
        not among the configured event_types.

        Raises Exception on a malformed or unknown event type.

        NOTE(review): the event is read both through ev.type / ev.data and
        through ev.ev_type / ev.ev_data; presumably these are aliases on
        the event object -- confirm.
        """
        if self.conf.table_mode == 'ignore':
            return
        # get data
        data = skytools.db_urldecode(ev.data)
        if self.encoding_validator:
            data = self.encoding_validator.validate_dict(data, self.table_name)
        # second character must be the ':' separating op from pkey list
        if len(ev.ev_type) < 2 or ev.ev_type[1] != ':':
            raise Exception('Unsupported event type: %s/extra1=%s/data=%s' % (
                            ev.ev_type, ev.ev_extra1, ev.ev_data))
        op, pkeys = ev.type.split(':', 1)
        # op is exactly one character here, so the substring test is safe
        if op not in 'IUD':
            raise Exception('Unknown event type: %s' % ev.ev_type)
        # process only operations specified
        if op not in self.conf.event_types:
            return
        # runs once per event: let the logger format, so nothing is
        # rendered when debug output is off
        self.log.debug('dispatch.process_event: %s/%s',
                       ev.ev_type, ev.ev_data)
        # pkeys are taken from the first processed event and then reused
        if self.pkeys is None:
            self.pkeys = self.filter_pkeys(pkeys.split(','))
        data = self.filter_data(data)
        # prepare split table when needed
        if self.conf.table_mode == 'part':
            dst, part_time = self.split_format(ev, data)
            # a table unknown to the row handler may not exist yet
            if dst not in self.row_handler.table_map:
                self.check_part(dst, part_time)
        else:
            dst = self.dest_table

        if dst not in self.row_handler.table_map:
            self.row_handler.add_table(dst, LOADERS[self.conf.load_mode],
                                       self.pkeys, self.conf)
        self.row_handler.process(dst, op, data)
        # BaseHandler.process_event() is not called from here.

    def finish_batch(self, batch_info, dst_curs):
        """Flush the row handler through dst_curs at the end of a batch.

        Does nothing when table_mode is 'ignore'; batch_info is not used.
        """
        if self.conf.table_mode == 'ignore':
            return
        self.row_handler.flush(dst_curs)
        # BaseHandler.finish_batch() is not called from here.

    def get_part_name(self):
        """Return the %-style template used to name partition tables.

        A configured part_name wins.  Otherwise the template is 'parent'
        followed by the date parts from 'year' down to the configured
        period, joined with underscores.
        """
        custom = self.conf.part_name
        if custom:
            return custom
        units = ['year', 'month', 'day', 'hour']
        # number of date parts needed to reach the configured period
        depth = units.index(self.conf.period) + 1
        placeholders = ['%(parent)s']
        for unit in units[:depth]:
            placeholders.append('%%(%s)s' % unit)
        return '_'.join(placeholders)

    def split_format(self, ev, data):
        """Work out the partition table name for one event.

        The partition timestamp comes from the source selected by
        part_mode: the batch end, the event time, the current time or
        a date field of the row.

        Returns a tuple (partition table name, partition timestamp).
        Raises Exception when the date field is NULL and UsageError on
        an unknown part_mode.
        """
        mode = self.conf.part_mode
        if mode == 'date_field':
            field = self.conf.part_field
            raw = data[field]
            if raw is None:
                raise Exception('part_field(%s) is NULL: %s' %
                                (self.conf.part_field, ev))
            # only the first 19 characters (up to the seconds) are parsed
            dtm = datetime.datetime.strptime(raw[:19], "%Y-%m-%d %H:%M:%S")
        elif mode == 'current_time':
            dtm = datetime.datetime.now()
        elif mode == 'event_time':
            dtm = ev.ev_time
        elif mode == 'batch_time':
            dtm = self.batch_info['batch_end']
        else:
            raise UsageError('Bad value for part_mode: %s' % mode)

        # values available to the name template
        vals = {'parent': self.dest_table}
        for unit, fmt in (('year', "%04d"), ('month', "%02d"),
                          ('day', "%02d"), ('hour', "%02d")):
            vals[unit] = fmt % getattr(dtm, unit)
        template = self.get_part_name()
        return (template % vals, dtm)

    def check_part(self, dst, part_time):
        """Create the partition table ``dst`` unless it already exists.

        Creation strategy, first match wins:

        1. if part_template is configured, execute it;
        2. else if the configured part function exists in the database,
           call it (when the configured function is PART_FUNC_NEW and it
           is missing, PART_FUNC_OLD is tried as well);
        3. else clone the master table.

        The optional pre_part / post_part templates are executed before
        and after whichever strategy is used.  Templates are expanded with
        the ``%`` operator against the ``vals`` dict built below.

        dst       -- partition table name, not yet quoted
        part_time -- partition timestamp, exposed as 'part_time'
        """
        curs = self.dst_curs
        if skytools.exists_table(curs, dst):
            return
        dst = quote_fqident(dst)
        vals = {'dest': dst,
                'part': dst,
                'parent': self.fq_dest_table,
                'pkeys': ",".join(self.pkeys), # quoting?
                # we do this to make sure that constraints for
                # tables who contain a schema will still work
                'schema_table': dst.replace(".", "__"),
                'part_field': self.conf.part_field,
                'part_time': part_time,
                'period': self.conf.period,
                }

        def exec_with_vals(tmpl):
            # expand and run a template; report whether one was given
            if tmpl:
                sql = tmpl % vals
                curs.execute(sql)
                return True
            return False

        exec_with_vals(self.conf.pre_part)

        if not exec_with_vals(self.conf.part_template):
            self.log.debug('part_template not provided, using part func')
            # if part func exists call it with val arguments
            pfargs = ', '.join('%%(%s)s' % arg for arg in PART_FUNC_ARGS)

            # set up configured function
            pfcall = 'select %s(%s)' % (self.conf.part_func, pfargs)
            have_func = skytools.exists_function(curs, self.conf.part_func,
                                                 len(PART_FUNC_ARGS))

            # backwards compat
            if not have_func and self.conf.part_func == PART_FUNC_NEW:
                pfcall = 'select %s(%s)' % (PART_FUNC_OLD, pfargs)
                have_func = skytools.exists_function(curs, PART_FUNC_OLD,
                                                     len(PART_FUNC_ARGS))

            if have_func:
                # log arguments are handed to the logger unformatted
                self.log.debug('check_part.exec: func:%s, args: %s',
                               pfcall, vals)
                # placeholders in pfcall are filled in by the cursor
                curs.execute(pfcall, vals)
            else:
                #
                # Otherwise create simple clone.
                #
                # FixMe: differences from create_partitions():
                # - check constraints
                # - inheritance
                #
                self.log.debug('part func %s not found, cloning table',
                               self.conf.part_func)
                struct = TableStruct(curs, self.dest_table)
                struct.create(curs, T_ALL, dst)

        exec_with_vals(self.conf.post_part)
        self.log.info("Created table: %s", dst)

    def real_copy(self, tablename, src_curs, dst_curs, column_list):
        """Copy the whole table and return what skytools.full_copy() returns.

        The source column list is reduced by skip_fields and, when a field
        map is configured, limited to mapped columns; destination columns
        are the mapped names.  With an encoding validator configured, the
        copied data is passed through its validate_copy() hook.
        """
        src_cols = column_list
        skipped = self.conf.skip_fields
        if skipped:
            src_cols = []
            for name in column_list:
                if name not in skipped:
                    src_cols.append(name)

        # without a field map, destination columns equal source columns
        dst_cols = src_cols
        fmap = self.conf.field_map
        if fmap:
            src_cols = [name for name in src_cols if name in fmap]
            dst_cols = []
            for name in src_cols:
                dst_cols.append(fmap[name])

        _write_hook = None
        if self.encoding_validator:
            def _write_hook(obj, data):
                return self.encoding_validator.validate_copy(
                    data, src_cols, tablename)

        # the empty string is the (unused) copy condition
        return skytools.full_copy(tablename, src_curs, dst_curs, src_cols, '',
                                  dst_tablename=self.dest_table,
                                  dst_column_list=dst_cols,
                                  write_hook=_write_hook)