def __init__(self, table_name, args, dest_table):
    """Initialise handler state and parse the handler configuration.

    table_name -- source table name, passed on to BaseHandler.__init__
    args       -- mapping of handler arguments; a 'table' entry, when
                  present, overrides dest_table
    dest_table -- destination table used when args has no 'table' entry
    """
    # compat for dest-table
    dest_table = args.get('table', dest_table)

    BaseHandler.__init__(self, table_name, args, dest_table)

    # show args; formatting is deferred to the logger so nothing is
    # rendered when debug output is disabled
    self.log.debug("dispatch.init: table_name=%r, args=%r",
                   table_name, args)

    self.batch_info = None
    self.dst_curs = None
    self.pkeys = None

    # config
    self.conf = self.get_config()
    hdlr_cls = ROW_HANDLERS[self.conf.row_mode]
    self.row_handler = hdlr_cls(self.log)

    # encoding validation is optional and only enabled when configured
    if self.conf.encoding:
        self.encoding_validator = EncodingValidator(
            self.log, self.conf.encoding)
    else:
        self.encoding_validator = None
def __init__(self, table_name, args, dest_table):
    """Set up the handler: resolve the destination table, parse the
    configuration and create the row handler.

    A 'table' entry in args takes precedence over dest_table.
    """
    # compat for dest-table: explicit 'table' argument wins
    BaseHandler.__init__(self, table_name, args,
                         args.get('table', dest_table))

    # show args
    self.log.debug("dispatch.init: table_name=%r, args=%r", table_name, args)

    # state that is not known yet at construction time
    self.pkeys = None
    self.dst_curs = None
    self.batch_info = None

    # config
    self.conf = self.get_config()
    self.row_handler = ROW_HANDLERS[self.conf.row_mode](self.log)

    # validator exists only when an encoding is configured
    encoding = self.conf.encoding
    self.encoding_validator = (
        EncodingValidator(self.log, encoding) if encoding else None)
class Dispatcher(BaseHandler):
    """Partitioned loader.

    Splits events into partitions, if requested.
    Then applies them without further processing.
    """
    handler_name = 'dispatch'

    def __init__(self, table_name, args, dest_table):
        """Initialise handler state and parse the handler configuration.

        table_name -- source table name, passed on to BaseHandler.__init__
        args       -- mapping of handler arguments; a 'table' entry, when
                      present, overrides dest_table
        dest_table -- destination table used when args has no 'table' entry
        """
        # compat for dest-table
        dest_table = args.get('table', dest_table)

        BaseHandler.__init__(self, table_name, args, dest_table)

        # show args (formatting deferred to the logger)
        self.log.debug("dispatch.init: table_name=%r, args=%r",
                       table_name, args)

        self.batch_info = None  # set by prepare_batch()
        self.dst_curs = None    # set by prepare_batch()
        self.pkeys = None       # set on first processed event

        # config
        self.conf = self.get_config()
        hdlr_cls = ROW_HANDLERS[self.conf.row_mode]
        self.row_handler = hdlr_cls(self.log)
        if self.conf.encoding:
            self.encoding_validator = EncodingValidator(
                self.log, self.conf.encoding)
        else:
            self.encoding_validator = None

    def get_config(self):
        """Process the args dict and return the resulting config object.

        Raises Exception on an unsupported argument value, on an unknown
        event type, or when part_mode is 'date_field' without part_field.
        """
        conf = skytools.dbdict()

        # set table mode
        conf.table_mode = self.get_arg('table_mode', TABLE_MODES)
        conf.analyze = self.get_arg('analyze', [0, 1])
        if conf.table_mode == 'part':
            conf.part_mode = self.get_arg('part_mode', PART_MODES)
            conf.part_field = self.args.get('part_field')
            if conf.part_mode == 'date_field' and not conf.part_field:
                raise Exception('part_mode date_field requires part_field!')
            conf.period = self.get_arg('period', PERIODS)
            conf.part_name = self.args.get('part_name')
            conf.part_template = self.args.get('part_template')
            conf.pre_part = self.args.get('pre_part')
            conf.post_part = self.args.get('post_part')
            conf.part_func = self.args.get('part_func', PART_FUNC_NEW)

        # set row mode and event types to process
        conf.row_mode = self.get_arg('row_mode', ROW_MODES)
        event_types = self.args.get('event_types', '*')
        if event_types == '*':
            event_types = EVENT_TYPES
        else:
            event_types = [evt.upper() for evt in event_types.split(',')]
            for evt in event_types:
                if evt not in EVENT_TYPES:
                    raise Exception('Unsupported operation: %s' % evt)
        conf.event_types = event_types

        # set load handler
        conf.load_mode = self.get_arg('load_mode', LOAD_MODES)
        conf.method = self.get_arg('method', METHODS)

        # fields to skip; empty entries are dropped because ''.split(',')
        # returns [''], which would make the list truthy even when no
        # skip_fields were configured
        skip_names = [
            f.strip().lower()
            for f in self.args.get('skip_fields', '').split(',')
        ]
        conf.skip_fields = [name for name in skip_names if name]

        # get fields map (obsolete, for compatibility reasons)
        # format: "src1:dst1,src2" - a bare name maps to itself
        fields = self.args.get('fields', '*')
        if fields == "*":
            conf.field_map = None
        else:
            conf.field_map = {}
            for fval in fields.split(','):
                tmp = fval.split(':')
                if len(tmp) == 1:
                    conf.field_map[tmp[0]] = tmp[0]
                else:
                    conf.field_map[tmp[0]] = tmp[1]

        # encoding validator
        conf.encoding = self.args.get('encoding')
        return conf

    def get_arg(self, name, value_list, default=None):
        """Return handler argument `name`, validated against value_list.

        When default is None the first element of value_list is the
        default.  The raw argument is converted to the type of the
        default before validation.

        Raises Exception when the converted value is not in value_list.
        """
        # 'is None' instead of 'or': a falsy default (0, '') is a valid
        # explicit choice and must not be replaced by value_list[0]
        if default is None:
            default = value_list[0]
        val = type(default)(self.args.get(name, default))
        if val not in value_list:
            raise Exception('Bad argument %s value %r' % (name, val))
        return val

    def reset(self):
        """Called before starting to process a batch.
        Should clean any pending data."""
        BaseHandler.reset(self)

    def prepare_batch(self, batch_info, dst_curs):
        """Called on first event for this table in current batch.

        Remembers batch_info and dst_curs unless table_mode is 'ignore'.
        """
        if self.conf.table_mode != 'ignore':
            self.batch_info = batch_info
            self.dst_curs = dst_curs
        #BaseHandler.prepare_batch(self, batch_info, dst_curs)

    def filter_data(self, data):
        """Apply skip_fields and field_map to a row dict and return the
        resulting dict."""
        fskip = self.conf.skip_fields
        fmap = self.conf.field_map
        if fskip:
            data = dict((k, v) for k, v in data.items()
                        if k not in fskip)
        if fmap:
            # when field name not present in source is used then None (NULL)
            # value is inserted. is it ok?
            data = dict((v, data.get(k)) for k, v in fmap.items())
        return data

    def filter_pkeys(self, pkeys):
        """Apply skip_fields and field_map to a list of primary key names
        and return the resulting list."""
        fskip = self.conf.skip_fields
        fmap = self.conf.field_map
        if fskip:
            pkeys = [f for f in pkeys if f not in fskip]
        if fmap:
            # keys missing from the map are dropped, the rest are renamed
            pkeys = [fmap[p] for p in pkeys if p in fmap]
        return pkeys

    def process_event(self, ev, sql_queue_func, arg):
        """Process an event.

        Event should be added to sql_queue or executed directly.
        Here the row is handed to the row handler; sql_queue_func and
        arg are not used.

        Raises Exception when the event type is not of the form
        '<op>:<pkeys>' with op being one of I, U, D.
        """
        if self.conf.table_mode == 'ignore':
            return

        # get data
        data = skytools.db_urldecode(ev.data)
        if self.encoding_validator:
            data = self.encoding_validator.validate_dict(data, self.table_name)
        if len(ev.ev_type) < 2 or ev.ev_type[1] != ':':
            raise Exception('Unsupported event type: %s/extra1=%s/data=%s' % (
                ev.ev_type, ev.ev_extra1, ev.ev_data))
        op, pkeys = ev.type.split(':', 1)
        if op not in 'IUD':
            raise Exception('Unknown event type: %s' % ev.ev_type)

        # process only operations specified
        if op not in self.conf.event_types:
            return
        self.log.debug('dispatch.process_event: %s/%s',
                       ev.ev_type, ev.ev_data)

        # primary keys are taken from the first processed event only
        if self.pkeys is None:
            self.pkeys = self.filter_pkeys(pkeys.split(','))
        data = self.filter_data(data)

        # prepare split table when needed
        if self.conf.table_mode == 'part':
            dst, part_time = self.split_format(ev, data)
            if dst not in self.row_handler.table_map:
                self.check_part(dst, part_time)
        else:
            dst = self.dest_table

        # register destination with the row handler on first use
        if dst not in self.row_handler.table_map:
            self.row_handler.add_table(dst, LOADERS[self.conf.load_mode],
                                       self.pkeys, self.conf)
        self.row_handler.process(dst, op, data)
        #BaseHandler.process_event(self, ev, sql_queue_func, arg)

    def finish_batch(self, batch_info, dst_curs):
        """Called when batch finishes.

        Flushes the row handler to dst_curs unless table_mode is 'ignore'.
        """
        if self.conf.table_mode != 'ignore':
            self.row_handler.flush(dst_curs)
        #BaseHandler.finish_batch(self, batch_info, dst_curs)

    def get_part_name(self):
        """Return the %-style template used to build part table names.

        Without a custom part_name the template is 'parent' followed by
        the date parts up to and including the configured period, joined
        with '_' (e.g. period 'month' gives
        '%(parent)s_%(year)s_%(month)s').
        """
        # if custom part name template given, use it
        if self.conf.part_name:
            return self.conf.part_name
        parts = ['year', 'month', 'day', 'hour']
        name_parts = ['parent'] + parts[:parts.index(self.conf.period) + 1]
        return '_'.join('%%(%s)s' % part for part in name_parts)

    def split_format(self, ev, data):
        """Generate the part table name from the name template.

        Returns a tuple (part_table_name, part_time) where part_time is
        the timestamp selected according to part_mode.

        Raises Exception when part_mode is 'date_field' and the field
        value is None; raises UsageError on an unknown part_mode.
        """
        if self.conf.part_mode == 'batch_time':
            dtm = self.batch_info['batch_end']
        elif self.conf.part_mode == 'event_time':
            dtm = ev.ev_time
        elif self.conf.part_mode == 'current_time':
            dtm = datetime.datetime.now()
        elif self.conf.part_mode == 'date_field':
            dt_str = data[self.conf.part_field]
            if dt_str is None:
                raise Exception('part_field(%s) is NULL: %s' % (
                    self.conf.part_field, ev))
            # only the first 19 chars ('YYYY-MM-DD HH:MM:SS') are parsed;
            # anything after the seconds is ignored
            dtm = datetime.datetime.strptime(dt_str[:19], "%Y-%m-%d %H:%M:%S")
        else:
            raise UsageError('Bad value for part_mode: %s' %
                             self.conf.part_mode)

        vals = {
            'parent': self.dest_table,
            'year': "%04d" % dtm.year,
            'month': "%02d" % dtm.month,
            'day': "%02d" % dtm.day,
            'hour': "%02d" % dtm.hour,
        }
        return (self.get_part_name() % vals, dtm)

    def check_part(self, dst, part_time):
        """Create part table if not exists.

        If part_template present, execute it
        else if part function present in db, call it
        else clone master table.

        pre_part and post_part templates, when configured, are executed
        before and after the creation step.
        """
        curs = self.dst_curs
        if skytools.exists_table(curs, dst):
            return

        dst = quote_fqident(dst)
        vals = {
            'dest': dst,
            'part': dst,
            'parent': self.fq_dest_table,
            'pkeys': ",".join(self.pkeys),  # quoting?
            # we do this to make sure that constraints for
            # tables who contain a schema will still work
            'schema_table': dst.replace(".", "__"),
            'part_field': self.conf.part_field,
            'part_time': part_time,
            'period': self.conf.period,
        }

        def exec_with_vals(tmpl):
            """Execute SQL template with vals; return True if it ran."""
            if tmpl:
                # NOTE: values are interpolated textually into the
                # configured template before execution
                sql = tmpl % vals
                curs.execute(sql)
                return True
            return False

        exec_with_vals(self.conf.pre_part)

        if not exec_with_vals(self.conf.part_template):
            self.log.debug('part_template not provided, using part func')
            # if part func exists call it with val arguments
            pfargs = ', '.join('%%(%s)s' % arg for arg in PART_FUNC_ARGS)

            # set up configured function
            pfcall = 'select %s(%s)' % (self.conf.part_func, pfargs)
            have_func = skytools.exists_function(
                curs, self.conf.part_func, len(PART_FUNC_ARGS))

            # backwards compat
            if not have_func and self.conf.part_func == PART_FUNC_NEW:
                pfcall = 'select %s(%s)' % (PART_FUNC_OLD, pfargs)
                have_func = skytools.exists_function(
                    curs, PART_FUNC_OLD, len(PART_FUNC_ARGS))

            if have_func:
                self.log.debug('check_part.exec: func:%s, args: %s',
                               pfcall, vals)
                curs.execute(pfcall, vals)
            else:
                #
                # Otherwise create simple clone.
                #
                # FixMe: differences from create_partitions():
                # - check constraints
                # - inheritance
                #
                self.log.debug('part func %s not found, cloning table',
                               self.conf.part_func)
                struct = TableStruct(curs, self.dest_table)
                struct.create(curs, T_ALL, dst)

        exec_with_vals(self.conf.post_part)
        self.log.info("Created table: %s", dst)

    def real_copy(self, tablename, src_curs, dst_curs, column_list):
        """Do actual table copy via skytools.full_copy and return its
        result (per the original note: tuple with number of bytes and
        rows copied).

        skip_fields and field_map are applied to column_list to build
        the source and destination column lists.
        """
        _src_cols = _dst_cols = column_list
        condition = ''

        if self.conf.skip_fields:
            _src_cols = [col for col in column_list
                         if col not in self.conf.skip_fields]
            _dst_cols = _src_cols

        if self.conf.field_map:
            _src_cols = [col for col in _src_cols
                         if col in self.conf.field_map]
            _dst_cols = [self.conf.field_map[col] for col in _src_cols]

        if self.encoding_validator:
            def _write_hook(obj, data):
                return self.encoding_validator.validate_copy(
                    data, _src_cols, tablename)
        else:
            _write_hook = None

        return skytools.full_copy(tablename, src_curs, dst_curs,
                                  _src_cols, condition,
                                  dst_tablename=self.dest_table,
                                  dst_column_list=_dst_cols,
                                  write_hook=_write_hook)
class Dispatcher(BaseHandler):
    """Partitioned loader.

    Splits events into partitions, if requested.
    Then applies them without further processing.
    """
    handler_name = 'dispatch'

    def __init__(self, table_name, args, dest_table):
        """Initialise handler state and parse the handler configuration.

        table_name -- source table name, passed on to BaseHandler.__init__
        args       -- mapping of handler arguments; a 'table' entry, when
                      present, overrides dest_table
        dest_table -- destination table used when args has no 'table' entry
        """
        # compat for dest-table
        dest_table = args.get('table', dest_table)

        BaseHandler.__init__(self, table_name, args, dest_table)

        # show args (formatting deferred to the logger)
        self.log.debug("dispatch.init: table_name=%r, args=%r",
                       table_name, args)

        self.batch_info = None  # set by prepare_batch()
        self.dst_curs = None    # set by prepare_batch()
        self.pkeys = None       # set on first processed event

        # config
        self.conf = self.get_config()
        hdlr_cls = ROW_HANDLERS[self.conf.row_mode]
        self.row_handler = hdlr_cls(self.log)
        if self.conf.encoding:
            self.encoding_validator = EncodingValidator(
                self.log, self.conf.encoding)
        else:
            self.encoding_validator = None

    def _parse_args_from_doc(self):
        """Extract handler argument descriptions from the module docstring.

        Scans the lines after the '== HANDLER ARGUMENTS ==' marker up to
        the next line starting with '=='.  A line of the form 'name:'
        starts a new entry; following lines are appended to the text of
        the most recent entry.

        Returns a list of (name, expr, text) tuples; expr is always ''.
        Returns an empty list when the module has no docstring.
        """
        # a module without docstring has __doc__ set to None
        doc = __doc__ or ""
        params_descr = []
        params_found = False
        for line in doc.splitlines():
            ln = line.strip()
            if params_found:
                if ln.startswith("=="):
                    break
                # raw string: '\w' is not a valid str escape sequence
                m = re.match(r"^(\w+):$", ln)
                if m:
                    name = m.group(1)
                    expr = text = ""
                elif not params_descr:
                    # text before the first 'name:' line is ignored
                    continue
                else:
                    name, expr, text = params_descr.pop()
                    text += ln + "\n"
                params_descr.append((name, expr, text))
            elif ln == "== HANDLER ARGUMENTS ==":
                params_found = True
        return params_descr

    def get_config(self):
        """Process the args dict and return the resulting config object.

        Raises Exception on an unsupported argument value, on an unknown
        event type, or when part_mode is 'date_field' without part_field.
        """
        conf = skytools.dbdict()

        # set table mode
        conf.table_mode = self.get_arg('table_mode', TABLE_MODES)
        conf.analyze = self.get_arg('analyze', [0, 1])
        if conf.table_mode == 'part':
            conf.part_mode = self.get_arg('part_mode', PART_MODES)
            conf.part_field = self.args.get('part_field')
            if conf.part_mode == 'date_field' and not conf.part_field:
                raise Exception('part_mode date_field requires part_field!')
            conf.period = self.get_arg('period', PERIODS)
            conf.part_name = self.args.get('part_name')
            conf.part_template = self.args.get('part_template')
            conf.pre_part = self.args.get('pre_part')
            conf.post_part = self.args.get('post_part')
            conf.part_func = self.args.get('part_func', PART_FUNC_NEW)

        # set row mode and event types to process
        conf.row_mode = self.get_arg('row_mode', ROW_MODES)
        event_types = self.args.get('event_types', '*')
        if event_types == '*':
            event_types = EVENT_TYPES
        else:
            event_types = [evt.upper() for evt in event_types.split(',')]
            for evt in event_types:
                if evt not in EVENT_TYPES:
                    raise Exception('Unsupported operation: %s' % evt)
        conf.event_types = event_types

        # set load handler
        conf.load_mode = self.get_arg('load_mode', LOAD_MODES)
        conf.method = self.get_arg('method', METHODS)

        # fields to skip; empty entries are dropped because ''.split(',')
        # returns [''], which would make the list truthy even when no
        # skip_fields were configured
        skip_names = [
            f.strip().lower()
            for f in self.args.get('skip_fields', '').split(',')
        ]
        conf.skip_fields = [name for name in skip_names if name]

        # get fields map (obsolete, for compatibility reasons)
        # format: "src1:dst1,src2" - a bare name maps to itself
        fields = self.args.get('fields', '*')
        if fields == "*":
            conf.field_map = None
        else:
            conf.field_map = {}
            for fval in fields.split(','):
                tmp = fval.split(':')
                if len(tmp) == 1:
                    conf.field_map[tmp[0]] = tmp[0]
                else:
                    conf.field_map[tmp[0]] = tmp[1]

        # encoding validator
        conf.encoding = self.args.get('encoding')
        return conf

    def get_arg(self, name, value_list, default=None):
        """Return handler argument `name`, validated against value_list.

        When default is None the first element of value_list is the
        default.  The raw argument is converted to the type of the
        default before validation.

        Raises Exception when the converted value is not in value_list.
        """
        # 'is None' instead of 'or': a falsy default (0, '') is a valid
        # explicit choice and must not be replaced by value_list[0]
        if default is None:
            default = value_list[0]
        val = type(default)(self.args.get(name, default))
        if val not in value_list:
            raise Exception('Bad argument %s value %r' % (name, val))
        return val

    def reset(self):
        """Called before starting to process a batch.
        Should clean any pending data."""
        BaseHandler.reset(self)

    def prepare_batch(self, batch_info, dst_curs):
        """Called on first event for this table in current batch.

        Remembers batch_info and dst_curs unless table_mode is 'ignore'.
        """
        if self.conf.table_mode != 'ignore':
            self.batch_info = batch_info
            self.dst_curs = dst_curs
        #BaseHandler.prepare_batch(self, batch_info, dst_curs)

    def filter_data(self, data):
        """Apply skip_fields and field_map to a row dict and return the
        resulting dict."""
        fskip = self.conf.skip_fields
        fmap = self.conf.field_map
        if fskip:
            data = dict((k, v) for k, v in data.items()
                        if k not in fskip)
        if fmap:
            # when field name not present in source is used then None (NULL)
            # value is inserted. is it ok?
            data = dict((v, data.get(k)) for k, v in fmap.items())
        return data

    def filter_pkeys(self, pkeys):
        """Apply skip_fields and field_map to a list of primary key names
        and return the resulting list."""
        fskip = self.conf.skip_fields
        fmap = self.conf.field_map
        if fskip:
            pkeys = [f for f in pkeys if f not in fskip]
        if fmap:
            # keys missing from the map are dropped, the rest are renamed
            pkeys = [fmap[p] for p in pkeys if p in fmap]
        return pkeys

    def process_event(self, ev, sql_queue_func, arg):
        """Process an event.

        Event should be added to sql_queue or executed directly.
        Here the row is handed to the row handler; sql_queue_func and
        arg are not used.

        Raises Exception when the event type is not of the form
        '<op>:<pkeys>' with op being one of I, U, D.
        """
        if self.conf.table_mode == 'ignore':
            return

        # get data
        data = skytools.db_urldecode(ev.data)
        if self.encoding_validator:
            data = self.encoding_validator.validate_dict(data, self.table_name)
        if len(ev.ev_type) < 2 or ev.ev_type[1] != ':':
            raise Exception('Unsupported event type: %s/extra1=%s/data=%s' % (
                ev.ev_type, ev.ev_extra1, ev.ev_data))
        op, pkeys = ev.type.split(':', 1)
        if op not in 'IUD':
            raise Exception('Unknown event type: %s' % ev.ev_type)

        # process only operations specified
        if op not in self.conf.event_types:
            return
        self.log.debug('dispatch.process_event: %s/%s',
                       ev.ev_type, ev.ev_data)

        # primary keys are taken from the first processed event only
        if self.pkeys is None:
            self.pkeys = self.filter_pkeys(pkeys.split(','))
        data = self.filter_data(data)

        # prepare split table when needed
        if self.conf.table_mode == 'part':
            dst, part_time = self.split_format(ev, data)
            if dst not in self.row_handler.table_map:
                self.check_part(dst, part_time)
        else:
            dst = self.dest_table

        # register destination with the row handler on first use
        if dst not in self.row_handler.table_map:
            self.row_handler.add_table(dst, LOADERS[self.conf.load_mode],
                                       self.pkeys, self.conf)
        self.row_handler.process(dst, op, data)
        #BaseHandler.process_event(self, ev, sql_queue_func, arg)

    def finish_batch(self, batch_info, dst_curs):
        """Called when batch finishes.

        Flushes the row handler to dst_curs unless table_mode is 'ignore'.
        """
        if self.conf.table_mode != 'ignore':
            self.row_handler.flush(dst_curs)
        #BaseHandler.finish_batch(self, batch_info, dst_curs)

    def get_part_name(self):
        """Return the %-style template used to build part table names.

        Without a custom part_name the template is 'parent' followed by
        the date parts up to and including the configured period, joined
        with '_' (e.g. period 'month' gives
        '%(parent)s_%(year)s_%(month)s').
        """
        # if custom part name template given, use it
        if self.conf.part_name:
            return self.conf.part_name
        parts = ['year', 'month', 'day', 'hour']
        name_parts = ['parent'] + parts[:parts.index(self.conf.period) + 1]
        return '_'.join('%%(%s)s' % part for part in name_parts)

    def split_format(self, ev, data):
        """Generate the part table name from the name template.

        Returns a tuple (part_table_name, part_time) where part_time is
        the timestamp selected according to part_mode.

        Raises Exception when part_mode is 'date_field' and the field
        value is None; raises UsageError on an unknown part_mode.
        """
        if self.conf.part_mode == 'batch_time':
            dtm = self.batch_info['batch_end']
        elif self.conf.part_mode == 'event_time':
            dtm = ev.ev_time
        elif self.conf.part_mode == 'current_time':
            dtm = datetime.datetime.now()
        elif self.conf.part_mode == 'date_field':
            dt_str = data[self.conf.part_field]
            if dt_str is None:
                raise Exception('part_field(%s) is NULL: %s' % (
                    self.conf.part_field, ev))
            # only the first 19 chars ('YYYY-MM-DD HH:MM:SS') are parsed;
            # anything after the seconds is ignored
            dtm = datetime.datetime.strptime(dt_str[:19], "%Y-%m-%d %H:%M:%S")
        else:
            raise UsageError('Bad value for part_mode: %s' %
                             self.conf.part_mode)

        vals = {
            'parent': self.dest_table,
            'year': "%04d" % dtm.year,
            'month': "%02d" % dtm.month,
            'day': "%02d" % dtm.day,
            'hour': "%02d" % dtm.hour,
        }
        return (self.get_part_name() % vals, dtm)

    def check_part(self, dst, part_time):
        """Create part table if not exists.

        If part_template present, execute it
        else if part function present in db, call it
        else clone master table.

        pre_part and post_part templates, when configured, are executed
        before and after the creation step.
        """
        curs = self.dst_curs
        if skytools.exists_table(curs, dst):
            return

        dst = quote_fqident(dst)
        vals = {
            'dest': dst,
            'part': dst,
            'parent': self.fq_dest_table,
            'pkeys': ",".join(self.pkeys),  # quoting?
            # we do this to make sure that constraints for
            # tables who contain a schema will still work
            'schema_table': dst.replace(".", "__"),
            'part_field': self.conf.part_field,
            'part_time': part_time,
            'period': self.conf.period,
        }

        def exec_with_vals(tmpl):
            """Execute SQL template with vals; return True if it ran."""
            if tmpl:
                # NOTE: values are interpolated textually into the
                # configured template before execution
                sql = tmpl % vals
                curs.execute(sql)
                return True
            return False

        exec_with_vals(self.conf.pre_part)

        if not exec_with_vals(self.conf.part_template):
            self.log.debug('part_template not provided, using part func')
            # if part func exists call it with val arguments
            pfargs = ', '.join('%%(%s)s' % arg for arg in PART_FUNC_ARGS)

            # set up configured function
            pfcall = 'select %s(%s)' % (self.conf.part_func, pfargs)
            have_func = skytools.exists_function(
                curs, self.conf.part_func, len(PART_FUNC_ARGS))

            # backwards compat
            if not have_func and self.conf.part_func == PART_FUNC_NEW:
                pfcall = 'select %s(%s)' % (PART_FUNC_OLD, pfargs)
                have_func = skytools.exists_function(
                    curs, PART_FUNC_OLD, len(PART_FUNC_ARGS))

            if have_func:
                self.log.debug('check_part.exec: func:%s, args: %s',
                               pfcall, vals)
                curs.execute(pfcall, vals)
            else:
                #
                # Otherwise create simple clone.
                #
                # FixMe: differences from create_partitions():
                # - check constraints
                # - inheritance
                #
                self.log.debug('part func %s not found, cloning table',
                               self.conf.part_func)
                struct = TableStruct(curs, self.dest_table)
                struct.create(curs, T_ALL, dst)

        exec_with_vals(self.conf.post_part)
        self.log.info("Created table: %s", dst)

    def real_copy(self, tablename, src_curs, dst_curs, column_list):
        """Do actual table copy via skytools.full_copy and return its
        result (per the original note: tuple with number of bytes and
        rows copied).

        skip_fields and field_map are applied to column_list to build
        the source and destination column lists.
        """
        _src_cols = _dst_cols = column_list
        condition = ''

        if self.conf.skip_fields:
            _src_cols = [col for col in column_list
                         if col not in self.conf.skip_fields]
            _dst_cols = _src_cols

        if self.conf.field_map:
            _src_cols = [col for col in _src_cols
                         if col in self.conf.field_map]
            _dst_cols = [self.conf.field_map[col] for col in _src_cols]

        if self.encoding_validator:
            def _write_hook(obj, data):
                return self.encoding_validator.validate_copy(
                    data, _src_cols, tablename)
        else:
            _write_hook = None

        return skytools.full_copy(tablename, src_curs, dst_curs,
                                  _src_cols, condition,
                                  dst_tablename=self.dest_table,
                                  dst_column_list=_dst_cols,
                                  write_hook=_write_hook)