def register(target_model, options):
    """Wire up denormalization for target_model based on the 'sources' config.

    For each configured source this connects pre/post save signal receivers on
    both the target and the source model, creates the denormalized fields on
    the target (or a shared ``denorm_data`` JSONField), and records the
    relationships in ``core.TARGET_GRAPH`` / ``core.SOURCE_GRAPH`` for the
    signal receivers to consult later.

    Args:
        target_model: Django model that stores the denormalized copies.
        options: dict with a 'sources' mapping of source field name ->
            per-source configuration ('fields', 'strategy', 'storage',
            'model', 'label', 'throttles', 'shards', ...).

    Raises:
        AttributeError: if target_model is not a Model subclass, or any
            source/target field configuration is invalid.
        FieldDoesNotExist: if a configured field is missing on a model.
    """
    logging.info('[denorm.register] %s' % target_model)
    if not hasattr(target_model, '_meta'):
        raise AttributeError('The model being registered must derive from Model.')
    target = util.get_model_name(target_model)
    target_options = target_model._meta
    # register signals for target. use dispatch_uid to prevent duplicates.
    # about signals: https://docs.djangoproject.com/en/1.8/topics/signals/
    # built-in signals: https://docs.djangoproject.com/en/1.8/ref/signals/
    db_signals.post_init.connect(receivers.target_model_post_init, sender=target_model,
                                 dispatch_uid='denorm_target_%s_post_init' % target)
    db_signals.pre_save.connect(receivers.target_model_pre_save, sender=target_model,
                                dispatch_uid='denorm_target_%s_pre_save' % target)
    db_signals.post_save.connect(receivers.target_model_post_save, sender=target_model,
                                 dispatch_uid='denorm_target_%s_post_save' % target)
    target_graph = core.TARGET_GRAPH[target_model] = core.TARGET_GRAPH.get(target_model, {})
    for source, source_dict in options['sources'].iteritems():
        strategy = source_dict.get('strategy', 'cursor')  # options are: [cursor, mapreduce]. defaults to cursor.
        # TODO: support storage options 'list' and 'dict'
        storage = source_dict.get('storage', 'scalar')  # choices: [scalar, shared_dict]
        if storage == 'scalar':
            target_foreign_key = target_options.get_field(source)
            # if field did not exist, then get_field would have raised FieldDoesNotExist
            if not isinstance(target_foreign_key, ForeignKey):
                raise AttributeError('The source field %s.%s must be a ForeignKey' % (target, source))
            source_model = target_foreign_key.rel.to
        elif storage == 'shared_dict':
            list_field_name = Inflector().pluralize(source)
            target_foreign_key_list = target_options.get_field(list_field_name)
            # if field did not exist, then get_field would have raised FieldDoesNotExist
            if not isinstance(target_foreign_key_list, tb_fields.ListField):
                # BUG FIX: message previously named the singular source field,
                # not the pluralized list field that was actually inspected.
                raise AttributeError('The target field %s.%s must be a ListField' % (target, list_field_name))
            # model must be explicitly configured, because target field does not specify it
            source_model = source_dict.get('model')
            if source_model is None:
                # fail fast with a clear message instead of an obscure
                # AttributeError on source_model._meta below
                raise AttributeError('shared_dict storage for %s.%s requires an explicit source "model"' % (target, source))
            # create denorm data field
            try:
                target_options.get_field('denorm_data')
            except FieldDoesNotExist:
                # field should not exist. now let's create it.
                denorm_data_field = JSONField(name='denorm_data', null=True, blank=True,
                                              decoder_kwargs={'cls': json_fields.JSONDecoder, 'parse_float': float})
                denorm_data_field.contribute_to_class(target_model, 'denorm_data')
            else:
                # field was already created on prior source field
                # TODO: do at beginning of target model configuration to make sure developer did not define it
                pass
        else:
            # BUG FIX: previously this branch only logged the error and fell
            # through, which either raised NameError on source_model (first
            # iteration) or silently reused the previous iteration's
            # source_model. Raise like the other configuration errors above.
            raise AttributeError('[denorm.register] invalid storage option %s' % storage)
        source_options = source_model._meta
        # register signals for source. use dispatch_uid to prevent duplicates.
        db_signals.post_init.connect(receivers.source_model_post_init, sender=source_model,
                                     dispatch_uid='denorm_source_%s_post_init' % source)
        db_signals.pre_save.connect(receivers.source_model_pre_save, sender=source_model,
                                    dispatch_uid='denorm_source_%s_pre_save' % source)
        db_signals.post_save.connect(receivers.source_model_post_save, sender=source_model,
                                     dispatch_uid='denorm_source_%s_post_save' % source)
        # FIXME: it's quirky that label and throttles must be configured under each target-source in app's denorm_fields,
        # FIXME: but it gets applied here for entire source (not target dependent). it probably should be configured once
        # FIXME: per source, but how do accomplish that in the current configuration design?
        source_graph = core.SOURCE_GRAPH[source_model] = core.SOURCE_GRAPH.get(source_model, {
            'label': source_dict.get('label'),
            'throttles': source_dict.get('throttles'),
            'fields': {}
        })
        source_graph_fields = source_graph['fields']
        # mark model as registered for denormalization
        source_model._denorm_registered = True
        # clone list, so that if we add _id below, it doesn't corrupt original list
        denorm_field_names = list(source_dict['fields'])
        target_graph[source] = {
            'fields': denorm_field_names,
            'storage': storage,
            'source_model': source_model  # important for shared_dict storage, because we don't know source model based on list field
        }
        for i, denorm_field_name in enumerate(denorm_field_names):
            source_field = source_options.get_field(denorm_field_name)
            # if field did not exist, then get_field would have raised FieldDoesNotExist
            target_field_name = '%s_%s' % (source, denorm_field_name)
            if storage == 'scalar':
                try:
                    target_options.get_field(target_field_name)
                except FieldDoesNotExist:
                    # field should not exist, so we're good
                    pass
                else:
                    raise AttributeError('The denorm field %s.%s must not already exist'
                                         % (target_model.__name__, target_field_name))
                # create target field of same type as source_field
                target_field = _copy_field(source_field, target_field_name, target)
                target_field.contribute_to_class(target_model, target_field_name)
            else:
                assert storage == 'shared_dict'
                # denorm_data field was already created outside this iteration loop
            # if source field is a foreign key, then we reference its key rather than the actual related field,
            # because we are not deferencing further than the key, and do not want to do an extra db lookup.
            if isinstance(source_field, ForeignKey):
                denorm_field_name += '_id'
                denorm_field_names[i] = denorm_field_name
            source_field_graph = source_graph_fields[denorm_field_name] = source_graph_fields.get(denorm_field_name, [])
            source_field_graph.append({
                'target_model': target_model,
                'source': source,
                'strategy': strategy,
                'storage': storage,
                'shards': source_dict.get('shards') and util.convert_func_to_string(source_dict['shards'])
            })
def source_model_post_save(sender, instance, created, **kwargs):
    """post_save receiver for source models: enqueue denormalization tasks.

    For every target affected by this save (computed by source_model_pre_save
    into ``instance._denorm_affected_targets``) this creates one pull-queue
    task describing the fields to propagate, then creates a single Task model
    row for this source save (used for throttling bookkeeping), and finally
    re-runs post_init so a subsequent save of the same instance diffs against
    the freshly-saved values.
    """
    # for clarity
    source_model = sender
    source_instance = instance
    # populated by source_model_pre_save; empty dict means nothing changed
    affected_targets = source_instance._denorm_affected_targets
    if not affected_targets:
        # nothing to denorm
        return
    #
    # create a task for each affected target to update its instances
    #
    for target_model, affected_target in affected_targets.iteritems():
        # if storage is shared_dict, then task will pluralize related_field_name to get target model's list field
        related_field_name = affected_target['related']
        strategy = affected_target['strategy']
        storage = affected_target['storage']
        shards = affected_target['shards']
        affected_fields = affected_target['fields']
        #logging.info('affected target %s.%s for source %s: %s' % (target_model, related_field_name, source_model, affected_fields))
        # for each affected target, create a separate task
        instance_id = source_instance.id
        # tag lets workers lease all tasks for one source instance + target pair together
        tag = 'DENORM_SOURCE_%s_%s_TARGET_%s' % (util.get_model_name(source_model), instance_id,
                                                 util.get_model_name(target_model))
        # JSON-serializable task payload consumed by the denorm worker
        payload = {
            'created': timezone.now().isoformat(),
            'strategy': strategy,
            'storage': storage,
            'instance_id': instance_id,
            'source_model': util.get_model_name(source_model),
            'target_model': util.get_model_name(target_model),
            'related_field': related_field_name,
            'fields': affected_fields,
            # TODO: queue name should be configurable
            'queue_name': 'denorm'
        }
        if strategy == 'mapreduce':
            # 'shards' is a dotted handler name (stringified at register time);
            # resolve and call it to compute the shard count for this instance,
            # otherwise fall back to the module default
            payload['shards'] = handler_for_name(shards)(source_instance) if shards else DEFAULT_MAP_REDUCE_SHARDS
        payload_string = util.dump_json(payload)
        logging.info('[denorm source_model_post_save] queue task payload = %s' % payload_string)
        # create a pull task per target
        taskqueue.Queue('pull-denorm').add(
            taskqueue.Task(payload=payload_string, tag=tag, method='PULL')
        )
    # create ** one ** Task model instance used to track denorm tasks per source, particularly for throttling.
    # _denorm_user / _denorm_label were stashed on the instance by source_model_pre_save.
    models.get_task_model().objects.create(
        source_model=util.get_model_name(source_model),
        source_instance_id=source_instance.id,
        user=source_instance._denorm_user,
        label=source_instance._denorm_label
    )
    # re-run post_init to reset _denorm_orig_values in case this instance gets saved again
    source_model_post_init(source_model, source_instance)
def source_model_pre_save(sender, instance, raw, using, update_fields, **kwargs):
    """pre_save receiver for source models: compute affected denorm targets.

    Diffs the instance's current field values against the values captured at
    post_init (``_denorm_orig_values``) and stores the distinct affected
    targets on ``instance._denorm_affected_targets`` for
    source_model_post_save to act on. Also resolves the throttling user and
    label (stashed on the instance) and enforces configured rate limits.

    Raises:
        exceptions.DenormThrottled: when a throttle threshold is reached.
    """
    # for clarity
    source_model = sender
    source_instance = instance
    created = not source_instance.id
    source_instance._denorm_affected_targets = affected_targets = {}
    # newly created instances will not need denormalization
    if created:
        return
    # denorm turned off
    if not getattr(source_instance, '_denorm', True):
        return
    source_graph = core.SOURCE_GRAPH[source_model]
    #
    # iterate through all fields to build up set of distinct affected targets that post_save signal receiver will process.
    #
    source_graph_fields = source_graph['fields']
    orig_values = source_instance._denorm_orig_values
    for source_field, targets in source_graph_fields.iteritems():
        old_value = orig_values[source_field]
        new_value = getattr(source_instance, source_field)
        if old_value != new_value:
            #logging.info('[%s] %s value changed from "%s" to "%s"' % (source_model, source_field, old_value, new_value))
            for target in targets:
                target_model = target['target_model']
                related_field_name = target['source']
                storage = target['storage']
                affected_targets[target_model] = affected_target = affected_targets.get(target_model, {
                    'related': related_field_name,
                    'strategy': target['strategy'],
                    'storage': storage,
                    'shards': target['shards'],
                    'fields': {}
                })
                # when task will update target, if storage is scalar, then field name is simply target model field name.
                # and if storage is shared_dict, then the field name is the dictionary key of the target model's denorm_data field.
                affected_fields_for_target = affected_target['fields']
                affected_fields_for_target['%s_%s' % (related_field_name, source_field)] = new_value
    if not affected_targets:
        return
    #
    # check that denorm throttling threshold is not exceeded
    #
    # get user from thread-local variable set by middleware
    user = middleware.get_current_user()
    if not user or not isinstance(user, get_user_model()) or not user.is_authenticated() or user.is_superuser:
        user = None
    source_instance._denorm_user = user
    # get denorm label used for throttling.
    # BUG FIX: this previously tested `'label' in source_graph`, but register()
    # always stores the 'label' key (possibly None), so the membership test was
    # always true and a source without a custom label crashed calling None.
    # Test the value instead so the default-label branch is reachable.
    label_func = source_graph.get('label')
    if label_func:
        # custom label set by application
        label = label_func(source_instance, user)
    elif user:
        # default label
        label = '%s_%s' % (util.get_model_name(source_model), str(user.id))
    else:
        # no label
        label = None
    source_instance._denorm_label = label
    throttles = source_graph.get('throttles')
    if not label or not throttles:
        # no throttling
        return
    # now validate each throttle
    # FIXME: we need to figure out if there is already a denorm task scheduled, and if so, then don't penalize throttle.
    # FIXME: perhaps we can use a Task.status field in combination with filter for source instance id.
    now = timezone.now()
    for throttle in throttles:
        num_requests, duration = util.parse_rate(throttle)
        # FIXME: this is a naive, inefficient implementation. we should cache task counts.
        if models.get_task_model().objects.filter(
                label=label,
                created__gt=now - timedelta(seconds=duration)).count() >= num_requests:
            raise exceptions.DenormThrottled
def register(target_model, options):
    """Wire up denormalization for target_model based on the 'sources' config.

    For each configured source this connects pre/post save signal receivers on
    both the target and the source model, creates the denormalized fields on
    the target (or a shared ``denorm_data`` JSONField), and records the
    relationships in ``core.TARGET_GRAPH`` / ``core.SOURCE_GRAPH`` for the
    signal receivers to consult later.

    Args:
        target_model: Django model that stores the denormalized copies.
        options: dict with a 'sources' mapping of source field name ->
            per-source configuration ('fields', 'strategy', 'storage',
            'model', 'label', 'throttles', 'shards', ...).

    Raises:
        AttributeError: if target_model is not a Model subclass, or any
            source/target field configuration is invalid.
        FieldDoesNotExist: if a configured field is missing on a model.
    """
    logging.info('[denorm.register] %s' % target_model)
    if not hasattr(target_model, '_meta'):
        raise AttributeError('The model being registered must derive from Model.')
    target = util.get_model_name(target_model)
    target_options = target_model._meta
    # register signals for target. use dispatch_uid to prevent duplicates.
    # about signals: https://docs.djangoproject.com/en/1.8/topics/signals/
    # built-in signals: https://docs.djangoproject.com/en/1.8/ref/signals/
    db_signals.post_init.connect(receivers.target_model_post_init, sender=target_model,
                                 dispatch_uid='denorm_target_%s_post_init' % target)
    db_signals.pre_save.connect(receivers.target_model_pre_save, sender=target_model,
                                dispatch_uid='denorm_target_%s_pre_save' % target)
    db_signals.post_save.connect(receivers.target_model_post_save, sender=target_model,
                                 dispatch_uid='denorm_target_%s_post_save' % target)
    target_graph = core.TARGET_GRAPH[target_model] = core.TARGET_GRAPH.get(target_model, {})
    for source, source_dict in options['sources'].iteritems():
        strategy = source_dict.get('strategy', 'cursor')  # options are: [cursor, mapreduce]. defaults to cursor.
        # TODO: support storage options 'list' and 'dict'
        storage = source_dict.get('storage', 'scalar')  # choices: [scalar, shared_dict]
        if storage == 'scalar':
            target_foreign_key = target_options.get_field(source)
            # if field did not exist, then get_field would have raised FieldDoesNotExist
            if not isinstance(target_foreign_key, ForeignKey):
                raise AttributeError('The source field %s.%s must be a ForeignKey' % (target, source))
            source_model = target_foreign_key.rel.to
        elif storage == 'shared_dict':
            list_field_name = Inflector().pluralize(source)
            target_foreign_key_list = target_options.get_field(list_field_name)
            # if field did not exist, then get_field would have raised FieldDoesNotExist
            if not isinstance(target_foreign_key_list, tb_fields.ListField):
                # BUG FIX: message previously named the singular source field,
                # not the pluralized list field that was actually inspected.
                raise AttributeError('The target field %s.%s must be a ListField' % (target, list_field_name))
            # model must be explicitly configured, because target field does not specify it
            source_model = source_dict.get('model')
            if source_model is None:
                # fail fast with a clear message instead of an obscure
                # AttributeError on source_model._meta below
                raise AttributeError('shared_dict storage for %s.%s requires an explicit source "model"' % (target, source))
            # create denorm data field
            try:
                target_options.get_field('denorm_data')
            except FieldDoesNotExist:
                # field should not exist. now let's create it.
                denorm_data_field = JSONField(name='denorm_data', null=True, blank=True,
                                              decoder_kwargs={'cls': json_fields.JSONDecoder, 'parse_float': float})
                denorm_data_field.contribute_to_class(target_model, 'denorm_data')
            else:
                # field was already created on prior source field
                # TODO: do at beginning of target model configuration to make sure developer did not define it
                pass
        else:
            # BUG FIX: previously this branch only logged the error and fell
            # through, which either raised NameError on source_model (first
            # iteration) or silently reused the previous iteration's
            # source_model. Raise like the other configuration errors above.
            raise AttributeError('[denorm.register] invalid storage option %s' % storage)
        source_options = source_model._meta
        # register signals for source. use dispatch_uid to prevent duplicates.
        db_signals.post_init.connect(receivers.source_model_post_init, sender=source_model,
                                     dispatch_uid='denorm_source_%s_post_init' % source)
        db_signals.pre_save.connect(receivers.source_model_pre_save, sender=source_model,
                                    dispatch_uid='denorm_source_%s_pre_save' % source)
        db_signals.post_save.connect(receivers.source_model_post_save, sender=source_model,
                                     dispatch_uid='denorm_source_%s_post_save' % source)
        # FIXME: it's quirky that label and throttles must be configured under each target-source in app's denorm_fields,
        # FIXME: but it gets applied here for entire source (not target dependent). it probably should be configured once
        # FIXME: per source, but how do accomplish that in the current configuration design?
        source_graph = core.SOURCE_GRAPH[source_model] = core.SOURCE_GRAPH.get(source_model, {
            'label': source_dict.get('label'),
            'throttles': source_dict.get('throttles'),
            'fields': {}
        })
        source_graph_fields = source_graph['fields']
        # mark model as registered for denormalization
        source_model._denorm_registered = True
        # clone list, so that if we add _id below, it doesn't corrupt original list
        denorm_field_names = list(source_dict['fields'])
        target_graph[source] = {
            'fields': denorm_field_names,
            'storage': storage,
            'source_model': source_model  # important for shared_dict storage, because we don't know source model based on list field
        }
        for i, denorm_field_name in enumerate(denorm_field_names):
            source_field = source_options.get_field(denorm_field_name)
            # if field did not exist, then get_field would have raised FieldDoesNotExist
            target_field_name = '%s_%s' % (source, denorm_field_name)
            if storage == 'scalar':
                try:
                    target_options.get_field(target_field_name)
                except FieldDoesNotExist:
                    # field should not exist, so we're good
                    pass
                else:
                    raise AttributeError('The denorm field %s.%s must not already exist'
                                         % (target_model.__name__, target_field_name))
                # create target field of same type as source_field
                target_field = _copy_field(source_field, target_field_name, target)
                target_field.contribute_to_class(target_model, target_field_name)
            else:
                assert storage == 'shared_dict'
                # denorm_data field was already created outside this iteration loop
            # if source field is a foreign key, then we reference its key rather than the actual related field,
            # because we are not deferencing further than the key, and do not want to do an extra db lookup.
            if isinstance(source_field, ForeignKey):
                denorm_field_name += '_id'
                denorm_field_names[i] = denorm_field_name
            source_field_graph = source_graph_fields[denorm_field_name] = source_graph_fields.get(denorm_field_name, [])
            source_field_graph.append({
                'target_model': target_model,
                'source': source,
                'strategy': strategy,
                'storage': storage,
                'shards': source_dict.get('shards') and util.convert_func_to_string(source_dict['shards'])
            })
def source_model_post_save(sender, instance, created, **kwargs):
    """post_save receiver for source models: enqueue denormalization tasks.

    For every target affected by this save (computed by source_model_pre_save
    into ``instance._denorm_affected_targets``) this creates one pull-queue
    task describing the fields to propagate, then creates a single Task model
    row for this source save (used for throttling bookkeeping), and finally
    re-runs post_init so a subsequent save of the same instance diffs against
    the freshly-saved values.
    """
    # for clarity
    source_model = sender
    source_instance = instance
    # populated by source_model_pre_save; empty dict means nothing changed
    affected_targets = source_instance._denorm_affected_targets
    if not affected_targets:
        # nothing to denorm
        return
    #
    # create a task for each affected target to update its instances
    #
    for target_model, affected_target in affected_targets.iteritems():
        # if storage is shared_dict, then task will pluralize related_field_name to get target model's list field
        related_field_name = affected_target['related']
        strategy = affected_target['strategy']
        storage = affected_target['storage']
        shards = affected_target['shards']
        affected_fields = affected_target['fields']
        #logging.info('affected target %s.%s for source %s: %s' % (target_model, related_field_name, source_model, affected_fields))
        # for each affected target, create a separate task
        instance_id = source_instance.id
        # tag lets workers lease all tasks for one source instance + target pair together
        tag = 'DENORM_SOURCE_%s_%s_TARGET_%s' % (util.get_model_name(source_model), instance_id,
                                                 util.get_model_name(target_model))
        # JSON-serializable task payload consumed by the denorm worker
        payload = {
            'created': timezone.now().isoformat(),
            'strategy': strategy,
            'storage': storage,
            'instance_id': instance_id,
            'source_model': util.get_model_name(source_model),
            'target_model': util.get_model_name(target_model),
            'related_field': related_field_name,
            'fields': affected_fields,
            # TODO: queue name should be configurable
            'queue_name': 'denorm'
        }
        if strategy == 'mapreduce':
            # 'shards' is a dotted handler name (stringified at register time);
            # resolve and call it to compute the shard count for this instance,
            # otherwise fall back to the module default
            payload['shards'] = handler_for_name(shards)(source_instance) if shards else DEFAULT_MAP_REDUCE_SHARDS
        payload_string = util.dump_json(payload)
        logging.info('[denorm source_model_post_save] queue task payload = %s' % payload_string)
        # create a pull task per target
        taskqueue.Queue('pull-denorm').add(
            taskqueue.Task(payload=payload_string, tag=tag, method='PULL')
        )
    # create ** one ** Task model instance used to track denorm tasks per source, particularly for throttling.
    # _denorm_user / _denorm_label were stashed on the instance by source_model_pre_save.
    models.get_task_model().objects.create(
        source_model=util.get_model_name(source_model),
        source_instance_id=source_instance.id,
        user=source_instance._denorm_user,
        label=source_instance._denorm_label
    )
    # re-run post_init to reset _denorm_orig_values in case this instance gets saved again
    source_model_post_init(source_model, source_instance)
def source_model_pre_save(sender, instance, raw, using, update_fields, **kwargs):
    """pre_save receiver for source models: compute affected denorm targets.

    Diffs the instance's current field values against the values captured at
    post_init (``_denorm_orig_values``) and stores the distinct affected
    targets on ``instance._denorm_affected_targets`` for
    source_model_post_save to act on. Also resolves the throttling user and
    label (stashed on the instance) and enforces configured rate limits.

    Raises:
        exceptions.DenormThrottled: when a throttle threshold is reached.
    """
    # for clarity
    source_model = sender
    source_instance = instance
    created = not source_instance.id
    source_instance._denorm_affected_targets = affected_targets = {}
    # newly created instances will not need denormalization
    if created:
        return
    # denorm turned off
    if not getattr(source_instance, '_denorm', True):
        return
    source_graph = core.SOURCE_GRAPH[source_model]
    #
    # iterate through all fields to build up set of distinct affected targets that post_save signal receiver will process.
    #
    source_graph_fields = source_graph['fields']
    orig_values = source_instance._denorm_orig_values
    for source_field, targets in source_graph_fields.iteritems():
        old_value = orig_values[source_field]
        new_value = getattr(source_instance, source_field)
        if old_value != new_value:
            #logging.info('[%s] %s value changed from "%s" to "%s"' % (source_model, source_field, old_value, new_value))
            for target in targets:
                target_model = target['target_model']
                related_field_name = target['source']
                storage = target['storage']
                affected_targets[target_model] = affected_target = affected_targets.get(target_model, {
                    'related': related_field_name,
                    'strategy': target['strategy'],
                    'storage': storage,
                    'shards': target['shards'],
                    'fields': {}
                })
                # when task will update target, if storage is scalar, then field name is simply target model field name.
                # and if storage is shared_dict, then the field name is the dictionary key of the target model's denorm_data field.
                affected_fields_for_target = affected_target['fields']
                affected_fields_for_target['%s_%s' % (related_field_name, source_field)] = new_value
    if not affected_targets:
        return
    #
    # check that denorm throttling threshold is not exceeded
    #
    # get user from thread-local variable set by middleware
    user = middleware.get_current_user()
    if not user or not isinstance(user, get_user_model()) or not user.is_authenticated() or user.is_superuser:
        user = None
    source_instance._denorm_user = user
    # get denorm label used for throttling.
    # BUG FIX: this previously tested `'label' in source_graph`, but register()
    # always stores the 'label' key (possibly None), so the membership test was
    # always true and a source without a custom label crashed calling None.
    # Test the value instead so the default-label branch is reachable.
    label_func = source_graph.get('label')
    if label_func:
        # custom label set by application
        label = label_func(source_instance, user)
    elif user:
        # default label
        label = '%s_%s' % (util.get_model_name(source_model), str(user.id))
    else:
        # no label
        label = None
    source_instance._denorm_label = label
    throttles = source_graph.get('throttles')
    if not label or not throttles:
        # no throttling
        return
    # now validate each throttle
    # FIXME: we need to figure out if there is already a denorm task scheduled, and if so, then don't penalize throttle.
    # FIXME: perhaps we can use a Task.status field in combination with filter for source instance id.
    now = timezone.now()
    for throttle in throttles:
        num_requests, duration = util.parse_rate(throttle)
        # FIXME: this is a naive, inefficient implementation. we should cache task counts.
        if models.get_task_model().objects.filter(
                label=label,
                created__gt=now - timedelta(seconds=duration)).count() >= num_requests:
            raise exceptions.DenormThrottled