def _execute(self):
    num_commands = len(self._commands)
    if num_commands == 0:
        self._results = []
        return

    pool = None
    for command in self._commands:
        if self._cluster.router:
            db_nums = self._cluster.router.get_db(self._cluster, command._attr, *command._args, **command._kwargs)
        else:
            db_nums = range(len(self._cluster))

        num_commands += len(db_nums)

        # Don't bother with the ThreadPool if we only need to do one operation
        if num_commands == 1:
            self._results = [getattr(self._cluster[db_num], command._attr)(*command._args, **command._kwargs)
                             for db_num in db_nums]
            return
        elif not pool:
            pool = ThreadPool(self._workers)

        for db_num in db_nums:
            pool.add(command._ident, getattr(self._cluster[db_num], command._attr), command._args, command._kwargs)

    result_map = pool.join()
    for command in self._commands:
        result = result_map[command._ident]
        if len(result) == 1:
            result = result[0]
        command._wrapped = result

    self._complete = True
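# The sketch below illustrates the same fan-out idea with the stdlib
# ThreadPoolExecutor (Python 3, or the `futures` backport on Python 2):
# each command is dispatched to every connection, and single-element
# result lists are collapsed just like in _execute above. The
# `connections` dict and `calls` tuples are hypothetical stand-ins for
# self._cluster and self._commands, not part of the nydus API.
from concurrent.futures import ThreadPoolExecutor


def fan_out(connections, calls, workers=4):
    # calls: iterable of (ident, attr, args, kwargs)
    futures = {}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for ident, attr, args, kwargs in calls:
            for conn in connections.values():
                futures.setdefault(ident, []).append(
                    pool.submit(getattr(conn, attr), *args, **kwargs))
        results = {}
        for ident, futs in futures.items():
            values = [f.result() for f in futs]
            # collapse single-host results, mirroring _execute
            results[ident] = values[0] if len(values) == 1 else values
    return results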
def generic_delete(self, model, dtfield, days=None, chunk_size=1000):
    if days is None:
        days = self.days

    cutoff = timezone.now() - timedelta(days=days)
    qs = model.objects.filter(**{'%s__lte' % (dtfield,): cutoff})
    if self.project:
        qs = qs.filter(project=self.project)

    # XXX: we step through because the deletion collector will pull all
    # relations into memory
    count = 0
    while qs.exists():
        # TODO(dcramer): change this to delete by chunks of IDs and utilize
        # bulk_delete_objects
        self.stdout.write("Removing {model} chunk {count}\n".format(
            model=model.__name__,
            count=count,
        ))
        if self.concurrency > 1:
            worker_pool = ThreadPool(workers=self.concurrency)
            for obj in qs[:chunk_size].iterator():
                worker_pool.add(obj.id, delete_object, [obj])
                count += 1
            worker_pool.join()
            del worker_pool
        else:
            for obj in qs[:chunk_size].iterator():
                delete_object(obj)
                count += 1
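# A minimal sketch of the "delete by chunks of IDs" TODO above, using only
# standard Django ORM calls: grab one chunk of primary keys, then issue a
# single bulk DELETE for that chunk instead of walking objects one at a
# time. The helper name is hypothetical; bulk_delete_objects itself is not
# reproduced here.
def delete_in_id_chunks(model, qs, chunk_size=1000):
    while True:
        id_list = list(qs.values_list('id', flat=True)[:chunk_size])
        if not id_list:
            break
        # one DELETE per chunk, bounded memory use
        model.objects.filter(id__in=id_list).delete()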
def _execute(self):
    num_commands = len(self._commands)
    if num_commands == 0:
        self._results = []
        return

    command_map = {}
    piped_dbs = defaultdict(list)
    pool = None

    # 0x00 build up a collection of pipes
    pipes = self._cluster.pipeline()

    # 0x01 run commands on all pipes
    for command in self._commands:
        command_map[command._ident] = command

        if self._cluster.router:
            db_nums = self._cluster.router.get_db(self._cluster, command._attr, *command._args, **command._kwargs)
        else:
            db_nums = range(len(self._cluster))

        for n in db_nums:
            # update the pipelined dbs
            piped_dbs[n].append(command._ident)
            # add to pipeline
            getattr(pipes[n], command._attr)(*command._args, **command._kwargs)

    # 0x02 execute pipes in thread pool
    pool = ThreadPool(self._workers)
    for db in piped_dbs:
        pool.add(db, pipes[db].execute, (), {})

    # 0x03 consolidate commands with their appropriate results
    result_map = pool.join()
    for db, result in result_map.iteritems():
        if len(result) == 1:
            result = result[0]
        for i, value in enumerate(result):
            command_map[piped_dbs[db][i]]._wrapped = value

    self._complete = True
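# A self-contained toy showing the de-interleaving trick used above: the
# i-th result of a db's pipeline belongs to the i-th ident queued against
# that db, because pipelines return results in queue order. FakePipe is a
# stand-in for a real pipeline object, not part of nydus.
from collections import defaultdict


class FakePipe(object):
    def __init__(self):
        self.ops = []

    def add(self, fn, *args):
        self.ops.append((fn, args))

    def execute(self):
        # results come back in queue order, which is what makes the
        # ident <-> index mapping valid
        return [fn(*args) for fn, args in self.ops]


piped_dbs = defaultdict(list)   # db -> [ident, ...]
pipes = defaultdict(FakePipe)   # db -> pipeline

for ident, db in [('a', 0), ('b', 0), ('c', 1)]:
    piped_dbs[db].append(ident)
    pipes[db].add(lambda i=ident: 'result-%s' % i)

results = {}
for db, pipe in pipes.items():
    for i, value in enumerate(pipe.execute()):
        results[piped_dbs[db][i]] = value

assert results == {'a': 'result-a', 'b': 'result-b', 'c': 'result-c'}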
def _execute(self):
    num_commands = len(self._commands)
    if num_commands == 0:
        self._commands = []
        return

    command_map = {}
    pipelined = all(self._cluster[n].supports_pipelines for n in self._cluster)
    pending_commands = defaultdict(list)

    # used in pipelining
    if pipelined:
        pipe_command_map = defaultdict(list)
        pipes = dict()  # db -> pipeline

    # build up a list of pending commands and their routing information
    for command in self._commands:
        cmd_ident = command._ident
        command_map[cmd_ident] = command

        if self._cluster.router:
            db_nums = self._cluster.router.get_db(self._cluster, command._attr, *command._args, **command._kwargs)
        else:
            db_nums = range(len(self._cluster))

        # The number of commands is based on the total number of executable commands
        num_commands += len(db_nums)

        # Don't bother with the pooling if we only need to do one operation on a single machine
        if num_commands == 1:
            self._commands = [command._execute(self._cluster[n]) for n in db_nums]
            return

        # update the pipelined dbs
        for db_num in db_nums:
            # map the ident to a db
            if pipelined:
                pipe_command_map[db_num].append(cmd_ident)

            # add to pending commands
            pending_commands[db_num].append(command)

    # Create the threadpool and pipe jobs into it
    pool = ThreadPool(min(self._workers, len(pending_commands)))

    # execute our pending commands either in the pool, or using a pipeline
    for db_num, command_list in pending_commands.iteritems():
        if pipelined:
            pipes[db_num] = self._cluster[db_num].get_pipeline()

        for command in command_list:
            if pipelined:
                # add to pipeline
                pipes[db_num].add(command)
            else:
                # execute in pool
                pool.add(command._ident, command._execute, [self._cluster[db_num]])

    # We need to finalize our commands with a single execute in pipelines
    if pipelined:
        for db, pipe in pipes.iteritems():
            pool.add(db, pipe.execute, (), {})

    # Consolidate commands with their appropriate results
    result_map = pool.join()

    # Results get grouped by their command signature, so we have to separate the logic
    if pipelined:
        for db, result in result_map.iteritems():
            if len(result) == 1:
                result = result[0]
            for i, value in enumerate(result):
                command_map[pipe_command_map[db][i]]._set_value(value)
    else:
        for command in self._commands:
            result = result_map[command._ident]
            if len(result) == 1:
                result = result[0]
            command._set_value(result)

    self._complete = True
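# Quick illustration of the capability gate above: pipelining is only
# enabled when every host in the cluster supports it; otherwise each
# command falls back to an individual thread-pool job. The Host class is
# a hypothetical stand-in for a cluster connection.
class Host(object):
    def __init__(self, supports_pipelines):
        self.supports_pipelines = supports_pipelines


cluster = {0: Host(True), 1: Host(False)}
# one non-pipelining host disables pipelining for the whole map
assert all(cluster[n].supports_pipelines for n in cluster) is False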
def cleanup(days=30, project=None, chunk_size=1000, concurrency=1, **kwargs):
    """
    Deletes a portion of the trailing data in Sentry based on its creation
    date. For example, if ``days`` is 30, this would attempt to clean up
    all data that's older than 30 days.

    :param project: limit all deletion scopes to messages that are part of
                    the given project
    """
    import datetime

    from django.utils import timezone

    from sentry import app
    # TODO: TagKey and GroupTagKey need to be cleaned up
    from sentry.models import (
        Group, GroupRuleStatus, Event, EventMapping, GroupTagValue,
        TagValue, Alert, Activity, LostPasswordHash)
    from sentry.search.django.models import SearchDocument

    GENERIC_DELETES = (
        (SearchDocument, 'date_changed'),
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (Event, 'datetime'),
        (Activity, 'datetime'),
        (TagValue, 'last_seen'),
        (Alert, 'datetime'),
        (EventMapping, 'date_added'),
        # Group should probably be last
        (Group, 'last_seen'),
    )

    ts = timezone.now() - datetime.timedelta(days=days)

    logger.info("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - datetime.timedelta(hours=48)
    ).delete()

    # TODO: we should move this into individual backends
    if not project:
        logger.info("Removing old Node values")
        try:
            app.nodestore.cleanup(ts)
        except NotImplementedError:
            logger.warning("Node backend does not support cleanup operation")

    # Remove types which can easily be bound to project + date
    for model, date_col in GENERIC_DELETES:
        logger.info("Removing %s for days=%s project=%s", model.__name__, days, project or '*')
        qs = model.objects.filter(**{'%s__lte' % (date_col,): ts})
        if project:
            qs = qs.filter(project=project)

        # XXX: we step through because the deletion collector will pull all relations into memory
        count = 0
        while qs.exists():
            logger.info("Removing %s chunk %d", model.__name__, count)
            if concurrency > 1:
                worker_pool = ThreadPool(workers=concurrency)
                for obj in qs[:chunk_size].iterator():
                    worker_pool.add(obj.id, delete_object, [obj])
                    count += 1
                worker_pool.join()
                del worker_pool
            else:
                for obj in qs[:chunk_size].iterator():
                    delete_object(obj)
                    count += 1

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if days > 7:
        logger.info("Removing expired values for EventMapping")
        EventMapping.objects.filter(
            date_added__lte=timezone.now() - datetime.timedelta(days=7)
        ).delete()
def get_pool(self, commands):
    return ThreadPool(min(self._workers, len(commands)))
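# Sizing note: capping the pool at len(commands) avoids spawning threads
# that would never receive work. The same rule expressed with the stdlib
# executor, as a sketch (callers are expected to guard against an empty
# command list, as _execute does before building a pool):
from concurrent.futures import ThreadPoolExecutor


def get_executor(worker_cap, commands):
    return ThreadPoolExecutor(max_workers=min(worker_cap, len(commands)))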
def _execute(self):
    num_commands = len(self._commands)
    if num_commands == 0:
        self._commands = []
        return

    command_map = {}
    pipelined = all(self._cluster[n].supports_pipelines for n in self._cluster)
    pending_commands = defaultdict(list)

    # used in pipelining
    if pipelined:
        pipe_command_map = defaultdict(list)
        pipes = dict()  # db -> pipeline

    # build up a list of pending commands and their routing information
    for command in self._commands:
        cmd_ident = command._ident
        command_map[cmd_ident] = command

        if self._cluster.router:
            db_nums = self._cluster.router.get_dbs(self._cluster, command._attr, *command._args, **command._kwargs)
        else:
            db_nums = self._cluster.keys()

        # The number of commands is based on the total number of executable commands
        num_commands += len(db_nums)

        # Don't bother with the pooling if we only need to do one operation on a single machine
        if num_commands == 1:
            self._commands = [command._execute(self._cluster[n]) for n in db_nums]
            return

        # update the pipelined dbs
        for db_num in db_nums:
            # map the ident to a db
            if pipelined:
                pipe_command_map[db_num].append(cmd_ident)

            # add to pending commands
            pending_commands[db_num].append(command)

    # Create the threadpool and pipe jobs into it
    pool = ThreadPool(min(self._workers, len(pending_commands)))

    # execute our pending commands either in the pool, or using a pipeline
    for db_num, command_list in pending_commands.iteritems():
        if pipelined:
            pipes[db_num] = self._cluster[db_num].get_pipeline()

        for command in command_list:
            if pipelined:
                # add to pipeline
                pipes[db_num].add(command)
            else:
                # execute in pool
                pool.add(command._ident, command._execute, [self._cluster[db_num]])

    # We need to finalize our commands with a single execute in pipelines
    if pipelined:
        for db, pipe in pipes.iteritems():
            pool.add(db, pipe.execute, (), {})

    # Consolidate commands with their appropriate results
    result_map = pool.join()

    # Results get grouped by their command signature, so we have to separate the logic
    if pipelined:
        for db, result in result_map.iteritems():
            if len(result) == 1:
                result = result[0]
            for i, value in enumerate(result):
                command_map[pipe_command_map[db][i]]._set_value(value)
    else:
        for command in self._commands:
            result = result_map[command._ident]
            if len(result) == 1:
                result = result[0]
            command._set_value(result)

    self._complete = True
def cleanup(days=30, project=None, chunk_size=1000, concurrency=1, **kwargs):
    """
    Deletes a portion of the trailing data in Sentry based on its creation
    date. For example, if ``days`` is 30, this would attempt to clean up
    all data that's older than 30 days.

    :param project: limit all deletion scopes to messages that are part of
                    the given project
    """
    import datetime

    from django.utils import timezone

    from sentry import app
    # TODO: TagKey and GroupTagKey need to be cleaned up
    from sentry.models import (
        Group, GroupRuleStatus, Event, EventMapping, GroupTagValue,
        TagValue, Alert, Activity, LostPasswordHash)
    from sentry.search.django.models import SearchDocument

    GENERIC_DELETES = (
        (SearchDocument, 'date_changed'),
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (Event, 'datetime'),
        (Activity, 'datetime'),
        (TagValue, 'last_seen'),
        (Alert, 'datetime'),
        (EventMapping, 'date_added'),
        # Group should probably be last
        (Group, 'last_seen'),
    )

    log = cleanup.get_logger()

    ts = timezone.now() - datetime.timedelta(days=days)

    log.info("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - datetime.timedelta(hours=48)
    ).delete()

    # TODO: we should move this into individual backends
    log.info("Removing old Node values")
    try:
        app.nodestore.cleanup(ts)
    except NotImplementedError:
        log.warning("Node backend does not support cleanup operation")

    # Remove types which can easily be bound to project + date
    for model, date_col in GENERIC_DELETES:
        log.info("Removing %s for days=%s project=%s", model.__name__, days, project or '*')
        qs = model.objects.filter(**{'%s__lte' % (date_col,): ts})
        if project:
            qs = qs.filter(project=project)

        # XXX: we step through because the deletion collector will pull all relations into memory
        count = 0
        while qs.exists():
            log.info("Removing %s chunk %d", model.__name__, count)
            if concurrency > 1:
                worker_pool = ThreadPool(workers=concurrency)
                for obj in qs[:chunk_size].iterator():
                    worker_pool.add(obj.id, delete_object, [obj])
                    count += 1
                worker_pool.join()
                del worker_pool
            else:
                for obj in qs[:chunk_size].iterator():
                    delete_object(obj)
                    count += 1

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if days > 7:
        log.info("Removing expired values for EventMapping")
        EventMapping.objects.filter(
            date_added__lte=timezone.now() - datetime.timedelta(days=7)
        ).delete()