def handle(self, **options):
    self.days = options['days']
    self.concurrency = options['concurrency']
    self.project = options['project']

    self.stdout.write("Removing expired values for LostPasswordHash\n")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - timedelta(hours=48)
    ).delete()

    if self.project:
        self.stderr.write("Bulk NodeStore deletion not available for project selection\n")
    else:
        self.stdout.write("Removing old NodeStore values\n")
        cutoff = timezone.now() - timedelta(days=self.days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            self.stderr.write("NodeStore backend does not support cleanup operation\n")

    for model, dtfield in self.BULK_DELETES:
        self.stdout.write("Removing {model} for days={days} project={project}\n".format(
            model=model.__name__,
            days=self.days,
            project=self.project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=self.days,
            project_id=self.project,
        ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    self.stdout.write("Removing expired values for EventMapping\n")
    BulkDeleteQuery(
        model=EventMapping,
        dtfield='date_added',
        days=min(self.days, 7),
        project_id=self.project,
    ).execute()

    for model, dtfield in self.GENERIC_DELETES:
        self.stdout.write("Removing {model} for days={days} project={project}\n".format(
            model=model.__name__,
            days=self.days,
            project=self.project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=self.days,
            project_id=self.project,
        ).execute_generic()
def cleanup(self, cutoff_timestamp):
    import math

    from sentry.db.deletion import BulkDeleteQuery

    # Express the cutoff as a whole number of elapsed days, rounding down,
    # since BulkDeleteQuery takes `days` rather than a timestamp.
    total_seconds = (timezone.now() - cutoff_timestamp).total_seconds()
    days = math.floor(total_seconds / 86400)

    BulkDeleteQuery(model=Node, dtfield="timestamp", days=days).execute()
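# A minimal, dependency-free sketch of the cutoff-to-days conversion above
# (`Node`, `BulkDeleteQuery`, and Django's `timezone` are the snippet's own
# names and are not used here). Assuming the `dtfield__lte = now - days`
# filter the cleanup commands later in this section use, note that flooring
# shifts the effective threshold: a cutoff 3.5 days ago becomes days=3, so
# the delete filter (now - 3 days) reaches slightly past the requested cutoff.
import math
from datetime import datetime, timedelta, timezone as tz


def cutoff_to_days(cutoff, now=None):
    # Whole elapsed days between `now` and `cutoff`, rounded down.
    now = now or datetime.now(tz.utc)
    return int(math.floor((now - cutoff).total_seconds() / 86400))


now = datetime(2024, 1, 10, 12, 0, tzinfo=tz.utc)
assert cutoff_to_days(now - timedelta(days=3, hours=12), now=now) == 3
assert cutoff_to_days(now - timedelta(days=3), now=now) == 3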
def test_datetime_restriction(self):
    now = timezone.now()

    project1 = self.create_project()
    group1_1 = self.create_group(project1, last_seen=now - timedelta(days=1))
    group1_2 = self.create_group(project1, last_seen=now - timedelta(days=1))
    group1_3 = self.create_group(project1, last_seen=now)

    BulkDeleteQuery(model=Group, dtfield="last_seen", days=1).execute()

    assert not Group.objects.filter(id=group1_1.id).exists()
    assert not Group.objects.filter(id=group1_2.id).exists()
    assert Group.objects.filter(id=group1_3.id).exists()
def test_project_restriction(self):
    project1 = self.create_project()
    group1_1 = self.create_group(project1)
    group1_2 = self.create_group(project1)
    project2 = self.create_project()
    group2_1 = self.create_group(project2)
    group2_2 = self.create_group(project2)

    BulkDeleteQuery(model=Group, project_id=project1.id).execute()

    assert Project.objects.filter(id=project1.id).exists()
    assert Project.objects.filter(id=project2.id).exists()
    assert Group.objects.filter(id=group2_1.id).exists()
    assert Group.objects.filter(id=group2_2.id).exists()
    assert not Group.objects.filter(id=group1_1.id).exists()
    assert not Group.objects.filter(id=group1_2.id).exists()
def test_iteration(self):
    target_project = self.project
    expected_group_ids = set([self.create_group().id for i in range(2)])

    other_project = self.create_project()
    self.create_group(other_project)
    self.create_group(other_project)

    iterator = BulkDeleteQuery(
        model=Group,
        project_id=target_project.id,
        dtfield="last_seen",
        order_by="last_seen",
        days=0,
    ).iterator(1)

    results = set()
    for chunk in iterator:
        results.update(chunk)

    assert results == expected_group_ids
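# A hedged, dependency-free sketch of the chunked-iteration pattern the test
# above exercises: walk rows ordered by the datetime field and yield primary
# keys in fixed-size chunks. `BulkDeleteQuery.iterator` is the real API; this
# toy version over in-memory (pk, last_seen) tuples only illustrates the shape
# of its output.
from datetime import datetime


def iterate_chunks(rows, chunk_size):
    # rows: iterable of (pk, last_seen) pairs
    chunk = []
    for pk, last_seen in sorted(rows, key=lambda r: r[1]):
        chunk.append(pk)
        if len(chunk) >= chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


rows = [(1, datetime(2024, 1, 1)), (2, datetime(2024, 1, 2)), (3, datetime(2024, 1, 3))]
assert list(iterate_chunks(rows, 2)) == [[1, 2], [3]]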
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted; the default is 30
    days.  By default all projects are truncated, but if you want to limit
    this to a specific project you can pass the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    os.environ['_SENTRY_CLEANUP'] = '1'

    # Make sure we fork off multiprocessing pool
    # before we import or configure the app
    from multiprocessing import Process, JoinableQueue as Queue

    pool = []
    task_queue = Queue(1000)
    for _ in xrange(concurrency):
        p = Process(target=multiprocess_worker, args=(task_queue,))
        p.daemon = True
        p.start()
        pool.append(p)

    from sentry.runner import configure
    configure()

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.EventAttachment, 'date_added', None),
        (models.UserReport, 'date_added', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    # Note: the original condition `is_filtered(...) and not silent` deleted
    # filtered models whenever --silent was set; split the checks instead.
    if is_filtered(models.OrganizationMember):
        if not silent:
            click.echo('>> Skipping OrganizationMember')
    else:
        if not silent:
            click.echo('Removing expired values for OrganizationMember')
        expired_threshold = timezone.now() - timedelta(days=days)
        models.OrganizationMember.delete_expired(expired_threshold)

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo(u'Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo(u'>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(
                expires_at__lt=(timezone.now() - timedelta(days=days)),
            ).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo(
                "NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                )
            )

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            imp = '.'.join((model.__module__, model.__name__))

            q = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            )

            for chunk in q.iterator(chunk_size=100):
                task_queue.put((imp, chunk))

            task_queue.join()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    # Shut down our pool
    for _ in pool:
        task_queue.put(_STOP_WORKER)

    # And wait for it to drain
    for p in pool:
        p.join()

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
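# A self-contained sketch of the producer/worker wiring used above: daemon
# processes consume (import_path, chunk) tasks from a JoinableQueue,
# task_done() lets the producer's join() block until the queue drains, and a
# sentinel stops each worker. The real `multiprocess_worker` resolves the
# import path to a deletion task; this stand-in (and the None sentinel, chosen
# because it survives pickling) are assumptions for illustration only.
from multiprocessing import Process, JoinableQueue

_STOP_WORKER = None


def multiprocess_worker(task_queue):
    while True:
        task = task_queue.get()
        try:
            if task is _STOP_WORKER:
                return
            imp, chunk = task
            # ... resolve `imp` to a deletion task and delete `chunk` here ...
        finally:
            task_queue.task_done()


if __name__ == "__main__":
    q = JoinableQueue()
    workers = []
    for _ in range(2):
        p = Process(target=multiprocess_worker, args=(q,))
        p.daemon = True
        p.start()
        workers.append(p)

    for chunk in ([1, 2], [3]):
        q.put(("sentry.models.Group", chunk))
    q.join()                    # wait for in-flight tasks to drain

    for _ in workers:
        q.put(_STOP_WORKER)     # one sentinel per worker
    for p in workers:
        p.join()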
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted; the default is 30
    days.  By default all projects are truncated, but if you want to limit
    this to a specific project you can pass the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    os.environ['_SENTRY_CLEANUP'] = '1'

    # Make sure we fork off multiprocessing pool
    # before we import or configure the app
    from multiprocessing import Process, JoinableQueue as Queue

    pool = []
    task_queue = Queue(1000)
    for _ in xrange(concurrency):
        p = Process(target=multiprocess_worker, args=(task_queue,))
        p.daemon = True
        p.start()
        pool.append(p)

    from sentry.runner import configure
    configure()

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.EventAttachment, 'date_added', None),
        (models.UserReport, 'date_added', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    # Note: the original condition `is_filtered(...) and not silent` deleted
    # filtered models whenever --silent was set; split the checks instead.
    if is_filtered(models.OrganizationMember):
        if not silent:
            click.echo('>> Skipping OrganizationMember')
    else:
        if not silent:
            click.echo('Removing expired values for OrganizationMember')
        expired_threshold = timezone.now() - timedelta(days=days)
        models.OrganizationMember.delete_expired(expired_threshold)

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo(u'Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo(u'>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(
                expires_at__lt=(timezone.now() - timedelta(days=API_TOKEN_TTL_IN_DAYS)),
            ).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                u"Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            imp = '.'.join((model.__module__, model.__name__))

            q = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            )

            for chunk in q.iterator(chunk_size=100):
                task_queue.put((imp, chunk))

            task_queue.join()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    # Shut down our pool
    for _ in pool:
        task_queue.put(_STOP_WORKER)

    # And wait for it to drain
    for p in pool:
        p.join()

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router, sample_rate=1.0)
        click.echo("Clean up took %s second(s)." % duration)
def cleanup(days, project, concurrency, max_procs, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted; the default is 30
    days.  By default all projects are truncated, but if you want to limit
    this to a specific project you can pass the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    import math
    import multiprocessing
    import pickle
    import subprocess
    import sys

    import six

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = [
        (models.EventMapping, 'date_added', '-date_added'),
        (models.GroupHashTombstone, 'deleted_at', None),
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
    ] + EXTRA_BULK_QUERY_DELETES

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', 'datetime'),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation", err=True)

    for bqd in BULK_QUERY_DELETES:
        if len(bqd) == 4:
            model, dtfield, order_by, chunk_size = bqd
        else:
            chunk_size = 10000
            model, dtfield, order_by = bqd

        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute(chunk_size=chunk_size)

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            if concurrency > 1:
                shard_ids = range(concurrency)
                num_procs = min(multiprocessing.cpu_count(), max_procs)
                threads_per_proc = int(
                    math.ceil(concurrency / float(num_procs)))

                pids = []
                for shard_id_chunk in chunker(shard_ids, threads_per_proc):
                    pid = subprocess.Popen([
                        sys.argv[0],
                        'cleanup_chunk',
                        '--days', six.binary_type(days),
                    ] + (
                        ['--project_id', six.binary_type(project_id)] if project_id else []
                    ) + [
                        '--model', pickle.dumps(model),
                        '--dtfield', dtfield,
                        '--order_by', order_by,
                        '--num_shards', six.binary_type(concurrency),
                        '--shard_ids', ",".join([six.binary_type(s) for s in shard_id_chunk]),
                    ])
                    pids.append(pid)

                total_pid_count = len(pids)
                click.echo(
                    "%s concurrent processes forked, waiting on them to complete." % total_pid_count)

                complete = 0
                for pid in pids:
                    pid.wait()
                    complete += 1
                    click.echo("%s/%s concurrent processes are finished." %
                               (complete, total_pid_count))
            else:
                task = create_deletion_task(days, project_id, model, dtfield, order_by)
                _chunk_until_complete(task)

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
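# The subprocess fan-out above splits work by shard id. A hedged sketch of the
# likely partitioning scheme (an assumption; the real partitioning lives in the
# `cleanup_chunk` command and the deletions code): each shard claims the rows
# whose primary key maps to its shard id modulo `num_shards`, so the shards
# are disjoint and together cover every row.
def rows_for_shard(pks, num_shards, shard_id):
    return [pk for pk in pks if pk % num_shards == shard_id]


pks = range(10)
shards = [rows_for_shard(pks, 3, s) for s in range(3)]
assert sorted(pk for shard in shards for pk in shard) == list(range(10))
assert all(set(a).isdisjoint(b) for i, a in enumerate(shards) for b in shards[i + 1:])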
def cleanup(days, project, concurrency):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted; the default is 30
    days.  By default all projects are truncated, but if you want to limit
    this to a specific project you can pass the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        Event, EventMapping, Group, GroupRuleStatus, GroupTagValue,
        LostPasswordHash, TagValue, GroupEmailThread,
    )

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )
    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    click.echo("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - timedelta(hours=48)
    ).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        click.echo("Removing old NodeStore values")
        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo("NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield in BULK_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project_id,
        ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    click.echo("Removing expired values for EventMapping")
    BulkDeleteQuery(
        model=EventMapping,
        dtfield='date_added',
        days=min(days, 7),
        project_id=project_id,
    ).execute()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    click.echo("Cleaning up unused FileBlob references")
    cleanup_unused_files()

    for model, dtfield in GENERIC_DELETES:
        click.echo("Removing {model} for days={days} project={project}".format(
            model=model.__name__,
            days=days,
            project=project or '*',
        ))
        BulkDeleteQuery(
            model=model,
            dtfield=dtfield,
            days=days,
            project_id=project_id,
        ).execute_generic()
def cleanup(days, project, concurrency, silent, model, router, timed):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted; the default is 30
    days.  By default all projects are truncated, but if you want to limit
    this to a specific project you can pass the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    from threading import Thread
    from uuid import uuid4

    from django.db import router as db_router
    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry import deletions
    from sentry import models

    if timed:
        import time
        from sentry.utils import metrics
        start_time = time.time()

    # list of models which this query is restricted to
    model_list = {m.lower() for m in model}

    def is_filtered(model):
        if router is not None and db_router.db_for_write(model) != router:
            return True
        if not model_list:
            return False
        return model.__name__.lower() not in model_list

    # Deletions that use `BulkDeleteQuery` (and don't need to worry about child relations)
    # (model, datetime_field, order_by)
    BULK_QUERY_DELETES = (
        (models.GroupEmailThread, 'date', None),
        (models.GroupRuleStatus, 'date_added', None),
        (models.GroupTagValue, 'last_seen', None),
        (models.TagValue, 'last_seen', None),
        (models.EventTag, 'date_added', 'date_added'),
    )

    # Deletions that use the `deletions` code path (which handles their child relations)
    # (model, datetime_field, order_by)
    DELETES = (
        (models.Event, 'datetime', None),
        (models.Group, 'last_seen', 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered(models.LostPasswordHash):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        models.LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [models.ApiGrant, models.ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo(
            "Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        # Note: the original nested the cleanup under `else: cutoff = ...`,
        # so NodeStore was only cleaned when --silent was set; the cleanup
        # must run regardless of the echo.
        if not silent:
            click.echo("Removing old NodeStore values")

        cutoff = timezone.now() - timedelta(days=days)
        try:
            nodestore.cleanup(cutoff)
        except NotImplementedError:
            click.echo(
                "NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield, order_by in BULK_QUERY_DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
                order_by=order_by,
            ).execute()

    for model, dtfield, order_by in DELETES:
        if not silent:
            click.echo(
                "Removing {model} for days={days} project={project}".format(
                    model=model.__name__,
                    days=days,
                    project=project or '*',
                ))

        if is_filtered(model):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            query = {
                '{}__lte'.format(dtfield): (timezone.now() - timedelta(days=days)),
            }

            if project_id:
                if 'project' in model._meta.get_all_field_names():
                    query['project'] = project_id
                else:
                    query['project_id'] = project_id

            task = deletions.get(
                model=model,
                query=query,
                order_by=order_by,
                transaction_id=uuid4().hex,
            )

            def _chunk_until_complete(num_shards=None, shard_id=None):
                has_more = True
                while has_more:
                    has_more = task.chunk(num_shards=num_shards, shard_id=shard_id)

            if concurrency > 1:
                threads = []
                for shard_id in range(concurrency):
                    t = Thread(
                        target=(lambda shard_id=shard_id: _chunk_until_complete(
                            num_shards=concurrency, shard_id=shard_id)))
                    t.start()
                    threads.append(t)

                for t in threads:
                    t.join()
            else:
                _chunk_until_complete()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if not silent:
        click.echo("Removing expired values for EventMapping")
    if is_filtered(models.EventMapping):
        if not silent:
            click.echo('>> Skipping EventMapping')
    else:
        BulkDeleteQuery(
            model=models.EventMapping,
            dtfield='date_added',
            days=min(days, 7),
            project_id=project_id,
            order_by='-date_added',
        ).execute()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered(models.FileBlob):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    if timed:
        duration = int(time.time() - start_time)
        metrics.timing('cleanup.duration', duration, instance=router)
        click.echo("Clean up took %s second(s)." % duration)
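# A self-contained sketch of the `_chunk_until_complete` loop above: a deletion
# task's chunk() removes up to one batch of rows and reports whether more
# remain, so the driver simply loops (optionally once per shard/thread).
# `FakeTask` is a stand-in for whatever `deletions.get(...)` returns; its batch
# arithmetic is an assumption for illustration.
class FakeTask(object):
    def __init__(self, total, chunk_size=100):
        self.remaining = total
        self.chunk_size = chunk_size

    def chunk(self, num_shards=None, shard_id=None):
        # "Delete" one batch; True means "call me again".
        self.remaining = max(0, self.remaining - self.chunk_size)
        return self.remaining > 0


task = FakeTask(total=250)
calls = 0
has_more = True
while has_more:
    has_more = task.chunk()
    calls += 1
assert calls == 3 and task.remaining == 0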
def auto_remove_inbox():
    BulkDeleteQuery(model=GroupInbox, days=7, dtfield="date_added").execute()
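# For orientation: given the `dtfield__lte = now - days` filtering shown in
# the cleanup commands above, the periodic task is roughly equivalent to the
# single ORM call below. This is a sketch of the semantics only, not the real
# implementation (which deletes in bounded chunks), and it assumes a
# configured Django/Sentry environment.
def auto_remove_inbox_equivalent():
    # Deferred imports so this sketch stays importable without settings.
    from datetime import timedelta

    from django.utils import timezone
    from sentry.models import GroupInbox

    GroupInbox.objects.filter(
        date_added__lte=timezone.now() - timedelta(days=7)
    ).delete()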
def cleanup(days, project, concurrency, silent, model):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted; the default is 30
    days.  By default all projects are truncated, but if you want to limit
    this to a specific project you can pass the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    from threading import Thread

    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        ApiGrant, ApiToken, Event, EventMapping, Group, GroupRuleStatus,
        GroupTagValue, LostPasswordHash, TagValue, GroupEmailThread,
    )

    models = {m.lower() for m in model}

    def is_filtered(model):
        if not models:
            return False
        return model.lower() not in models

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )
    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered('LostPasswordHash'):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [ApiGrant, ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo("Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")
        if is_filtered('NodeStore'):
            if not silent:
                click.echo('>> Skipping NodeStore')
        else:
            cutoff = timezone.now() - timedelta(days=days)
            try:
                nodestore.cleanup(cutoff)
            except NotImplementedError:
                click.echo("NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield in BULK_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if not silent:
        click.echo("Removing expired values for EventMapping")
    if is_filtered('EventMapping'):
        if not silent:
            click.echo('>> Skipping EventMapping')
    else:
        query = BulkDeleteQuery(
            model=EventMapping,
            dtfield='date_added',
            days=min(days, 7),
            project_id=project_id,
        )
        if concurrency > 1:
            if not silent:
                click.echo("Running %d concurrent threads" % concurrency)
            threads = []
            for shard_id in range(concurrency):
                t = Thread(
                    target=lambda shard_id=shard_id: query.execute_sharded(concurrency, shard_id))
                t.start()
                threads.append(t)
            for t in threads:
                t.join()
            if not silent:
                click.echo("OK! %d concurrent threads finished" % concurrency)
        else:
            query.execute_generic()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered('FileBlob'):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    for model, dtfield in GENERIC_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            query = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            )
            if concurrency > 1:
                threads = []
                for shard_id in range(concurrency):
                    t = Thread(
                        target=lambda shard_id=shard_id: query.execute_sharded(concurrency, shard_id))
                    t.start()
                    threads.append(t)
                for t in threads:
                    t.join()
            else:
                query.execute_generic()
def cleanup(days, project, concurrency, silent, model):
    """Delete a portion of trailing data based on creation date.

    All data that is older than `--days` will be deleted; the default is 30
    days.  By default all projects are truncated, but if you want to limit
    this to a specific project you can pass the `--project` flag, which
    accepts a project ID or a string of the form `org/project` where both
    are slugs.
    """
    if concurrency < 1:
        click.echo('Error: Minimum concurrency is 1', err=True)
        raise click.Abort()

    from threading import Thread

    from sentry.app import nodestore
    from sentry.db.deletion import BulkDeleteQuery
    from sentry.models import (
        ApiGrant, ApiToken, Event, EventMapping, Group, GroupRuleStatus,
        GroupTagValue, LostPasswordHash, TagValue, GroupEmailThread,
    )

    models = {m.lower() for m in model}

    def is_filtered(model):
        if not models:
            return False
        return model.lower() not in models

    # these models should be safe to delete without cascades, in order
    BULK_DELETES = (
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (TagValue, 'last_seen'),
        (GroupEmailThread, 'date'),
    )
    GENERIC_DELETES = (
        (Event, 'datetime'),
        (Group, 'last_seen'),
    )

    if not silent:
        click.echo('Removing expired values for LostPasswordHash')

    if is_filtered('LostPasswordHash'):
        if not silent:
            click.echo('>> Skipping LostPasswordHash')
    else:
        LostPasswordHash.objects.filter(
            date_added__lte=timezone.now() - timedelta(hours=48)
        ).delete()

    for model in [ApiGrant, ApiToken]:
        if not silent:
            click.echo('Removing expired values for {}'.format(model.__name__))

        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping {}'.format(model.__name__))
        else:
            model.objects.filter(expires_at__lt=timezone.now()).delete()

    project_id = None
    if project:
        click.echo("Bulk NodeStore deletion not available for project selection", err=True)
        project_id = get_project(project)
        if project_id is None:
            click.echo('Error: Project not found', err=True)
            raise click.Abort()
    else:
        if not silent:
            click.echo("Removing old NodeStore values")
        if is_filtered('NodeStore'):
            if not silent:
                click.echo('>> Skipping NodeStore')
        else:
            cutoff = timezone.now() - timedelta(days=days)
            try:
                nodestore.cleanup(cutoff)
            except NotImplementedError:
                click.echo("NodeStore backend does not support cleanup operation", err=True)

    for model, dtfield in BULK_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            ).execute()

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if not silent:
        click.echo("Removing expired values for EventMapping")
    if is_filtered('EventMapping'):
        if not silent:
            click.echo('>> Skipping EventMapping')
    else:
        BulkDeleteQuery(
            model=EventMapping,
            dtfield='date_added',
            days=min(days, 7),
            project_id=project_id,
        ).execute()

    # Clean up FileBlob instances which are no longer used and aren't super
    # recent (as there could be a race between blob creation and reference)
    if not silent:
        click.echo("Cleaning up unused FileBlob references")
    if is_filtered('FileBlob'):
        if not silent:
            click.echo('>> Skipping FileBlob')
    else:
        cleanup_unused_files(silent)

    for model, dtfield in GENERIC_DELETES:
        if not silent:
            click.echo("Removing {model} for days={days} project={project}".format(
                model=model.__name__,
                days=days,
                project=project or '*',
            ))
        if is_filtered(model.__name__):
            if not silent:
                click.echo('>> Skipping %s' % model.__name__)
        else:
            query = BulkDeleteQuery(
                model=model,
                dtfield=dtfield,
                days=days,
                project_id=project_id,
            )
            if concurrency > 1:
                threads = []
                for shard_id in range(concurrency):
                    t = Thread(
                        target=lambda shard_id=shard_id: query.execute_sharded(concurrency, shard_id))
                    t.start()
                    threads.append(t)
                for t in threads:
                    t.join()
            else:
                query.execute_generic()