Example #1
    def _execute(self):
        num_commands = len(self._commands)
        if num_commands == 0:
            self._results = []
            return

        pool = None

        for command in self._commands:
            if self._cluster.router:
                db_nums = self._cluster.router.get_db(self._cluster, command._attr, *command._args, **command._kwargs)
            else:
                db_nums = range(len(self._cluster))

            num_commands += len(db_nums)

            # Don't bother with the ThreadPool if we only need to do one operation
            if num_commands == 1:
                self._results = [getattr(self._cluster[db_num], command._attr)(*command._args, **command._kwargs) for db_num in db_nums]
                return

            elif not pool:
                pool = ThreadPool(self._workers)

            for db_num in db_nums:
                pool.add(command._ident, getattr(self._cluster[db_num], command._attr), command._args, command._kwargs)

        result_map = pool.join()
        for command in self._commands:
            result = result_map[command._ident]
            if len(result) == 1:
                result = result[0]
            command._wrapped = result

        self._complete = True
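
Every snippet on this page drives the same minimal ThreadPool contract: add(ident, func, args, kwargs) queues a job under an identifier, and a blocking join() returns a dict mapping each ident to the list of results produced for it. The stand-in below (SimpleThreadPool is an illustrative name, not the nydus class) sketches just that contract on the standard library; the real nydus.utils.ThreadPool differs in details such as shutdown and error handling.

# Minimal sketch of the add()/join() contract assumed by the snippets on
# this page. Not the nydus implementation: worker lifecycle, shutdown and
# error handling are simplified.
from collections import defaultdict
from threading import Thread

try:
    from queue import Queue    # Python 3
except ImportError:
    from Queue import Queue    # Python 2, matching the era of these snippets


class SimpleThreadPool(object):
    def __init__(self, workers=10):
        self._queue = Queue()
        self._results = defaultdict(list)
        for _ in range(workers):
            t = Thread(target=self._worker)
            t.daemon = True
            t.start()

    def _worker(self):
        while True:
            ident, func, args, kwargs = self._queue.get()
            try:
                self._results[ident].append(func(*args, **(kwargs or {})))
            finally:
                self._queue.task_done()

    def add(self, ident, func, args=(), kwargs=None):
        # queue one job under `ident`; results are grouped by this key
        self._queue.put((ident, func, args, kwargs))

    def join(self):
        # block until every queued job has run, then hand back results
        # grouped by ident, which is what the callers above expect
        self._queue.join()
        return dict(self._results)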
Example #2
    def generic_delete(self, model, dtfield, days=None, chunk_size=1000):
        if days is None:
            days = self.days

        cutoff = timezone.now() - timedelta(days=days)

        qs = model.objects.filter(**{'%s__lte' % (dtfield,): cutoff})
        if self.project:
            qs = qs.filter(project=self.project)

        # XXX: we step through because the deletion collector will pull all
        # relations into memory
        count = 0
        while qs.exists():
            # TODO(dcramer): change this to delete by chunks of IDs and utilize
            # bulk_delete_objects
            self.stdout.write("Removing {model} chunk {count}\n".format(
                model=model.__name__,
                count=count,
            ))
            if self.concurrency > 1:
                worker_pool = ThreadPool(workers=self.concurrency)
                for obj in qs[:chunk_size].iterator():
                    worker_pool.add(obj.id, delete_object, [obj])
                    count += 1
                worker_pool.join()
                del worker_pool
            else:
                for obj in qs[:chunk_size].iterator():
                    delete_object(obj)
                    count += 1
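
The TODO in this snippet points at deleting by chunks of primary keys instead of instantiating every object. Below is a rough sketch of that idea using only standard Django ORM calls; delete_in_id_chunks and its arguments are illustrative names, and the Sentry-internal bulk_delete_objects helper mentioned in the TODO is not reproduced here.

# Sketch of the "delete by chunks of IDs" idea from the TODO above; names
# are illustrative, not Sentry's actual helpers.
from datetime import timedelta

from django.utils import timezone


def delete_in_id_chunks(model, dtfield, days, chunk_size=1000):
    cutoff = timezone.now() - timedelta(days=days)
    qs = model.objects.filter(**{'%s__lte' % dtfield: cutoff})
    while True:
        # fetch only primary keys for one chunk to keep memory use flat
        id_chunk = list(qs.values_list('id', flat=True)[:chunk_size])
        if not id_chunk:
            break
        # one DELETE per chunk; the deletion collector still follows
        # relations, but its working set is bounded by chunk_size
        model.objects.filter(id__in=id_chunk).delete()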
Example #3
    def _execute(self):
        num_commands = len(self._commands)
        if num_commands == 0:
            self._results = []
            return
        command_map = {}
        piped_dbs = defaultdict(list)
        pool = None

        # 0x00 build up a collection of pipes
        pipes = self._cluster.pipeline()

        # 0x01 run commands on all pipes
        for command in self._commands:
            command_map[command._ident] = command
            if self._cluster.router:
                db_nums = self._cluster.router.get_db(self._cluster, command._attr, *command._args, **command._kwargs)
            else:
                db_nums = range(len(self._cluster))
            # update the pipelined dbs and queue the call on each pipeline
            for n in db_nums:
                piped_dbs[n].append(command._ident)
                getattr(pipes[n], command._attr)(*command._args, **command._kwargs)

        # 0x02 execute pipes in thread pool
        pool = ThreadPool(self._workers)
        for db in piped_dbs:
            pool.add(db, pipes[db].execute, (), {})

        # 0x03 consolidate commands with their appropriate results
        result_map = pool.join()
        for db, result in result_map.iteritems():
            if len(result) == 1:
                result = result[0]
            for i, value in enumerate(result):
                command_map[piped_dbs[db][i]]._wrapped = value

        self._complete = True
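
To make the bookkeeping in this pipelined variant concrete: commands are grouped per db, each db's pipeline executes once, and the flat result list comes back in queue order, so zipping it against the idents recorded for that db re-associates values with commands. The self-contained demo below uses a stand-in FakePipeline, not the nydus classes.

# Stand-in demo of the result bookkeeping in _execute above: group command
# idents per db, execute one pipeline per db, then zip results back to the
# idents in queue order. FakePipeline is a placeholder, not a nydus class.
from collections import defaultdict


class FakePipeline(object):
    def __init__(self, db_num):
        self.db_num = db_num
        self.calls = []

    def get(self, key):
        self.calls.append(key)
        return self

    def execute(self):
        # one result per queued call, in the order the calls were queued
        return ['db%s:%s' % (self.db_num, key) for key in self.calls]


commands = [('cmd-1', 'foo'), ('cmd-2', 'bar'), ('cmd-3', 'baz')]
route = {'cmd-1': [0], 'cmd-2': [0, 1], 'cmd-3': [1]}  # ident -> db_nums

piped_dbs = defaultdict(list)   # db -> [ident, ...] in queue order
pipes = {}                      # db -> pipeline

for ident, key in commands:
    for db in route[ident]:
        pipes.setdefault(db, FakePipeline(db)).get(key)
        piped_dbs[db].append(ident)

results = defaultdict(list)
for db, pipe in pipes.items():
    # zip preserves queue order, which maps each value back to its ident
    for ident, value in zip(piped_dbs[db], pipe.execute()):
        results[ident].append(value)

print(dict(results))
# e.g. {'cmd-1': ['db0:foo'], 'cmd-2': ['db0:bar', 'db1:bar'], 'cmd-3': ['db1:baz']}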
Example #4
    def _execute(self):
        num_commands = len(self._commands)
        if num_commands == 0:
            self._commands = []
            return

        command_map = {}
        pipelined = all(self._cluster[n].supports_pipelines
                        for n in self._cluster)
        pending_commands = defaultdict(list)

        # used in pipelining
        if pipelined:
            pipe_command_map = defaultdict(list)

            pipes = dict()  # db -> pipeline

        # build up a list of pending commands and their routing information
        for command in self._commands:
            cmd_ident = command._ident

            command_map[cmd_ident] = command

            if self._cluster.router:
                db_nums = self._cluster.router.get_db(self._cluster,
                                                      command._attr,
                                                      *command._args,
                                                      **command._kwargs)
            else:
                db_nums = range(len(self._cluster))

            # The number of commands is based on the total number of executable commands
            num_commands += len(db_nums)

            # Don't bother with the pooling if we only need to do one operation on a single machine
            if num_commands == 1:
                self._commands = [
                    command._execute(self._cluster[n]) for n in db_nums
                ]
                return

            # update the pipelined dbs
            for db_num in db_nums:
                # map the ident to a db
                if pipelined:
                    pipe_command_map[db_num].append(cmd_ident)

                # add to pending commands
                pending_commands[db_num].append(command)

        # Create the threadpool and pipe jobs into it
        pool = ThreadPool(min(self._workers, len(pending_commands)))

        # execute our pending commands either in the pool, or using a pipeline
        for db_num, command_list in pending_commands.iteritems():
            if pipelined:
                pipes[db_num] = self._cluster[db_num].get_pipeline()
            for command in command_list:
                if pipelined:
                    # add to pipeline
                    pipes[db_num].add(command)
                else:
                    # execute in pool
                    pool.add(command._ident, command._execute,
                             [self._cluster[db_num]])

        # We need to finalize our commands with a single execute in pipelines
        if pipelined:
            for db, pipe in pipes.iteritems():
                pool.add(db, pipe.execute, (), {})

        # Consolidate commands with their appropriate results
        result_map = pool.join()

        # Results get grouped by their command signature, so we have to separate the logic
        if pipelined:
            for db, result in result_map.iteritems():
                if len(result) == 1:
                    result = result[0]
                for i, value in enumerate(result):
                    command_map[pipe_command_map[db][i]]._set_value(value)

        else:
            for command in self._commands:
                result = result_map[command._ident]
                if len(result) == 1:
                    result = result[0]
                command._set_value(result)

        self._complete = True
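
For context on how _execute is normally reached: per the nydus README, a cluster is built with create_cluster and the distributed map is driven through the cluster's map() context manager, with each call inside the block queued as an EventualCommand and resolved when the block exits. The configuration below follows the README; treat the exact config keys and router paths as version-dependent.

# How this code path is usually exercised (following the nydus README;
# config keys and router paths vary across nydus versions).
from nydus.db import create_cluster

redis = create_cluster({
    'backend': 'nydus.db.backends.redis.Redis',
    'router': 'nydus.db.routers.keyvalue.PartitionRouter',
    'hosts': {
        0: {'db': 0},
        1: {'db': 1},
    },
})

with redis.map() as conn:
    # each call returns an EventualCommand; values are filled in by
    # _execute when the with-block exits
    results = [conn.incr('counter:%d' % i) for i in range(10)]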
Example #5
def cleanup(days=30, project=None, chunk_size=1000, concurrency=1, **kwargs):
    """
    Deletes a portion of the trailing data in Sentry based on
    their creation dates. For example, if ``days`` is 30, this
    would attempt to clean up all data that's older than 30 days.

    :param project: limit all deletion scopes to messages that are part
                    of the given project
    """
    import datetime

    from django.utils import timezone

    from sentry import app
    # TODO: TagKey and GroupTagKey need cleaned up
    from sentry.models import (
        Group, GroupRuleStatus, Event, EventMapping,
        GroupTagValue, TagValue, Alert,
        Activity, LostPasswordHash)
    from sentry.search.django.models import SearchDocument

    GENERIC_DELETES = (
        (SearchDocument, 'date_changed'),
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (Event, 'datetime'),
        (Activity, 'datetime'),
        (TagValue, 'last_seen'),
        (Alert, 'datetime'),
        (EventMapping, 'date_added'),
        # Group should probably be last
        (Group, 'last_seen'),
    )

    ts = timezone.now() - datetime.timedelta(days=days)

    logger.info("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - datetime.timedelta(hours=48)
    ).delete()

    # TODO: we should move this into individual backends
    if not project:
        logger.info("Removing old Node values")
        try:
            app.nodestore.cleanup(ts)
        except NotImplementedError:
            logger.warning("Node backend does not support cleanup operation")

    # Remove types which can easily be bound to project + date
    for model, date_col in GENERIC_DELETES:
        logger.info("Removing %s for days=%s project=%s", model.__name__, days, project or '*')
        qs = model.objects.filter(**{'%s__lte' % (date_col,): ts})
        if project:
            qs = qs.filter(project=project)
        # XXX: we step through because the deletion collector will pull all relations into memory

        count = 0
        while qs.exists():
            logger.info("Removing %s chunk %d", model.__name__, count)
            if concurrency > 1:
                worker_pool = ThreadPool(workers=concurrency)
                for obj in qs[:chunk_size].iterator():
                    worker_pool.add(obj.id, delete_object, [obj])
                    count += 1
                worker_pool.join()
                del worker_pool
            else:
                for obj in qs[:chunk_size].iterator():
                    delete_object(obj)
                    count += 1

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if days > 7:
        logger.info("Removing expired values for EventMapping")
        EventMapping.objects.filter(
            date_added__lte=timezone.now() - datetime.timedelta(days=7)
        ).delete()
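
A brief usage note: this task only needs the arguments visible in its signature, so a direct call looks like the sketch below. In a real deployment it typically runs on a schedule (or via the sentry cleanup command) rather than by hand; some_project is a placeholder.

# direct invocation using only the parameters in the signature above
cleanup(days=30, concurrency=4)           # global cleanup, 4 worker threads
cleanup(days=7, project=some_project.id)  # scope deletions to one project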
Example #6
File: map.py Project: tupy/nydus
    def get_pool(self, commands):
        return ThreadPool(min(self._workers, len(commands)))
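
A small note on get_pool: the min() simply caps the pool at the number of queued commands so idle worker threads are never created. The numbers below are made up for illustration.

# illustration of the min() cap with made-up numbers
workers = 16
commands = ['incr a', 'incr b', 'incr c']
assert min(workers, len(commands)) == 3   # only 3 threads for 3 commands
assert min(2, len(commands)) == 2         # a small workers setting still bounds it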
Example #7
    def _execute(self):
        num_commands = len(self._commands)
        if num_commands == 0:
            self._commands = []
            return

        command_map = {}
        pipelined = all(self._cluster[n].supports_pipelines for n in self._cluster)
        pending_commands = defaultdict(list)

        # used in pipelining
        if pipelined:
            pipe_command_map = defaultdict(list)

            pipes = dict()  # db -> pipeline

        # build up a list of pending commands and their routing information
        for command in self._commands:
            cmd_ident = command._ident

            command_map[cmd_ident] = command

            if self._cluster.router:
                db_nums = self._cluster.router.get_dbs(self._cluster, command._attr, *command._args, **command._kwargs)
            else:
                db_nums = self._cluster.keys()

            # The number of commands is based on the total number of executable commands
            num_commands += len(db_nums)

            # Don't bother with the pooling if we only need to do one operation on a single machine
            if num_commands == 1:
                self._commands = [command._execute(self._cluster[n]) for n in db_nums]
                return

            # update the pipelined dbs
            for db_num in db_nums:
                # map the ident to a db
                if pipelined:
                    pipe_command_map[db_num].append(cmd_ident)

                # add to pending commands
                pending_commands[db_num].append(command)

        # Create the threadpool and pipe jobs into it
        pool = ThreadPool(min(self._workers, len(pending_commands)))

        # execute our pending commands either in the pool, or using a pipeline
        for db_num, command_list in pending_commands.iteritems():
            if pipelined:
                pipes[db_num] = self._cluster[db_num].get_pipeline()
            for command in command_list:
                if pipelined:
                    # add to pipeline
                    pipes[db_num].add(command)
                else:
                    # execute in pool
                    pool.add(command._ident, command._execute, [self._cluster[db_num]])

        # We need to finalize our commands with a single execute in pipelines
        if pipelined:
            for db, pipe in pipes.iteritems():
                pool.add(db, pipe.execute, (), {})

        # Consolidate commands with their appropriate results
        result_map = pool.join()

        # Results get grouped by their command signature, so we have to separate the logic
        if pipelined:
            for db, result in result_map.iteritems():
                if len(result) == 1:
                    result = result[0]
                for i, value in enumerate(result):
                    command_map[pipe_command_map[db][i]]._set_value(value)

        else:
            for command in self._commands:
                result = result_map[command._ident]
                if len(result) == 1:
                    result = result[0]
                command._set_value(result)

        self._complete = True
Example #8
def cleanup(days=30, project=None, chunk_size=1000, concurrency=1, **kwargs):
    """
    Deletes a portion of the trailing data in Sentry based on
    their creation dates. For example, if ``days`` is 30, this
    would attempt to clean up all data that's older than 30 days.

    :param project: limit all deletion scopes to messages that are part
                    of the given project
    """
    import datetime

    from django.utils import timezone

    from sentry import app
    # TODO: TagKey and GroupTagKey need cleaned up
    from sentry.models import (
        Group, GroupRuleStatus, Event, EventMapping,
        GroupTagValue, TagValue, Alert,
        Activity, LostPasswordHash)
    from sentry.search.django.models import SearchDocument

    GENERIC_DELETES = (
        (SearchDocument, 'date_changed'),
        (GroupRuleStatus, 'date_added'),
        (GroupTagValue, 'last_seen'),
        (Event, 'datetime'),
        (Activity, 'datetime'),
        (TagValue, 'last_seen'),
        (Alert, 'datetime'),
        (EventMapping, 'date_added'),
        # Group should probably be last
        (Group, 'last_seen'),
    )

    log = cleanup.get_logger()

    ts = timezone.now() - datetime.timedelta(days=days)

    log.info("Removing expired values for LostPasswordHash")
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - datetime.timedelta(hours=48)
    ).delete()

    # TODO: we should move this into individual backends
    log.info("Removing old Node values")
    try:
        app.nodestore.cleanup(ts)
    except NotImplementedError:
        log.warning("Node backend does not support cleanup operation")

    # Remove types which can easily be bound to project + date
    for model, date_col in GENERIC_DELETES:
        log.info("Removing %s for days=%s project=%s", model.__name__, days, project or '*')
        qs = model.objects.filter(**{'%s__lte' % (date_col,): ts})
        if project:
            qs = qs.filter(project=project)
        # XXX: we step through because the deletion collector will pull all relations into memory

        count = 0
        while qs.exists():
            log.info("Removing %s chunk %d", model.__name__, count)
            if concurrency > 1:
                worker_pool = ThreadPool(workers=concurrency)
                for obj in qs[:chunk_size].iterator():
                    worker_pool.add(obj.id, delete_object, [obj])
                    count += 1
                worker_pool.join()
                del worker_pool
            else:
                for obj in qs[:chunk_size].iterator():
                    delete_object(obj)
                    count += 1

    # EventMapping is fairly expensive and is special cased as it's likely you
    # won't need a reference to an event for nearly as long
    if days > 7:
        log.info("Removing expired values for EventMapping")
        EventMapping.objects.filter(
            date_added__lte=timezone.now() - datetime.timedelta(days=7)
        ).delete()