Example #1
def get_latest_events(group_hash_list):
    """
    Fetch the latest events for a collection of ``GroupHash`` instances.
    Returns a list of events (or ``None``) in the same order as the input
    sequence.
    """
    group_hashes_by_project_id = defaultdict(list)
    for group_hash in group_hash_list:
        group_hashes_by_project_id[group_hash.project_id].append(group_hash)

    events_by_group_hash = {}
    for project_id, group_hash_list_chunk in group_hashes_by_project_id.items():
        event_id_list = GroupHash.fetch_last_processed_event_id(
            [i.id for i in group_hash_list_chunk])
        event_by_event_id = {
            event.event_id: event
            for event in Event.objects.filter(
                project_id=project_id,
                event_id__in=filter(None, event_id_list),
            )
        }
        for group_hash, event_id in zip(group_hash_list_chunk, event_id_list):
            event = event_by_event_id.get(event_id)
            if event is not None and event.group_id == group_hash.group_id:
                events_by_group_hash[group_hash] = event

    return [
        events_by_group_hash.get(group_hash) for group_hash in group_hash_list
    ]
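The pattern above — bucket the inputs by project, resolve each bucket with one batched lookup, then re-emit results in the caller's original order (with ``None`` for anything unresolved) — can be sketched without the Django models. The helper name, the plain-tuple inputs, and the injected ``lookup_batch`` callable below are illustrative assumptions, not part of the original code:

from collections import defaultdict


def latest_by_key(items, lookup_batch):
    # ``items`` is a sequence of (project_id, hash_id) pairs; ``lookup_batch``
    # takes a project_id plus a list of hash_ids and returns a parallel list
    # of results (or None), mirroring fetch_last_processed_event_id above.
    by_project = defaultdict(list)
    for project_id, hash_id in items:
        by_project[project_id].append(hash_id)

    resolved = {}
    for project_id, hash_ids in by_project.items():
        for hash_id, result in zip(hash_ids, lookup_batch(project_id, hash_ids)):
            if result is not None:
                resolved[hash_id] = result

    # Preserve the caller's ordering, yielding None where nothing resolved.
    return [resolved.get(hash_id) for _, hash_id in items]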
Example #2
def get_latest_events(group_hash_list):
    """
    Fetch the latest events for a collection of ``GroupHash`` instances.
    Returns a list of events (or ``None``) in the same order as the input
    sequence.
    """
    group_hashes_by_project_id = defaultdict(list)
    for group_hash in group_hash_list:
        group_hashes_by_project_id[group_hash.project_id].append(group_hash)

    events_by_group_hash = {}
    for project_id, group_hash_list_chunk in group_hashes_by_project_id.items():
        event_id_list = GroupHash.fetch_last_processed_event_id(
            [i.id for i in group_hash_list_chunk])
        event_by_event_id = {
            event.event_id: event
            for event in Event.objects.filter(
                project_id=project_id,
                event_id__in=filter(None, event_id_list),
            )
        }
        for group_hash, event_id in zip(group_hash_list_chunk, event_id_list):
            event = event_by_event_id.get(event_id)
            if event is not None and event.group_id == group_hash.group_id:
                events_by_group_hash[group_hash] = event

    return [events_by_group_hash.get(group_hash) for group_hash in group_hash_list]
Example #3
    def test_missing_latest_event(self):
        user = self.create_user()
        group = self.create_group()
        hash = GroupHash.objects.create(
            project=group.project,
            group=group,
            hash='xyz',
        )

        GroupHash.record_last_processed_event_id(
            hash.id,
            ['invalid'],
        )

        result = serialize(hash, user=user)
        assert result['latestEvent'] is None
Example #4
    def test_fetch_and_record_last_processed_event_id(self):
        group = self.group

        grouphash = GroupHash.objects.create(
            project=group.project,
            group=group,
            hash='xyz',
        )

        GroupHash.record_last_processed_event_id(
            grouphash.id,
            'event',
        )

        assert GroupHash.fetch_last_processed_event_id(
            [grouphash.id, -1],
        ) == ['event', None]
Example #5
    def test_valid_latest_event(self):
        user = self.create_user()
        group = self.create_group()
        hash = GroupHash.objects.create(
            project=group.project,
            group=group,
            hash='xyz',
        )
        event = Event.objects.get(id=self.create_event(group=group).id)

        GroupHash.record_last_processed_event_id(
            hash.id,
            event.event_id,
        )

        result = serialize(hash, user=user)
        assert result['latestEvent'] == serialize(event, user=user)
Example #6
    def test_mismatched_latest_event(self):
        user = self.create_user()
        group = self.create_group()
        hash = GroupHash.objects.create(
            project=group.project,
            group=group,
            hash='xyz',
        )
        event = self.create_event(group=self.create_group())

        GroupHash.record_last_processed_event_id(
            group.project_id,
            [hash.id],
            event.event_id,
        )

        result = serialize(hash, user=user)
        assert result['latestEvent'] is None
Example #7
    def _save_aggregate(self, event, hashes, release, **kwargs):
        project = event.project

        # attempt to find a matching hash
        all_hashes = self._find_hashes(project, hashes)

        existing_group_id = None
        for h in all_hashes:
            if h.group_id is not None:
                existing_group_id = h.group_id
                break
            if h.group_tombstone_id is not None:
                raise HashDiscarded('Matches group tombstone %s' % h.group_tombstone_id)

        # XXX(dcramer): this has the opportunity to create duplicate groups
        # it should be resolved by the hash merging function later but this
        # should be better tested/reviewed
        if existing_group_id is None:
            kwargs['score'] = ScoreClause.calculate(1, kwargs['last_seen'])
            # it's possible the release was deleted between
            # when we queried for the release and now, so
            # make sure it still exists
            first_release = kwargs.pop('first_release', None)

            with transaction.atomic():
                short_id = project.next_short_id()
                group, group_is_new = Group.objects.create(
                    project=project,
                    short_id=short_id,
                    first_release_id=Release.objects.filter(
                        id=first_release.id,
                    ).values_list('id', flat=True).first() if first_release else None,
                    **kwargs
                ), True

            metrics.incr(
                'group.created',
                skip_internal=True,
                tags={'platform': event.platform or 'unknown'}
            )

        else:
            group = Group.objects.get(id=existing_group_id)

            group_is_new = False

        # Keep a set of all of the hashes that are relevant for this event and
        # belong to the destination group so that we can record this as the
        # last processed event for each. (We can't just update every
        # ``GroupHash`` instance, since we only want to record this for events
        # that not only include the hash but were also placed into the
        # associated group.)
        relevant_group_hashes = set(
            [instance for instance in all_hashes if instance.group_id == group.id]
        )

        # If all hashes are brand new we treat this event as new
        is_new = False
        new_hashes = [h for h in all_hashes if h.group_id is None]
        if new_hashes:
            # XXX: There is a race condition here wherein another process could
            # create a new group that is associated with one of the new hashes,
            # add some event(s) to it, and then subsequently have the hash
            # "stolen" by this process. This then "orphans" those events from
            # their "siblings" in the group we've created here. We don't have a
            # way to fix this, since we can't call `_ensure_hashes_merged`
            # without filtering on `group_id` (which we can't do due to query
            # planner weirdness.) For more context, see 84c6f75a and d0e22787,
            # as well as GH-5085.
            GroupHash.objects.filter(
                id__in=[h.id for h in new_hashes],
            ).exclude(
                state=GroupHash.State.LOCKED_IN_MIGRATION,
            ).update(group=group)

            if group_is_new and len(new_hashes) == len(all_hashes):
                is_new = True

            # XXX: This can lead to invalid results due to a race condition and
            # lack of referential integrity enforcement, see above comment(s)
            # about "hash stealing".
            relevant_group_hashes.update(new_hashes)

        # XXX(dcramer): it's important this gets called **before** the aggregate
        # is processed as otherwise values like last_seen will get mutated
        can_sample = (
            features.has('projects:sample-events', project=project) and should_sample(
                event.data.get('received') or float(event.datetime.strftime('%s')),
                group.data.get('last_received') or float(group.last_seen.strftime('%s')),
                group.times_seen,
            )
        )

        if not is_new:
            is_regression = self._process_existing_aggregate(
                group=group,
                event=event,
                data=kwargs,
                release=release,
            )
        else:
            is_regression = False

        # Determine if we've sampled enough data to store this event
        if is_new or is_regression:
            is_sample = False
        else:
            is_sample = can_sample

        if not is_sample:
            GroupHash.record_last_processed_event_id(
                project.id,
                [h.id for h in relevant_group_hashes],
                event.event_id,
            )

        return group, is_new, is_regression, is_sample
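The storage decision at the end of this example reduces to a small rule: new groups and regressions are always stored in full, otherwise the feature-gated sampling result applies, and the last-processed event id is only recorded for events that were stored in full. A self-contained restatement of that rule, with a hypothetical helper name that is not part of the original code:

def should_store_full_event(is_new, is_regression, can_sample):
    # New groups and regressions are never sampled away; otherwise the
    # sampling decision computed before the aggregate was mutated applies.
    # Equivalent to ``not is_sample`` in _save_aggregate above.
    return is_new or is_regression or not can_sample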
Example #8
    def _save_aggregate(self, event, hashes, release, **kwargs):
        project = event.project

        # attempt to find a matching hash
        all_hashes = self._find_hashes(project, hashes)

        existing_group_id = None
        for h in all_hashes:
            if h.group_id is not None:
                existing_group_id = h.group_id
                break
            if h.group_tombstone_id is not None:
                raise HashDiscarded('Matches group tombstone %s' %
                                    h.group_tombstone_id)

        # XXX(dcramer): this has the opportunity to create duplicate groups
        # it should be resolved by the hash merging function later but this
        # should be better tested/reviewed
        if existing_group_id is None:
            kwargs['score'] = ScoreClause.calculate(1, kwargs['last_seen'])
            # it's possible the release was deleted between
            # when we queried for the release and now, so
            # make sure it still exists
            first_release = kwargs.pop('first_release', None)

            with transaction.atomic():
                short_id = project.next_short_id()
                group, group_is_new = Group.objects.create(
                    project=project,
                    short_id=short_id,
                    first_release_id=Release.objects.filter(
                        id=first_release.id,
                    ).values_list('id', flat=True).first() if first_release else None,
                    **kwargs
                ), True

            metrics.incr('group.created',
                         skip_internal=True,
                         tags={'platform': event.platform or 'unknown'})

        else:
            group = Group.objects.get(id=existing_group_id)

            group_is_new = False

        # If all hashes are brand new we treat this event as new
        is_new = False
        new_hashes = [h for h in all_hashes if h.group_id is None]
        if new_hashes:
            # XXX: There is a race condition here wherein another process could
            # create a new group that is associated with one of the new hashes,
            # add some event(s) to it, and then subsequently have the hash
            # "stolen" by this process. This then "orphans" those events from
            # their "siblings" in the group we've created here. We don't have a
            # way to fix this, since we can't call `_ensure_hashes_merged`
            # without filtering on `group_id` (which we can't do due to query
            # planner weirdness.) For more context, see 84c6f75a and d0e22787,
            # as well as GH-5085.
            GroupHash.objects.filter(
                id__in=[h.id for h in new_hashes],
            ).exclude(
                state=GroupHash.State.LOCKED_IN_MIGRATION,
            ).update(group=group)

            if group_is_new and len(new_hashes) == len(all_hashes):
                is_new = True

        # XXX(dcramer): it's important this gets called **before** the aggregate
        # is processed as otherwise values like last_seen will get mutated
        can_sample = (
            features.has('projects:sample-events', project=project) and should_sample(
                event.data.get('received') or float(event.datetime.strftime('%s')),
                group.data.get('last_received') or float(group.last_seen.strftime('%s')),
                group.times_seen,
            )
        )

        if not is_new:
            is_regression = self._process_existing_aggregate(
                group=group,
                event=event,
                data=kwargs,
                release=release,
            )
        else:
            is_regression = False

        # Determine if we've sampled enough data to store this event
        if is_new or is_regression:
            is_sample = False
        else:
            is_sample = can_sample

        if not is_sample:
            GroupHash.record_last_processed_event_id(
                all_hashes[0].id,
                event.event_id,
            )

        return group, is_new, is_regression, is_sample
Example #9
    def _save_aggregate(self, event, hashes, release, **kwargs):
        project = event.project

        # attempt to find a matching hash
        all_hashes = self._find_hashes(project, hashes)

        try:
            existing_group_id = six.next(h.group_id for h in all_hashes
                                         if h.group_id is not None)
        except StopIteration:
            existing_group_id = None

        # XXX(dcramer): this has the opportunity to create duplicate groups
        # it should be resolved by the hash merging function later but this
        # should be better tested/reviewed
        if existing_group_id is None:
            kwargs['score'] = ScoreClause.calculate(1, kwargs['last_seen'])
            with transaction.atomic():
                short_id = project.next_short_id()
                group, group_is_new = Group.objects.create(
                    project=project,
                    short_id=short_id,
                    **kwargs
                ), True
        else:
            group = Group.objects.get(id=existing_group_id)

            group_is_new = False

        # Keep a set of all of the hashes that are relevant for this event and
        # belong to the destination group so that we can record this as the
        # last processed event for each. (We can't just update every
        # ``GroupHash`` instance, since we only want to record this for events
        # that not only include the hash but were also placed into the
        # associated group.)
        relevant_group_hashes = set([
            instance for instance in all_hashes
            if instance.group_id == group.id
        ])

        # If all hashes are brand new we treat this event as new
        is_new = False
        new_hashes = [h for h in all_hashes if h.group_id is None]
        if new_hashes:
            # XXX: There is a race condition here wherein another process could
            # create a new group that is associated with one of the new hashes,
            # add some event(s) to it, and then subsequently have the hash
            # "stolen" by this process. This then "orphans" those events from
            # their "siblings" in the group we've created here. We don't have a
            # way to fix this, since we can't call `_ensure_hashes_merged`
            # without filtering on `group_id` (which we can't do due to query
            # planner weirdness.) For more context, see 84c6f75a and d0e22787,
            # as well as GH-5085.
            GroupHash.objects.filter(
                id__in=[h.id for h in new_hashes],
            ).update(group=group)

            if group_is_new and len(new_hashes) == len(all_hashes):
                is_new = True

            # XXX: This can lead to invalid results due to a race condition and
            # lack of referential integrity enforcement, see above comment(s)
            # about "hash stealing".
            relevant_group_hashes.update(new_hashes)

        # XXX(dcramer): it's important this gets called **before** the aggregate
        # is processed as otherwise values like last_seen will get mutated
        can_sample = (
            features.has('projects:sample-events', project=project) and should_sample(
                event.data.get('received') or float(event.datetime.strftime('%s')),
                group.data.get('last_received') or float(group.last_seen.strftime('%s')),
                group.times_seen,
            )
        )

        if not is_new:
            is_regression = self._process_existing_aggregate(
                group=group,
                event=event,
                data=kwargs,
                release=release,
            )
        else:
            is_regression = False

        # Determine if we've sampled enough data to store this event
        if is_new or is_regression:
            is_sample = False
        else:
            is_sample = can_sample

        if not is_sample:
            GroupHash.record_last_processed_event_id(
                project.id,
                [h.id for h in relevant_group_hashes],
                event.event_id,
            )

        return group, is_new, is_regression, is_sample