Example #1
    def test_unmerge(self):
        now = before_now(minutes=5).replace(microsecond=0, tzinfo=pytz.utc)

        def time_from_now(offset=0):
            return now + timedelta(seconds=offset)

        project = self.create_project()

        sequence = itertools.count(0)
        tag_values = itertools.cycle(["red", "green", "blue"])
        user_values = itertools.cycle([{"id": 1}, {"id": 2}])

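        # Store a single event with a deterministic UUID-derived event_id,
        # cycling color tags and user ids, attach a UserReport, and record
        # the event for similarity comparisons.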
        def create_message_event(template, parameters, environment, release, fingerprint="group1"):
            i = next(sequence)

            event_id = uuid.UUID(fields=(i, 0x0, 0x1000, 0x80, 0x80, 0x808080808080)).hex

            tags = [["color", next(tag_values)]]

            if release:
                tags.append(["sentry:release", release])

            event = self.store_event(
                data={
                    "event_id": event_id,
                    "message": template % parameters,
                    "type": "default",
                    "user": next(user_values),
                    "tags": tags,
                    "fingerprint": [fingerprint],
                    "timestamp": iso_format(now + timedelta(seconds=i)),
                    "environment": environment,
                    "release": release,
                },
                project_id=project.id,
            )

            UserReport.objects.create(
                project_id=project.id,
                group_id=event.group.id,
                event_id=event_id,
                name="Log Hat",
                email="*****@*****.**",
                comments="Quack",
            )

            features.record([event])

            return event

        events = OrderedDict()

        for event in (
            create_message_event(
                "This is message #%s.", i, environment="production", release="version"
            )
            for i in range(10)
        ):
            events.setdefault(get_fingerprint(event), []).append(event)

        for event in (
            create_message_event(
                "This is message #%s!",
                i,
                environment="production",
                release="version2",
                fingerprint="group2",
            )
            for i in range(10, 16)
        ):
            events.setdefault(get_fingerprint(event), []).append(event)

        event = create_message_event(
            "This is message #%s!",
            17,
            environment="staging",
            release="version3",
            fingerprint="group3",
        )

        events.setdefault(get_fingerprint(event), []).append(event)

        merge_source, source, destination = list(Group.objects.all())

        assert len(events) == 3
        assert sum(map(len, events.values())) == 17

        production_environment = Environment.objects.get(
            organization_id=project.organization_id, name="production"
        )

        with self.tasks():
            eventstream_state = eventstream.start_merge(project.id, [merge_source.id], source.id)
            merge_groups.delay([merge_source.id], source.id)
            eventstream.end_merge(eventstream_state)

        assert set(
            [
                (gtv.value, gtv.times_seen)
                for gtv in tagstore.get_group_tag_values(
                    project.id, source.id, production_environment.id, "color"
                )
            ]
        ) == set([("red", 6), ("green", 5), ("blue", 5)])

        similar_items = features.compare(source)
        assert len(similar_items) == 2
        assert similar_items[0][0] == source.id
        assert similar_items[0][1]["message:message:character-shingles"] == 1.0
        assert similar_items[1][0] == destination.id
        assert similar_items[1][1]["message:message:character-shingles"] < 1.0

        with self.tasks():
            eventstream_state = eventstream.start_unmerge(
                project.id, [list(events.keys())[0]], source.id, destination.id
            )
            unmerge.delay(
                project.id, source.id, destination.id, [list(events.keys())[0]], None, batch_size=5
            )
            eventstream.end_unmerge(eventstream_state)

        assert (
            list(
                Group.objects.filter(id=merge_source.id).values_list(
                    "times_seen", "first_seen", "last_seen"
                )
            )
            == []
        )

        assert list(
            Group.objects.filter(id=source.id).values_list("times_seen", "first_seen", "last_seen")
        ) == [(6, time_from_now(10), time_from_now(15))]

        assert list(
            Group.objects.filter(id=destination.id).values_list(
                "times_seen", "first_seen", "last_seen"
            )
        ) == [(11, time_from_now(0), time_from_now(16))]

        assert source.id != destination.id
        assert source.project == destination.project

        source_event_ids = map(lambda event: event.event_id, list(events.values())[1])

        assert set(
            UserReport.objects.filter(group_id=source.id).values_list("event_id", flat=True)
        ) == set(source_event_ids)

        assert set(
            GroupHash.objects.filter(group_id=source.id).values_list("hash", flat=True)
        ) == set(itertools.islice(events.keys(), 2))

        assert set(
            GroupRelease.objects.filter(group_id=source.id).values_list(
                "environment", "first_seen", "last_seen"
            )
        ) == set([("production", time_from_now(10), time_from_now(15))])

        assert set(
            [
                (gtv.value, gtv.times_seen)
                for gtv in tagstore.get_group_tag_values(
                    project.id, destination.id, production_environment.id, "color"
                )
            ]
        ) == set([("red", 4), ("green", 3), ("blue", 3)])

        destination_event_ids = map(
            lambda event: event.event_id, list(events.values())[0] + list(events.values())[2]
        )

        assert set(
            UserReport.objects.filter(group_id=destination.id).values_list("event_id", flat=True)
        ) == set(destination_event_ids)

        assert set(
            GroupHash.objects.filter(group_id=destination.id).values_list("hash", flat=True)
        ) == set(itertools.islice(events.keys(), 2, 3))

        assert set(
            GroupRelease.objects.filter(group_id=destination.id).values_list(
                "environment", "first_seen", "last_seen"
            )
        ) == set(
            [
                ("production", time_from_now(0), time_from_now(9)),
                ("staging", time_from_now(16), time_from_now(16)),
            ]
        )

        assert set(
            [
                (gtv.value, gtv.times_seen)
                for gtv in tagstore.get_group_tag_values(
                    project.id, destination.id, production_environment.id, "color"
                )
            ]
        ) == set([("red", 4), ("blue", 3), ("green", 3)])

        rollup_duration = 3600

        time_series = tsdb.get_range(
            tsdb.models.group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )

        environment_time_series = tsdb.get_range(
            tsdb.models.group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
            environment_ids=[production_environment.id],
        )

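        # Build the expected TSDB series by flooring each event timestamp to
        # its rollup bucket and folding the bucket's events through
        # ``function`` (a plain count by default).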
        def get_expected_series_values(rollup, events, function=None):
            if function is None:

                def function(aggregate, event):
                    return (aggregate if aggregate is not None else 0) + 1

            expected = {}
            for event in events:
                k = float((to_timestamp(event.datetime) // rollup) * rollup)
                expected[k] = function(expected.get(k), event)

            return expected

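        # Every expected bucket must match exactly; any bucket that appears
        # only in the actual series must hold the default (empty) value.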
        def assert_series_contains(expected, actual, default=0):
            actual = dict(actual)

            for key, value in expected.items():
                assert actual.get(key, 0) == value

            for key in set(actual.keys()) - set(expected.keys()):
                assert actual.get(key, 0) == default

        assert_series_contains(
            get_expected_series_values(rollup_duration, list(events.values())[1]),
            time_series[source.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, list(events.values())[0] + list(events.values())[2]
            ),
            time_series[destination.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(rollup_duration, list(events.values())[1]),
            environment_time_series[source.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, list(events.values())[0][:-1] + list(events.values())[2]
            ),
            environment_time_series[destination.id],
            0,
        )

        time_series = tsdb.get_distinct_counts_series(
            tsdb.models.users_affected_by_group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )

        environment_time_series = tsdb.get_distinct_counts_series(
            tsdb.models.users_affected_by_group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
            environment_id=production_environment.id,
        )

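        # Accumulate the distinct user tag values seen in each bucket, for
        # the distinct-count (users affected) series.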
        def collect_by_user_tag(aggregate, event):
            aggregate = aggregate if aggregate is not None else set()
            aggregate.add(get_event_user_from_interface(event.data["user"]).tag_value)
            return aggregate

        for series in [time_series, environment_time_series]:
            assert_series_contains(
                {
                    timestamp: len(values)
                    for timestamp, values in get_expected_series_values(
                        rollup_duration, list(events.values())[1], collect_by_user_tag
                    ).items()
                },
                series[source.id],
            )

            assert_series_contains(
                {
                    timestamp: len(values)
                    for timestamp, values in get_expected_series_values(
                        rollup_duration,
                        list(events.values())[0] + list(events.values())[2],
                        collect_by_user_tag,
                    ).items()
                },
                series[destination.id],
            )

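        # Frequency series include zero-valued entries; drop them so the
        # series can be compared against the sparse expected dictionaries.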
        def strip_zeroes(data):
            for group_id, series in data.items():
                for _, values in series:
                    for key, val in list(values.items()):
                        if val == 0:
                            values.pop(key)

            return data

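        # Count events per GroupRelease id, resolving the event's
        # ``sentry:release`` tag and environment to the GroupRelease row of
        # the given group.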
        def collect_by_release(group, aggregate, event):
            aggregate = aggregate if aggregate is not None else {}
            release = event.get_tag("sentry:release")
            if not release:
                return aggregate
            release = GroupRelease.objects.get(
                group_id=group.id,
                environment=event.data["environment"],
                release_id=Release.objects.get(
                    organization_id=project.organization_id, version=release
                ).id,
            ).id
            aggregate[release] = aggregate.get(release, 0) + 1
            return aggregate

        items = {}
        for i in [source.id, destination.id]:
            items[i] = list(GroupRelease.objects.filter(group_id=i).values_list("id", flat=True))

        time_series = strip_zeroes(
            tsdb.get_frequency_series(
                tsdb.models.frequent_releases_by_group,
                items,
                now - timedelta(seconds=rollup_duration),
                time_from_now(17),
                rollup_duration,
            )
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration,
                list(events.values())[1],
                functools.partial(collect_by_release, source),
            ),
            time_series[source.id],
            {},
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration,
                list(events.values())[0] + list(events.values())[2],
                functools.partial(collect_by_release, destination),
            ),
            time_series[destination.id],
            {},
        )

        items = {}
        for i in [source.id, destination.id]:
            items[i] = list(Environment.objects.all().values_list("id", flat=True))

        time_series = strip_zeroes(
            tsdb.get_frequency_series(
                tsdb.models.frequent_environments_by_group,
                items,
                now - timedelta(seconds=rollup_duration),
                time_from_now(17),
                rollup_duration,
            )
        )

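        # Count events per Environment id, resolved by name from the event
        # payload.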
        def collect_by_environment(aggregate, event):
            aggregate = aggregate if aggregate is not None else {}
            environment = Environment.objects.get(
                organization_id=project.organization_id, name=event.data["environment"]
            ).id
            aggregate[environment] = aggregate.get(environment, 0) + 1
            return aggregate

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, list(events.values())[1], collect_by_environment
            ),
            time_series[source.id],
            {},
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration,
                list(events.values())[0] + list(events.values())[2],
                collect_by_environment,
            ),
            time_series[destination.id],
            {},
        )

        source_similar_items = features.compare(source)
        assert source_similar_items[0] == (
            source.id,
            {
                "exception:message:character-shingles": None,
                "exception:stacktrace:application-chunks": None,
                "exception:stacktrace:pairs": None,
                "message:message:character-shingles": 1.0,
            },
        )
        assert source_similar_items[1][0] == destination.id
        assert source_similar_items[1][1]["message:message:character-shingles"] < 1.0

        destination_similar_items = features.compare(destination)
        assert destination_similar_items[0] == (
            destination.id,
            {
                "exception:message:character-shingles": None,
                "exception:stacktrace:application-chunks": None,
                "exception:stacktrace:pairs": None,
                "message:message:character-shingles": 1.0,
            },
        )
        assert destination_similar_items[1][0] == source.id
        assert destination_similar_items[1][1]["message:message:character-shingles"] < 1.0
Example #2
def migrate_events(caches, project, source_id, destination_id,
                   fingerprints, events, actor_id, eventstream_state):
    # XXX: This is only actually able to create a destination group and migrate
    # the group hashes if there are events that can be migrated. How do we
    # handle this if there aren't any events? We can't create a group (there
    # isn't any data to derive the aggregates from), so we'd have to mark the
    # hash as in limbo somehow...?)
    if not events:
        return (destination_id, eventstream_state)

    if destination_id is None:
        # XXX: There is a race condition here between the (wall clock) time
        # that the migration is started by the user and when we actually
        # get to this block where the new destination is created and we've
        # moved the ``GroupHash`` so that events start being associated
        # with it. During this gap, there could have been additional events
        # ingested, and if we want to handle this, we'd need to record the
        # highest event ID we've seen at the beginning of the migration,
        # then scan all events greater than that ID and migrate the ones
        # where necessary. (This still isn't even guaranteed to catch all
        # of the events due to processing latency, but it's a better shot.)
        # Create a new destination group.
        destination = Group.objects.create(
            project_id=project.id,
            short_id=project.next_short_id(),
            **get_group_creation_attributes(caches, events)
        )

        destination_id = destination.id

        eventstream_state = eventstream.start_unmerge(
            project.id,
            fingerprints,
            source_id,
            destination_id
        )

        # Move the group hashes to the destination.
        GroupHash.objects.filter(
            project_id=project.id,
            hash__in=fingerprints,
        ).update(group=destination_id)

        # Create activity records for the source and destination group.
        Activity.objects.create(
            project_id=project.id,
            group_id=destination_id,
            type=Activity.UNMERGE_DESTINATION,
            user_id=actor_id,
            data={
                'fingerprints': fingerprints,
                'source_id': source_id,
            },
        )

        Activity.objects.create(
            project_id=project.id,
            group_id=source_id,
            type=Activity.UNMERGE_SOURCE,
            user_id=actor_id,
            data={
                'fingerprints': fingerprints,
                'destination_id': destination_id,
            },
        )
    else:
        # Update the existing destination group.
        destination = Group.objects.get(id=destination_id)
        destination.update(**get_group_backfill_attributes(caches, destination, events))

    event_id_set = set(event.id for event in events)

    Event.objects.filter(
        project_id=project.id,
        id__in=event_id_set,
    ).update(group_id=destination_id)

    for event in events:
        event.group = destination

    tagstore.update_group_for_events(
        project_id=project.id,
        event_ids=event_id_set,
        destination_id=destination_id
    )

    event_event_id_set = set(event.event_id for event in events)

    EventMapping.objects.filter(
        project_id=project.id,
        event_id__in=event_event_id_set,
    ).update(group_id=destination_id)

    UserReport.objects.filter(
        project_id=project.id,
        event_id__in=event_event_id_set,
    ).update(group=destination_id)

    return (destination.id, eventstream_state)
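
The XXX comment above suggests recording the highest event ID seen when the migration starts and re-scanning newer events afterwards. The following is a minimal sketch of that idea, not part of the original function: the wrapper name and the watermark logic are hypothetical, and only the models and the migrate_events signature are taken from the example above.

def migrate_events_with_watermark(caches, project, source_id, destination_id,
                                  fingerprints, events, actor_id, eventstream_state):
    # Hypothetical sketch: capture the newest source-group event id *before*
    # the hashes move, so events ingested during the migration can be found
    # afterwards.
    watermark = (
        Event.objects.filter(project_id=project.id, group_id=source_id)
        .order_by("-id")
        .values_list("id", flat=True)
        .first()
    ) or 0

    destination_id, eventstream_state = migrate_events(
        caches, project, source_id, destination_id,
        fingerprints, events, actor_id, eventstream_state)

    # Follow-up pass over events newer than the watermark that still point at
    # the source group. A real implementation would also need to re-check each
    # event's fingerprint against ``fingerprints`` before moving it; that
    # filter is omitted here, so the race window is only narrowed, not closed.
    stragglers = list(Event.objects.filter(
        project_id=project.id, group_id=source_id, id__gt=watermark))
    if stragglers:
        destination_id, eventstream_state = migrate_events(
            caches, project, source_id, destination_id,
            fingerprints, stragglers, actor_id, eventstream_state)

    return (destination_id, eventstream_state)
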
Example #3
def migrate_events(caches, project, source_id, destination_id, fingerprints,
                   events, actor_id, eventstream_state):
    # XXX: This is only actually able to create a destination group and migrate
    # the group hashes if there are events that can be migrated. How do we
    # handle this if there aren't any events? We can't create a group (there
    # isn't any data to derive the aggregates from), so we'd have to mark the
    # hash as in limbo somehow...?)
    if not events:
        return (destination_id, eventstream_state)

    if destination_id is None:
        # XXX: There is a race condition here between the (wall clock) time
        # that the migration is started by the user and when we actually
        # get to this block where the new destination is created and we've
        # moved the ``GroupHash`` so that events start being associated
        # with it. During this gap, there could have been additional events
        # ingested, and if we want to handle this, we'd need to record the
        # highest event ID we've seen at the beginning of the migration,
        # then scan all events greater than that ID and migrate the ones
        # where necessary. (This still isn't even guaranteed to catch all
        # of the events due to processing latency, but it's a better shot.)
        # Create a new destination group.
        destination = Group.objects.create(project_id=project.id,
                                           short_id=project.next_short_id(),
                                           **get_group_creation_attributes(
                                               caches, events))

        destination_id = destination.id

        eventstream_state = eventstream.start_unmerge(project.id, fingerprints,
                                                      source_id,
                                                      destination_id)

        # Move the group hashes to the destination.
        GroupHash.objects.filter(
            project_id=project.id,
            hash__in=fingerprints).update(group=destination_id)

        # Create activity records for the source and destination group.
        Activity.objects.create(
            project_id=project.id,
            group_id=destination_id,
            type=Activity.UNMERGE_DESTINATION,
            user_id=actor_id,
            data={
                "fingerprints": fingerprints,
                "source_id": source_id
            },
        )

        Activity.objects.create(
            project_id=project.id,
            group_id=source_id,
            type=Activity.UNMERGE_SOURCE,
            user_id=actor_id,
            data={
                "fingerprints": fingerprints,
                "destination_id": destination_id
            },
        )
    else:
        # Update the existing destination group.
        destination = Group.objects.get(id=destination_id)
        destination.update(
            **get_group_backfill_attributes(caches, destination, events))

    event_id_set = set(event.id for event in events)

    Event.objects.filter(project_id=project.id,
                         id__in=event_id_set).update(group_id=destination_id)

    for event in events:
        event.group = destination

    tagstore.update_group_for_events(project_id=project.id,
                                     event_ids=event_id_set,
                                     destination_id=destination_id)

    event_event_id_set = set(event.event_id for event in events)

    UserReport.objects.filter(
        project_id=project.id,
        event_id__in=event_event_id_set).update(group=destination_id)

    return (destination.id, eventstream_state)
Example #4
    def start_snuba_replacement(self, project: Project, source_id: int,
                                destination_id: int) -> EventstreamState:
        return eventstream.start_unmerge(project.id, self.fingerprints,
                                         source_id, destination_id)