Code example #1
File: test_unmerge.py  Project: rmr1154/sentry
    def test_unmerge(self):
        now = before_now(minutes=5).replace(microsecond=0, tzinfo=pytz.utc)

        def time_from_now(offset=0):
            return now + timedelta(seconds=offset)

        project = self.create_project()

        sequence = itertools.count(0)
        tag_values = itertools.cycle(["red", "green", "blue"])
        user_values = itertools.cycle([{"id": 1}, {"id": 2}])

        def create_message_event(template, parameters, environment, release, fingerprint="group1"):
            i = next(sequence)

            event_id = uuid.UUID(fields=(i, 0x0, 0x1000, 0x80, 0x80, 0x808080808080)).hex

            tags = [["color", next(tag_values)]]

            if release:
                tags.append(["sentry:release", release])

            event = self.store_event(
                data={
                    "event_id": event_id,
                    "message": template % parameters,
                    "type": "default",
                    "user": next(user_values),
                    "tags": tags,
                    "fingerprint": [fingerprint],
                    "timestamp": iso_format(now + timedelta(seconds=i)),
                    "environment": environment,
                    "release": release,
                },
                project_id=project.id,
            )

            UserReport.objects.create(
                project_id=project.id,
                group_id=event.group.id,
                event_id=event_id,
                name="Log Hat",
                email="*****@*****.**",
                comments="Quack",
            )

            features.record([event])

            return event

        events = OrderedDict()

        for event in (
            create_message_event(
                "This is message #%s.", i, environment="production", release="version"
            )
            for i in xrange(10)
        ):
            events.setdefault(get_fingerprint(event), []).append(event)

        for event in (
            create_message_event(
                "This is message #%s!",
                i,
                environment="production",
                release="version2",
                fingerprint="group2",
            )
            for i in xrange(10, 16)
        ):
            events.setdefault(get_fingerprint(event), []).append(event)

        event = create_message_event(
            "This is message #%s!",
            17,
            environment="staging",
            release="version3",
            fingerprint="group3",
        )

        events.setdefault(get_fingerprint(event), []).append(event)

        merge_source, source, destination = list(Group.objects.all())

        assert len(events) == 3
        assert sum(map(len, events.values())) == 17

        production_environment = Environment.objects.get(
            organization_id=project.organization_id, name="production"
        )

        with self.tasks():
            eventstream_state = eventstream.start_merge(project.id, [merge_source.id], source.id)
            merge_groups.delay([merge_source.id], source.id)
            eventstream.end_merge(eventstream_state)

        assert set(
            [
                (gtv.value, gtv.times_seen)
                for gtv in tagstore.get_group_tag_values(
                    project.id, source.id, production_environment.id, "color"
                )
            ]
        ) == set([("red", 6), ("green", 5), ("blue", 5)])

        similar_items = features.compare(source)
        assert len(similar_items) == 2
        assert similar_items[0][0] == source.id
        assert similar_items[0][1]["message:message:character-shingles"] == 1.0
        assert similar_items[1][0] == destination.id
        assert similar_items[1][1]["message:message:character-shingles"] < 1.0

        with self.tasks():
            eventstream_state = eventstream.start_unmerge(
                project.id, [list(events.keys())[0]], source.id, destination.id
            )
            unmerge.delay(
                project.id, source.id, destination.id, [list(events.keys())[0]], None, batch_size=5
            )
            eventstream.end_unmerge(eventstream_state)

        assert (
            list(
                Group.objects.filter(id=merge_source.id).values_list(
                    "times_seen", "first_seen", "last_seen"
                )
            )
            == []
        )

        assert list(
            Group.objects.filter(id=source.id).values_list("times_seen", "first_seen", "last_seen")
        ) == [(6, time_from_now(10), time_from_now(15))]

        assert list(
            Group.objects.filter(id=destination.id).values_list(
                "times_seen", "first_seen", "last_seen"
            )
        ) == [(11, time_from_now(0), time_from_now(16))]

        assert source.id != destination.id
        assert source.project == destination.project

        destination_event_ids = map(lambda event: event.event_id, list(events.values())[1])

        assert set(
            UserReport.objects.filter(group_id=source.id).values_list("event_id", flat=True)
        ) == set(destination_event_ids)

        assert set(
            GroupHash.objects.filter(group_id=source.id).values_list("hash", flat=True)
        ) == set(itertools.islice(events.keys(), 2))

        assert set(
            GroupRelease.objects.filter(group_id=source.id).values_list(
                "environment", "first_seen", "last_seen"
            )
        ) == set([(u"production", time_from_now(10), time_from_now(15))])

        assert set(
            [
                (gtv.value, gtv.times_seen)
                for gtv in tagstore.get_group_tag_values(
                    project.id, destination.id, production_environment.id, "color"
                )
            ]
        ) == set([(u"red", 4), (u"green", 3), (u"blue", 3)])

        destination_event_ids = map(
            lambda event: event.event_id, list(events.values())[0] + list(events.values())[2]
        )

        assert set(
            UserReport.objects.filter(group_id=destination.id).values_list("event_id", flat=True)
        ) == set(destination_event_ids)

        assert set(
            GroupHash.objects.filter(group_id=destination.id).values_list("hash", flat=True)
        ) == set(itertools.islice(events.keys(), 2, 3))

        assert set(
            GroupRelease.objects.filter(group_id=destination.id).values_list(
                "environment", "first_seen", "last_seen"
            )
        ) == set(
            [
                ("production", time_from_now(0), time_from_now(9)),
                ("staging", time_from_now(16), time_from_now(16)),
            ]
        )

        assert set(
            [
                (gtk.value, gtk.times_seen)
                for gtk in tagstore.get_group_tag_values(
                    project.id, destination.id, production_environment.id, "color"
                )
            ]
        ) == set([("red", 4), ("blue", 3), ("green", 3)])

        rollup_duration = 3600

        time_series = tsdb.get_range(
            tsdb.models.group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )

        environment_time_series = tsdb.get_range(
            tsdb.models.group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
            environment_ids=[production_environment.id],
        )

        def get_expected_series_values(rollup, events, function=None):
            if function is None:

                def function(aggregate, event):
                    return (aggregate if aggregate is not None else 0) + 1

            expected = {}
            for event in events:
                k = float((to_timestamp(event.datetime) // rollup_duration) * rollup_duration)
                expected[k] = function(expected.get(k), event)

            return expected

        def assert_series_contains(expected, actual, default=0):
            actual = dict(actual)

            for key, value in expected.items():
                assert actual.get(key, 0) == value

            for key in set(actual.keys()) - set(expected.keys()):
                assert actual.get(key, 0) == default

        assert_series_contains(
            get_expected_series_values(rollup_duration, list(events.values())[1]),
            time_series[source.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, list(events.values())[0] + list(events.values())[2]
            ),
            time_series[destination.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(rollup_duration, list(events.values())[1]),
            environment_time_series[source.id],
            0,
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, list(events.values())[0][:-1] + list(events.values())[2]
            ),
            environment_time_series[destination.id],
            0,
        )

        time_series = tsdb.get_distinct_counts_series(
            tsdb.models.users_affected_by_group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
        )

        environment_time_series = tsdb.get_distinct_counts_series(
            tsdb.models.users_affected_by_group,
            [source.id, destination.id],
            now - timedelta(seconds=rollup_duration),
            time_from_now(17),
            rollup_duration,
            environment_id=production_environment.id,
        )

        def collect_by_user_tag(aggregate, event):
            aggregate = aggregate if aggregate is not None else set()
            aggregate.add(get_event_user_from_interface(event.data["user"]).tag_value)
            return aggregate

        for series in [time_series, environment_time_series]:
            assert_series_contains(
                {
                    timestamp: len(values)
                    for timestamp, values in get_expected_series_values(
                        rollup_duration, list(events.values())[1], collect_by_user_tag
                    ).items()
                },
                series[source.id],
            )

            assert_series_contains(
                {
                    timestamp: len(values)
                    for timestamp, values in get_expected_series_values(
                        rollup_duration,
                        list(events.values())[0] + list(events.values())[2],
                        collect_by_user_tag,
                    ).items()
                },
                time_series[destination.id],
            )

        def strip_zeroes(data):
            for group_id, series in data.items():
                for _, values in series:
                    for key, val in list(values.items()):
                        if val == 0:
                            values.pop(key)

            return data

        def collect_by_release(group, aggregate, event):
            aggregate = aggregate if aggregate is not None else {}
            release = event.get_tag("sentry:release")
            if not release:
                return aggregate
            release = GroupRelease.objects.get(
                group_id=group.id,
                environment=event.data["environment"],
                release_id=Release.objects.get(
                    organization_id=project.organization_id, version=release
                ).id,
            ).id
            aggregate[release] = aggregate.get(release, 0) + 1
            return aggregate

        items = {}
        for i in [source.id, destination.id]:
            items[i] = list(GroupRelease.objects.filter(group_id=i).values_list("id", flat=True))

        time_series = strip_zeroes(
            tsdb.get_frequency_series(
                tsdb.models.frequent_releases_by_group,
                items,
                now - timedelta(seconds=rollup_duration),
                time_from_now(17),
                rollup_duration,
            )
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration,
                list(events.values())[1],
                functools.partial(collect_by_release, source),
            ),
            time_series[source.id],
            {},
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration,
                list(events.values())[0] + list(events.values())[2],
                functools.partial(collect_by_release, destination),
            ),
            time_series[destination.id],
            {},
        )

        items = {}
        for i in [source.id, destination.id]:
            items[i] = list(Environment.objects.all().values_list("id", flat=True))

        time_series = strip_zeroes(
            tsdb.get_frequency_series(
                tsdb.models.frequent_environments_by_group,
                items,
                now - timedelta(seconds=rollup_duration),
                time_from_now(17),
                rollup_duration,
            )
        )

        def collect_by_environment(aggregate, event):
            aggregate = aggregate if aggregate is not None else {}
            environment = Environment.objects.get(
                organization_id=project.organization_id, name=event.data["environment"]
            ).id
            aggregate[environment] = aggregate.get(environment, 0) + 1
            return aggregate

        assert_series_contains(
            get_expected_series_values(
                rollup_duration, list(events.values())[1], collect_by_environment
            ),
            time_series[source.id],
            {},
        )

        assert_series_contains(
            get_expected_series_values(
                rollup_duration,
                list(events.values())[0] + list(events.values())[2],
                collect_by_environment,
            ),
            time_series[destination.id],
            {},
        )

        source_similar_items = features.compare(source)
        assert source_similar_items[0] == (
            source.id,
            {
                "exception:message:character-shingles": None,
                "exception:stacktrace:application-chunks": None,
                "exception:stacktrace:pairs": None,
                "message:message:character-shingles": 1.0,
            },
        )
        assert source_similar_items[1][0] == destination.id
        assert source_similar_items[1][1]["message:message:character-shingles"] < 1.0

        destination_similar_items = features.compare(destination)
        assert destination_similar_items[0] == (
            destination.id,
            {
                "exception:message:character-shingles": None,
                "exception:stacktrace:application-chunks": None,
                "exception:stacktrace:pairs": None,
                "message:message:character-shingles": 1.0,
            },
        )
        assert destination_similar_items[1][0] == source.id
        assert destination_similar_items[1][1]["message:message:character-shingles"] < 1.0
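
The expected tsdb series in the assertions above are built by flooring each event timestamp to the start of its rollup window. Below is a minimal, standard-library-only sketch of that bucketing; `to_timestamp` from the test is replaced with `datetime.timestamp()`, which is an assumption made purely for illustration.

from datetime import datetime, timedelta, timezone

rollup_duration = 3600  # one-hour buckets, as in the test

def bucket(dt, rollup):
    # Floor the epoch timestamp to the start of its rollup window, mirroring
    # ``(to_timestamp(event.datetime) // rollup_duration) * rollup_duration``.
    return float((dt.timestamp() // rollup) * rollup)

now = datetime(2024, 1, 1, 12, 34, 56, tzinfo=timezone.utc)
counts = {}
for offset in range(10):
    key = bucket(now + timedelta(seconds=offset), rollup_duration)
    counts[key] = counts.get(key, 0) + 1

# Ten events a few seconds apart all land in the same hourly bucket.
assert counts == {bucket(now, rollup_duration): 10}
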
Code example #2
File: unmerge.py  Project: zeuskingzb/sentry
def unmerge(
    project_id,
    source_id,
    destination_id,
    fingerprints,
    actor_id,
    cursor=None,
    batch_size=500,
    source_fields_reset=False,
    eventstream_state=None,
):
    # XXX: The queryset chunking logic below is awfully similar to
    # ``RangeQuerySetWrapper``. Ideally that could be refactored to be able to
    # be run without iteration by passing around a state object and we could
    # just use that here instead.

    source = Group.objects.get(project_id=project_id, id=source_id)

    # On the first iteration of this loop, we clear out all of the
    # denormalizations from the source group so that we can have a clean slate
    # for the new, repaired data.
    if cursor is None:
        fingerprints = lock_hashes(project_id, source_id, fingerprints)
        truncate_denormalizations(source)

    caches = get_caches()

    project = caches["Project"](project_id)

    # We fetch the events in descending order by their primary key to get the
    # best approximation of the most recently received events.
    queryset = Event.objects.filter(project_id=project_id,
                                    group_id=source_id).order_by("-id")

    if cursor is not None:
        queryset = queryset.filter(id__lt=cursor)

    events = list(queryset[:batch_size])

    # If there are no more events to process, we're done with the migration.
    if not events:
        tagstore.update_group_tag_key_values_seen(project_id,
                                                  [source_id, destination_id])
        unlock_hashes(project_id, fingerprints)

        logger.warning("Unmerge complete (eventstream state: %s)",
                       eventstream_state)
        if eventstream_state:
            eventstream.end_unmerge(eventstream_state)

        return destination_id

    Event.objects.bind_nodes(events, "data")

    source_events = []
    destination_events = []

    for event in events:
        (destination_events if get_fingerprint(event) in fingerprints else
         source_events).append(event)

    if source_events:
        if not source_fields_reset:
            source.update(
                **get_group_creation_attributes(caches, source_events))
            source_fields_reset = True
        else:
            source.update(
                **get_group_backfill_attributes(caches, source, source_events))

    (destination_id, eventstream_state) = migrate_events(
        caches,
        project,
        source_id,
        destination_id,
        fingerprints,
        destination_events,
        actor_id,
        eventstream_state,
    )

    repair_denormalizations(caches, project, events)

    unmerge.delay(
        project_id,
        source_id,
        destination_id,
        fingerprints,
        actor_id,
        cursor=events[-1].id,
        batch_size=batch_size,
        source_fields_reset=source_fields_reset,
        eventstream_state=eventstream_state,
    )
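
This version pages through events with a descending primary-key cursor: each invocation filters `id__lt=cursor`, takes `batch_size` rows, and re-enqueues itself with the last id seen. Below is a framework-free sketch of that chunking pattern; `fetch_batch`, `process_all`, and the dict rows are stand-ins for illustration, not Sentry APIs.

def fetch_batch(rows, cursor, batch_size):
    # ``rows`` is assumed to already be sorted descending by id,
    # like ``Event.objects.order_by("-id")``.
    if cursor is not None:
        rows = [row for row in rows if row["id"] < cursor]
    return rows[:batch_size]

def process_all(rows, batch_size=500):
    cursor = None
    while True:
        batch = fetch_batch(rows, cursor, batch_size)
        if not batch:
            return  # mirrors the "no more events" early return in the task
        # ... migrate the batch and repair denormalizations here ...
        cursor = batch[-1]["id"]  # mirrors ``cursor=events[-1].id``

process_all([{"id": i} for i in range(1000, 0, -1)], batch_size=100)
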
Code example #3
def unmerge(
    project_id,
    source_id,
    destination_id,
    fingerprints,
    actor_id,
    last_event=None,
    batch_size=500,
    source_fields_reset=False,
    eventstream_state=None,
):
    source = Group.objects.get(project_id=project_id, id=source_id)

    caches = get_caches()

    project = caches["Project"](project_id)

    # On the first iteration of this loop, we clear out all of the
    # denormalizations from the source group so that we can have a clean slate
    # for the new, repaired data.
    if last_event is None:
        fingerprints = lock_hashes(project_id, source_id, fingerprints)
        truncate_denormalizations(project, source)

    last_event, events = celery_run_batch_query(
        filter=eventstore.Filter(project_ids=[project_id],
                                 group_ids=[source.id]),
        batch_size=batch_size,
        state=last_event,
        referrer="unmerge",
    )

    # If there are no more events to process, we're done with the migration.
    if not events:
        unlock_hashes(project_id, fingerprints)
        logger.warning("Unmerge complete (eventstream state: %s)",
                       eventstream_state)
        if eventstream_state:
            eventstream.end_unmerge(eventstream_state)

        return destination_id

    source_events = []
    destination_events = []

    for event in events:
        (destination_events if get_fingerprint(event) in fingerprints else
         source_events).append(event)

    if source_events:
        if not source_fields_reset:
            source.update(
                **get_group_creation_attributes(caches, source_events))
            source_fields_reset = True
        else:
            source.update(
                **get_group_backfill_attributes(caches, source, source_events))

    (destination_id, eventstream_state) = migrate_events(
        caches,
        project,
        source_id,
        destination_id,
        fingerprints,
        destination_events,
        actor_id,
        eventstream_state,
    )

    repair_denormalizations(caches, project, events)

    unmerge.delay(
        project_id,
        source_id,
        destination_id,
        fingerprints,
        actor_id,
        last_event=last_event,
        batch_size=batch_size,
        source_fields_reset=source_fields_reset,
        eventstream_state=eventstream_state,
    )
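
This variant delegates batching to `celery_run_batch_query` and threads the returned state through `last_event`, but the control flow is unchanged: process one batch per task invocation, then call `.delay` with the updated state. Below is a hedged, generic sketch of that self-re-enqueueing Celery pattern; `app`, `DATA`, `load_batch`, and `migrate` are illustrative names, not Sentry code.

from celery import Celery

app = Celery("example", broker="memory://")

DATA = list(range(17))  # stand-in for the events being moved

def load_batch(cursor, batch_size):
    # Stand-in for celery_run_batch_query: return one slice plus the next cursor.
    start = cursor or 0
    batch = DATA[start:start + batch_size]
    return batch, start + len(batch)

@app.task
def migrate(cursor=None, batch_size=5):
    batch, next_cursor = load_batch(cursor, batch_size)
    if not batch:
        return  # done: the real task unlocks hashes and ends the unmerge here
    # ... move the batch to its destination group ...
    migrate.delay(cursor=next_cursor, batch_size=batch_size)
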
Code example #4
def unmerge(
    project_id,
    source_id,
    destination_id,
    fingerprints,
    actor_id,
    last_event=None,
    batch_size=500,
    source_fields_reset=False,
    eventstream_state=None,
):
    # XXX: The queryset chunking logic below is awfully similar to
    # ``RangeQuerySetWrapper``. Ideally that could be refactored to be able to
    # be run without iteration by passing around a state object and we could
    # just use that here instead.

    source = Group.objects.get(project_id=project_id, id=source_id)

    # On the first iteration of this loop, we clear out all of the
    # denormalizations from the source group so that we can have a clean slate
    # for the new, repaired data.
    if last_event is None:
        fingerprints = lock_hashes(project_id, source_id, fingerprints)
        truncate_denormalizations(source)

    caches = get_caches()

    project = caches["Project"](project_id)

    # We process events sorted in descending order by -timestamp, -event_id. We need
    # to include event_id as well as timestamp in the ordering criteria since:
    #
    # - Event timestamps are rounded to the second so multiple events are likely
    # to have the same timestamp.
    #
    # - When sorting by timestamp alone, Snuba may not give us a deterministic
    # order for events with the same timestamp.
    #
    # - We need to ensure that we do not skip any events between batches. If we
    # only sorted by timestamp < last_event.timestamp it would be possible to
    # have missed an event with the same timestamp as the last item in the
    # previous batch.

    conditions = []
    if last_event is not None:
        conditions.extend(
            [
                ["timestamp", "<=", last_event["timestamp"]],
                [
                    ["timestamp", "<", last_event["timestamp"]],
                    ["event_id", "<", last_event["event_id"]],
                ],
            ]
        )

    events = eventstore.get_events(
        filter_keys={"project_id": [project_id], "issue": [source.id]},
        # We need the text-only "search message" from Snuba, not the raw message
        # dict field from nodestore.
        additional_columns=[eventstore.Columns.MESSAGE],
        conditions=conditions,
        limit=batch_size,
        referrer="unmerge",
        orderby=["-timestamp", "-event_id"],
    )

    # If there are no more events to process, we're done with the migration.
    if not events:
        tagstore.update_group_tag_key_values_seen(project_id, [source_id, destination_id])
        unlock_hashes(project_id, fingerprints)
        logger.warning("Unmerge complete (eventstream state: %s)", eventstream_state)
        if eventstream_state:
            eventstream.end_unmerge(eventstream_state)

        return destination_id

    Event.objects.bind_nodes(events, "data")

    source_events = []
    destination_events = []

    for event in events:
        (destination_events if get_fingerprint(event) in fingerprints else source_events).append(
            event
        )

    if source_events:
        if not source_fields_reset:
            source.update(**get_group_creation_attributes(caches, source_events))
            source_fields_reset = True
        else:
            source.update(**get_group_backfill_attributes(caches, source, source_events))

    (destination_id, eventstream_state) = migrate_events(
        caches,
        project,
        source_id,
        destination_id,
        fingerprints,
        destination_events,
        actor_id,
        eventstream_state,
    )

    repair_denormalizations(caches, project, events)

    unmerge.delay(
        project_id,
        source_id,
        destination_id,
        fingerprints,
        actor_id,
        last_event={"timestamp": events[-1].timestamp, "event_id": events[-1].event_id},
        batch_size=batch_size,
        source_fields_reset=source_fields_reset,
        eventstream_state=eventstream_state,
    )
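
The Snuba conditions in this version implement keyset pagination over the pair (timestamp, event_id): `timestamp <= last` combined with `timestamp < last OR event_id < last` is equivalent to a descending tuple comparison, which is what keeps batches deterministic when many events share a timestamp. A pure-Python illustration follows; the event dicts are made up for the example.

def before_cursor(event, last_event):
    # Equivalent to: timestamp <= last AND (timestamp < last OR event_id < last)
    return (event["timestamp"], event["event_id"]) < (
        last_event["timestamp"],
        last_event["event_id"],
    )

events = [
    {"timestamp": 100, "event_id": "c"},
    {"timestamp": 100, "event_id": "b"},
    {"timestamp": 100, "event_id": "a"},
    {"timestamp": 99, "event_id": "z"},
]
last_event = {"timestamp": 100, "event_id": "b"}

# Only events strictly after the cursor in (-timestamp, -event_id) order remain,
# so nothing from the previous batch is repeated and nothing is skipped.
next_batch = [e for e in events if before_cursor(e, last_event)]
assert next_batch == [{"timestamp": 100, "event_id": "a"}, {"timestamp": 99, "event_id": "z"}]
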
Code example #5
File: unmerge.py  Project: Kayle009/sentry
def unmerge(
    project_id,
    source_id,
    destination_id,
    fingerprints,
    actor_id,
    cursor=None,
    batch_size=500,
    source_fields_reset=False,
    eventstream_state=None,
):
    # XXX: The queryset chunking logic below is awfully similar to
    # ``RangeQuerySetWrapper``. Ideally that could be refactored to be able to
    # be run without iteration by passing around a state object and we could
    # just use that here instead.

    source = Group.objects.get(
        project_id=project_id,
        id=source_id,
    )

    # On the first iteration of this loop, we clear out all of the
    # denormalizations from the source group so that we can have a clean slate
    # for the new, repaired data.
    if cursor is None:
        fingerprints = lock_hashes(project_id, source_id, fingerprints)
        truncate_denormalizations(source)

    caches = get_caches()

    project = caches['Project'](project_id)

    # We fetch the events in descending order by their primary key to get the
    # best approximation of the most recently received events.
    queryset = Event.objects.filter(
        project_id=project_id,
        group_id=source_id,
    ).order_by('-id')

    if cursor is not None:
        queryset = queryset.filter(id__lt=cursor)

    events = list(queryset[:batch_size])

    # If there are no more events to process, we're done with the migration.
    if not events:
        tagstore.update_group_tag_key_values_seen(project_id, [source_id, destination_id])
        unlock_hashes(project_id, fingerprints)

        logger.warning('Unmerge complete (eventstream state: %s)', eventstream_state)
        if eventstream_state:
            eventstream.end_unmerge(eventstream_state)

        return destination_id

    Event.objects.bind_nodes(events, 'data')

    source_events = []
    destination_events = []

    for event in events:
        (destination_events
         if get_fingerprint(event) in fingerprints else source_events).append(event)

    if source_events:
        if not source_fields_reset:
            source.update(**get_group_creation_attributes(
                caches,
                source_events,
            ))
            source_fields_reset = True
        else:
            source.update(**get_group_backfill_attributes(
                caches,
                source,
                source_events,
            ))

    (destination_id, eventstream_state) = migrate_events(
        caches,
        project,
        source_id,
        destination_id,
        fingerprints,
        destination_events,
        actor_id,
        eventstream_state,
    )

    repair_denormalizations(
        caches,
        project,
        events,
    )

    unmerge.delay(
        project_id,
        source_id,
        destination_id,
        fingerprints,
        actor_id,
        cursor=events[-1].id,
        batch_size=batch_size,
        source_fields_reset=source_fields_reset,
        eventstream_state=eventstream_state,
    )
Code example #6
    def stop_snuba_replacement(self, eventstream_state: EventstreamState) -> None:
        eventstream.end_unmerge(eventstream_state)