Beispiel #1
0
def _make_index(cluster=None):
    """Return the index used for similarity features.

    When ``cluster`` is falsy, the cluster is looked up in
    ``redis.redis_clusters`` under the name given by the
    ``SENTRY_SIMILARITY_INDEX_REDIS_CLUSTER`` setting (``'similarity'`` when
    the setting is absent). If that lookup raises ``KeyError``, a
    ``DummyIndex`` is returned instead and the fallback is logged at INFO
    level.

    Otherwise a ``MinHashIndex`` bound to the cluster and the ``'sim:1'``
    namespace is returned.
    """
    if not cluster:
        cluster_id = getattr(
            settings, 'SENTRY_SIMILARITY_INDEX_REDIS_CLUSTER', 'similarity')

        try:
            cluster = redis.redis_clusters.get(cluster_id)
        except KeyError:
            # No such cluster configured: degrade to the dummy index rather
            # than failing.
            fallback = DummyIndex()
            message = (
                'No redis cluster provided for similarity, using {!r}.'.format(
                    fallback))
            logger.info(message)
            return fallback

    return MinHashIndex(
        cluster, 'sim:1', MinHashSignatureBuilder(16, 0xFFFF),
        8, 60 * 60 * 24 * 30, 3)
Beispiel #2
0
 def index(self):
     """Build a ``MinHashIndex`` in the ``'sim'`` namespace.

     The index is backed by local client 0 of the ``'default'`` Redis
     cluster and uses the module-level ``signature_builder``.
     """
     local_client = redis.clusters.get('default').get_local_client(0)
     return MinHashIndex(
         local_client, 'sim', signature_builder, 16, 60 * 60, 12)
Beispiel #3
0
    def test_basic(self):
        """Exercise record / compare / classify / delete on ``self.index``.

        Five keys are recorded under the ``'example'`` scope; keys '1' and
        '2' share identical contents. The test then checks the ranking
        returned by ``compare`` and ``classify``, that a deleted key no
        longer shows up, and that an index in a different namespace sees
        none of the recorded data.
        """
        recorded = [
            ('1', [('index', 'hello world')]),
            ('2', [('index', 'hello world')]),
            ('3', [('index', 'jello world')]),
            ('4', [('index', 'yellow world'), ('index', 'mellow world')]),
            ('5', [('index', 'pizza world')]),
        ]
        for key, features in recorded:
            self.index.record('example', key, features)

        def assert_ranking_tail(ranked):
            # '3' and '4' are equidistant from the query, so their relative
            # order doesn't really matter; '5' must come last.
            assert ranked[2][0] in ('3', '4')
            assert ranked[3][0] in ('3', '4')
            assert ranked[4][0] == '5'

        ranked = self.index.compare('example', '1', ['index'])[0]
        assert ranked[0] == ('1', 1.0)
        assert ranked[1] == ('2', 1.0)  # identical contents
        assert_ranking_tail(ranked)

        ranked = self.index.classify('example', [('index', 'hello world')])[0]
        assert ranked[0:2] == [('1', 1.0), ('2', 1.0)]
        assert_ranking_tail(ranked)

        self.index.delete('example', [('index', '3')])
        remaining_keys = [
            key for key, _ in self.index.compare('example', '1', ['index'])[0]
        ]
        assert remaining_keys == ['1', '2', '4', '5']

        # Same settings, different namespace: nothing recorded above should
        # be visible through it.
        other_namespace = MinHashIndex(
            self.index.cluster,
            self.index.namespace + '2',
            self.index.signature_builder,
            self.index.bands,
            self.index.interval,
            self.index.retention,
        )
        assert other_namespace.compare('example', '1', ['index']) == [[]]
Beispiel #4
0
    def test_basic(self):
        """Exercise record / compare / classify / delete with thresholds.

        Five keys are recorded under the ``'example'`` scope; keys '1' and
        '2' share identical contents. ``compare`` and ``classify`` are each
        checked without a threshold, with a low threshold, with a threshold
        equal to ``self.index.bands`` (exact match) and with ``limit=1``.
        Finally the test checks that a deleted key disappears and that an
        index in a different namespace returns no results.
        """
        recorded = [
            ('1', [('index', 'hello world')]),
            ('2', [('index', 'hello world')]),
            ('3', [('index', 'jello world')]),
            ('4', [('index', 'yellow world'), ('index', 'mellow world')]),
            ('5', [('index', 'pizza world')]),
        ]
        for key, features in recorded:
            self.index.record('example', key, features)

        best = ('1', [1.0])
        twin = ('2', [1.0])  # identical contents to '1'

        def assert_equidistant_pair(rows):
            # '3' and '4' are equidistant pairs, order doesn't really matter
            assert rows[2][0] in ('3', '4')
            assert rows[3][0] in ('3', '4')

        # comparison, without thresholding
        rows = self.index.compare('example', '1', [('index', 0)])
        assert rows[0] == best
        assert rows[1] == twin
        assert_equidistant_pair(rows)
        assert rows[4][0] == '5'

        # comparison, low threshold
        rows = self.index.compare('example', '1', [('index', 6)])
        assert len(rows) == 4
        assert rows[0] == best
        assert rows[1] == twin
        assert_equidistant_pair(rows)

        # comparison, high threshold (exact match)
        rows = self.index.compare(
            'example', '1', [('index', self.index.bands)])
        assert len(rows) == 2
        assert rows[0] == best
        assert rows[1] == twin

        # comparison, candidate limit (with lexicographical collision sort)
        rows = self.index.compare('example', '1', [('index', 0)], limit=1)
        assert len(rows) == 1
        assert rows[0] == best

        # classification, without thresholding
        rows = self.index.classify('example', [('index', 0, 'hello world')])
        assert rows[0:2] == [best, twin]
        assert_equidistant_pair(rows)
        assert rows[4][0] == '5'

        # classification, low threshold
        rows = self.index.classify('example', [('index', 6, 'hello world')])
        assert len(rows) == 4
        assert rows[0] == best
        assert rows[1] == twin
        assert_equidistant_pair(rows)

        # classification, high threshold (exact match)
        rows = self.index.classify(
            'example', [('index', self.index.bands, 'hello world')])
        assert len(rows) == 2
        assert rows[0] == best
        assert rows[1] == twin

        # classification, candidate limit (with lexicographical collision sort)
        rows = self.index.classify(
            'example', [('index', 0, 'hello world')], limit=1)
        assert len(rows) == 1
        assert rows[0] == best

        self.index.delete('example', [('index', '3')])
        remaining_keys = [
            key
            for key, _ in self.index.compare('example', '1', [('index', 0)])
        ]
        assert remaining_keys == ['1', '2', '4', '5']

        # Same settings, different namespace: nothing recorded above should
        # be visible through it.
        other_namespace = MinHashIndex(
            self.index.cluster,
            self.index.namespace + '2',
            self.index.signature_builder,
            self.index.bands,
            self.index.interval,
            self.index.retention,
        )
        assert other_namespace.compare('example', '1', [('index', 0)]) == []
Beispiel #5
0
            break
    else:
        raise FrameEncodingError(
            'Cannot encode a frame without a `module` or `filename` value.')

    return attributes


features = FeatureSet(
    MinHashIndex(
        redis.clusters.get(
            getattr(
                settings,
                'SENTRY_SIMILARITY_INDEX_REDIS_CLUSTER',
                'default',
            ), ),
        'sim:1',
        MinHashSignatureBuilder(16, 0xFFFF),
        8,
        60 * 60 * 24 * 30,
        3,
    ),
    Encoder({
        Frame: get_frame_attributes,
    }),
    BidirectionalMapping({
        'exception:message:character-shingles': 'a',
        'exception:stacktrace:application-chunks': 'b',
        'exception:stacktrace:pairs': 'c',
        'message:message:character-shingles': 'd',
    }),
Beispiel #6
0
    def test_export_import(self):
        """Round-trip index data through ``export`` and ``import_``.

        One item is recorded under key '1'. Its export is checked, imported
        into key '2' (after which both keys must export identically), and
        then imported into key '2' a second time, which is expected to
        double every recorded frequency.
        """
        retention = 12
        index = MinHashIndex(
            redis.clusters.get('default'), 'sim', signature_builder,
            8, 60 * 60, retention)

        def assert_exported(exported, expected_total):
            # A single key was requested, so a single payload comes back.
            assert len(exported) == 1

            bands = msgpack.unpackb(exported[0])
            assert len(bands) == index.bands

            for band in bands:
                assert len(band) == (retention + 1)
                band_total = sum(
                    sum(dict(frequencies).values())
                    for _, frequencies in band)
                assert band_total == expected_total

        index.record('example', '1', [('index', 'hello world')])

        timestamp = int(time.time())
        original = index.export('example', [('index', 1)], timestamp=timestamp)
        assert_exported(original, 1)
        payload = original[0]

        # Copy the data from key 1 to key 2.
        index.import_(
            'example', [('index', 2, payload)], timestamp=timestamp)

        exported_from_1 = index.export(
            'example', [('index', 1)], timestamp=timestamp)
        exported_from_2 = index.export(
            'example', [('index', 2)], timestamp=timestamp)
        assert exported_from_1 == exported_from_2

        # Copy the data again to key 2 (duplicating all of the data.)
        index.import_(
            'example', [('index', 2, payload)], timestamp=timestamp)

        duplicated = index.export(
            'example', [('index', 2)], timestamp=timestamp)
        assert_exported(duplicated, 2)
Beispiel #7
0
    def test_index(self):
        """Exercise record / query / delete on a freshly built index.

        Keys '1' and '2' are recorded with identical contents, key '4' is
        recorded twice with different contents. The test checks the ranking
        returned by ``query``, that a deleted key disappears from it, and
        that an index in the ``'sim2'`` namespace sees none of the data.
        """
        bands, interval, retention = 8, 60 * 60, 12
        index = MinHashIndex(
            redis.clusters.get('default'), 'sim', signature_builder,
            bands, interval, retention)

        recorded = [
            ('1', 'hello world'),
            ('2', 'hello world'),
            ('3', 'jello world'),
            ('4', 'yellow world'),
            ('4', 'mellow world'),
            ('5', 'pizza world'),
        ]
        for key, text in recorded:
            index.record('example', key, [('index', text)])

        ranked = index.query('example', '1', ['index'])[0]
        assert ranked[0] == ('1', 1.0)
        assert ranked[1] == ('2', 1.0)  # identical contents
        # '3' and '4' are equidistant pairs, order doesn't really matter
        assert ranked[2][0] in ('3', '4')
        assert ranked[3][0] in ('3', '4')
        assert ranked[4][0] == '5'

        index.delete('example', [('index', '3')])
        remaining_keys = [
            key for key, _ in index.query('example', '1', ['index'])[0]
        ]
        assert remaining_keys == ['1', '2', '4', '5']

        # Same settings, different namespace: nothing recorded above should
        # be visible through it.
        other_namespace = MinHashIndex(
            redis.clusters.get('default'), 'sim2', signature_builder,
            bands, interval, retention)
        assert other_namespace.query('example', '1', ['index']) == [[]]
Beispiel #8
0
    ExceptionFeature,
    MessageFeature,
    serialize_frame,
    serialize_text_shingle,
    get_application_chunks,
    get_exception_frames,
)

features = FeatureSet(
    MinHashIndex(
        redis.clusters.get(
            getattr(
                settings,
                'SENTRY_SIMILARITY_INDEX_REDIS_CLUSTER',
                'default',
            ), ),
        0xFFFF,
        8,
        2,
        60 * 60 * 24 * 30,
        3,
    ),
    BidirectionalMapping({
        'exception:message:character-shingles': 'a',
        'exception:stacktrace:application-chunks': 'b',
        'exception:stacktrace:pairs': 'c',
        'message:message:character-shingles': 'd',
    }), {
        'exception:message:character-shingles':
        ExceptionFeature(lambda exception: map(
            serialize_text_shingle,