def _make_index(cluster=None):
    """Build the similarity index.

    Uses the given redis ``cluster`` if provided; otherwise looks one up
    from settings. Falls back to a no-op ``DummyIndex`` when the configured
    cluster does not exist.
    """
    if not cluster:
        configured_name = getattr(
            settings,
            'SENTRY_SIMILARITY_INDEX_REDIS_CLUSTER',
            'similarity',
        )
        try:
            cluster = redis.redis_clusters.get(configured_name)
        except KeyError:
            # No such cluster is configured -- degrade gracefully instead
            # of failing at import time.
            fallback = DummyIndex()
            logger.info(
                'No redis cluster provided for similarity, using {!r}.'.format(
                    fallback))
            return fallback

    return MinHashIndex(
        cluster,
        'sim:1',
        MinHashSignatureBuilder(16, 0xFFFF),
        8,
        60 * 60 * 24 * 30,  # 30-day interval
        3,                  # retention
    )
def index(self):
    """Return a fresh MinHashIndex bound to the default local redis client."""
    client = redis.clusters.get('default').get_local_client(0)
    one_hour = 60 * 60
    return MinHashIndex(client, 'sim', signature_builder, 16, one_hour, 12)
def test_basic(self):
    """compare/classify rank identical > near > distant keys; delete removes."""
    recordings = [
        ('1', [('index', 'hello world')]),
        ('2', [('index', 'hello world')]),
        ('3', [('index', 'jello world')]),
        ('4', [('index', 'yellow world'), ('index', 'mellow world')]),
        ('5', [('index', 'pizza world')]),
    ]
    for key, features in recordings:
        self.index.record('example', key, features)

    ranked = self.index.compare('example', '1', ['index'])[0]
    assert ranked[0] == ('1', 1.0)
    assert ranked[1] == ('2', 1.0)  # identical contents
    # '3' and '4' are equidistant pairs, so their relative order is
    # unspecified.
    assert ranked[2][0] in ('3', '4')
    assert ranked[3][0] in ('3', '4')
    assert ranked[4][0] == '5'

    ranked = self.index.classify('example', [('index', 'hello world')])[0]
    assert ranked[0:2] == [('1', 1.0), ('2', 1.0)]
    assert ranked[2][0] in ('3', '4')
    assert ranked[3][0] in ('3', '4')
    assert ranked[4][0] == '5'

    # Deleting a key removes it from comparison results.
    self.index.delete('example', [('index', '3')])
    survivors = [
        key for key, _ in self.index.compare('example', '1', ['index'])[0]
    ]
    assert survivors == ['1', '2', '4', '5']

    # A sibling namespace sees none of the recorded data.
    empty_index = MinHashIndex(
        self.index.cluster,
        self.index.namespace + '2',
        self.index.signature_builder,
        self.index.bands,
        self.index.interval,
        self.index.retention,
    )
    assert empty_index.compare('example', '1', ['index']) == [[]]
def test_basic(self):
    """Threshold and limit behavior of compare()/classify(), plus delete()."""
    samples = {
        '1': [('index', 'hello world')],
        '2': [('index', 'hello world')],
        '3': [('index', 'jello world')],
        '4': [('index', 'yellow world'), ('index', 'mellow world')],
        '5': [('index', 'pizza world')],
    }
    for key in sorted(samples):
        self.index.record('example', key, samples[key])

    def assert_exact_pair_first(rows):
        # '1' and '2' have identical contents and must rank first with a
        # perfect similarity score.
        assert rows[0] == ('1', [1.0])
        assert rows[1] == ('2', [1.0])

    # comparison, without thresholding
    rows = self.index.compare('example', '1', [('index', 0)])
    assert_exact_pair_first(rows)
    # '3' and '4' are equidistant pairs; either ordering is acceptable.
    assert rows[2][0] in ('3', '4')
    assert rows[3][0] in ('3', '4')
    assert rows[4][0] == '5'

    # comparison, low threshold drops the most distant candidate
    rows = self.index.compare('example', '1', [('index', 6)])
    assert len(rows) == 4
    assert_exact_pair_first(rows)
    assert rows[2][0] in ('3', '4')
    assert rows[3][0] in ('3', '4')

    # comparison, high threshold keeps exact matches only
    rows = self.index.compare('example', '1', [('index', self.index.bands)])
    assert len(rows) == 2
    assert_exact_pair_first(rows)

    # comparison, candidate limit (with lexicographical collision sort)
    rows = self.index.compare('example', '1', [('index', 0)], limit=1)
    assert rows == [('1', [1.0])]

    # classification, without thresholding
    rows = self.index.classify('example', [('index', 0, 'hello world')])
    assert rows[0:2] == [('1', [1.0]), ('2', [1.0])]
    assert rows[2][0] in ('3', '4')
    assert rows[3][0] in ('3', '4')
    assert rows[4][0] == '5'

    # classification, low threshold
    rows = self.index.classify('example', [('index', 6, 'hello world')])
    assert len(rows) == 4
    assert_exact_pair_first(rows)
    assert rows[2][0] in ('3', '4')
    assert rows[3][0] in ('3', '4')

    # classification, high threshold (exact match)
    rows = self.index.classify(
        'example', [('index', self.index.bands, 'hello world')])
    assert len(rows) == 2
    assert_exact_pair_first(rows)

    # classification, candidate limit (with lexicographical collision sort)
    rows = self.index.classify(
        'example', [('index', 0, 'hello world')], limit=1)
    assert rows == [('1', [1.0])]

    # Deleting a key removes it from comparison results.
    self.index.delete('example', [('index', '3')])
    survivors = [
        key for key, _ in self.index.compare('example', '1', [('index', 0)])
    ]
    assert survivors == ['1', '2', '4', '5']

    # A sibling namespace sees none of the recorded data.
    assert MinHashIndex(
        self.index.cluster,
        self.index.namespace + '2',
        self.index.signature_builder,
        self.index.bands,
        self.index.interval,
        self.index.retention,
    ).compare('example', '1', [('index', 0)]) == []
break else: raise FrameEncodingError( 'Cannot encode a frame without a `module` or `filename` value.') return attributes features = FeatureSet( MinHashIndex( redis.clusters.get( getattr( settings, 'SENTRY_SIMILARITY_INDEX_REDIS_CLUSTER', 'default', ), ), 'sim:1', MinHashSignatureBuilder(16, 0xFFFF), 8, 60 * 60 * 24 * 30, 3, ), Encoder({ Frame: get_frame_attributes, }), BidirectionalMapping({ 'exception:message:character-shingles': 'a', 'exception:stacktrace:application-chunks': 'b', 'exception:stacktrace:pairs': 'c', 'message:message:character-shingles': 'd', }),
def test_export_import(self):
    """Exported band data round-trips, and repeated imports accumulate."""
    retention = 12
    index = MinHashIndex(
        redis.clusters.get('default'),
        'sim',
        signature_builder,
        8,
        60 * 60,
        retention,
    )
    index.record('example', '1', [('index', 'hello world')])
    timestamp = int(time.time())

    def assert_total_frequency(payload, expected):
        # Unpack one exported blob and verify that every band holds the
        # expected total number of bucket observations.
        bands = msgpack.unpackb(payload)
        assert len(bands) == index.bands
        for band in bands:
            assert len(band) == (retention + 1)
            total = sum(
                sum(dict(frequencies).values())
                for _, frequencies in band
            )
            assert total == expected

    exported = index.export('example', [('index', 1)], timestamp=timestamp)
    assert len(exported) == 1
    assert_total_frequency(exported[0], 1)

    # Copy the data from key 1 to key 2.
    index.import_('example', [('index', 2, exported[0])], timestamp=timestamp)
    assert index.export(
        'example', [('index', 1)], timestamp=timestamp,
    ) == index.export(
        'example', [('index', 2)], timestamp=timestamp,
    )

    # Copy the data again to key 2 (duplicating all of the data.)
    index.import_('example', [('index', 2, exported[0])], timestamp=timestamp)
    exported = index.export('example', [('index', 2)], timestamp=timestamp)
    assert len(exported) == 1
    assert_total_frequency(exported[0], 2)
def test_index(self):
    """query() ranks similar keys; delete() removes; namespaces are isolated."""
    index = MinHashIndex(
        redis.clusters.get('default'),
        'sim',
        signature_builder,
        8,
        60 * 60,
        12,
    )
    for key, text in [
        ('1', 'hello world'),
        ('2', 'hello world'),
        ('3', 'jello world'),
        ('4', 'yellow world'),
        ('4', 'mellow world'),
        ('5', 'pizza world'),
    ]:
        index.record('example', key, [('index', text)])

    ranked = index.query('example', '1', ['index'])[0]
    assert ranked[0] == ('1', 1.0)
    assert ranked[1] == ('2', 1.0)  # identical contents
    # '3' and '4' are equidistant pairs, so their relative order is
    # unspecified.
    assert ranked[2][0] in ('3', '4')
    assert ranked[3][0] in ('3', '4')
    assert ranked[4][0] == '5'

    # Deleting a key removes it from query results.
    index.delete('example', [('index', '3')])
    survivors = [key for key, _ in index.query('example', '1', ['index'])[0]]
    assert survivors == ['1', '2', '4', '5']

    # A different namespace has no recorded data.
    assert MinHashIndex(
        redis.clusters.get('default'),
        'sim2',
        signature_builder,
        8,
        60 * 60,
        12,
    ).query('example', '1', ['index']) == [[]]
ExceptionFeature, MessageFeature, serialize_frame, serialize_text_shingle, get_application_chunks, get_exception_frames, ) features = FeatureSet( MinHashIndex( redis.clusters.get( getattr( settings, 'SENTRY_SIMILARITY_INDEX_REDIS_CLUSTER', 'default', ), ), 0xFFFF, 8, 2, 60 * 60 * 24 * 30, 3, ), BidirectionalMapping({ 'exception:message:character-shingles': 'a', 'exception:stacktrace:application-chunks': 'b', 'exception:stacktrace:pairs': 'c', 'message:message:character-shingles': 'd', }), { 'exception:message:character-shingles': ExceptionFeature(lambda exception: map( serialize_text_shingle,