def test_can_get_random_entry(self):
    """random_search yields exactly the requested number of results."""
    db = HammingDb(self._path, code_size=16)
    # Fill the database with 100 random 16-byte codes; the stored data is
    # just the insertion counter rendered as text.
    for i in range(100):
        db.append(os.urandom(16), str(i))
    # random_search returns a (code, results) pair; only the results are
    # checked here, so the code is discarded.
    _, results = db.random_search(10)
    self.assertEqual(10, len(list(results)))
def test_external_modifications_are_detected_when_db_has_keys(self):
    """Two handles on one path each report both handles' appends."""
    first = HammingDb(self._path, code_size=8)
    second = HammingDb(self._path, code_size=8)
    code = 'a' * 8
    first.append(code, 'some other data')
    second.append(code, 'some data')
    # Each handle must count its own entry plus the one written by the other.
    for handle in (first, second):
        self.assertEqual(2, len(handle))
def test_can_run_in_write_only_mode(self):
    """Appending in write-only mode leaves the `_codes` attribute as None."""
    db = HammingDb(self._path, code_size=16, writeonly=True)
    documents = (
        'Mary had a little lamb',
        'Mary had a little dog',
        'Permanent Midnight',
        'Mary sad a little cog',
    )
    for text in documents:
        code = self.extract_code_from_text(text, n_chunks=2)
        db.append(code, text)
    self.assertIsNone(db._codes)
def test_can_retrieve_data_from_search(self):
    """Searching with a stored document's code returns its data first."""
    db = HammingDb(self._path, code_size=8)
    query_text = 'Mary had a little lamb'
    corpus = (
        query_text,
        'Mary had a little dog',
        'Permanent Midnight',
        'Mary sad a little cog',
    )
    for text in corpus:
        db.append(self.extract_code_from_text(text), text)
    query_code = self.extract_code_from_text(query_text)
    hits = list(db.search(query_code, 3))
    self.assertEqual(query_text, hits[0])
def test_search_raises_in_write_only_mode(self):
    """Searching a write-only database raises RuntimeError."""
    db = HammingDb(self._path, code_size=16, writeonly=True)

    def to_code(text):
        return self.extract_code_from_text(text, n_chunks=2)

    query_text = 'Mary had a little lamb'
    corpus = (
        query_text,
        'Mary had a little dog',
        'Permanent Midnight',
        'Mary sad a little cog',
    )
    for text in corpus:
        db.append(to_code(text), text)
    # The search is consumed with list() inside the guarded region so that
    # an error raised lazily during iteration is also caught.
    with self.assertRaises(RuntimeError):
        list(db.search(to_code(query_text), 3))
def test_can_search_over_text_documents(self):
    """The three nearest documents come back in the expected order."""
    db = HammingDb(self._path, code_size=8)
    lamb = 'Mary had a little lamb'
    dog = 'Mary had a little dog'
    midnight = 'Permanent Midnight'
    cog = 'Mary sad a little cog'
    for text in (lamb, dog, midnight, cog):
        db.append(self.extract_code_from_text(text), text)
    hits = list(db.search(self.extract_code_from_text(lamb), 3))
    self.assertEqual(3, len(hits))
    # The unrelated 'Permanent Midnight' entry must not be among the results.
    for position, expected in enumerate((lamb, dog, cog)):
        self.assertEqual(expected, hits[position])
def test_can_search_with_128_bits(self):
    """Ordered nearest-neighbour search works with 16-byte codes."""
    db = HammingDb(self._path, code_size=16)

    def to_code(text):
        return self.extract_code_from_text(text, n_chunks=2)

    lamb = 'Mary had a little lamb'
    dog = 'Mary had a little dog'
    midnight = 'Permanent Midnight'
    cog = 'Mary sad a little cog'
    for text in (lamb, dog, midnight, cog):
        db.append(to_code(text), text)
    hits = list(db.search(to_code(lamb), 3))
    self.assertEqual(3, len(hits))
    for position, expected in enumerate((lamb, dog, cog)):
        self.assertEqual(expected, hits[position])
def test_can_search_over_data_added_from_another_instance(self):
    """A second handle can search entries appended through the first."""
    writer = HammingDb(self._path, code_size=8)
    reader = HammingDb(self._path, code_size=8)
    lamb = 'Mary had a little lamb'
    dog = 'Mary had a little dog'
    midnight = 'Permanent Midnight'
    cog = 'Mary sad a little cog'
    for text in (lamb, dog, midnight, cog):
        writer.append(self.extract_code_from_text(text), text)
    hits = list(reader.search(self.extract_code_from_text(lamb), 3))
    self.assertEqual(3, len(hits))
    # Only membership is checked here, not the order of the results.
    found = set(hits)
    for expected in (lamb, dog, cog):
        self.assertTrue(expected in found)
class HammingIndex(object):
    """
    Index over the codes of one feature of a document class, backed by a
    HammingDb.

    Every time slice of an indexed feature is encoded to a byte string and
    appended to the database together with a JSON record holding the
    document `_id`, the encoded time slice and, when extra-data callables
    were supplied, the values they produced for that slice.
    """

    def __init__(
            self,
            document,
            feature,
            version=None,
            path='',
            db_size_bytes=1000000000,
            listen=False,
            writeonly=False,
            **extra_data):
        """
        Args:
            document: document class.  It is iterated by `add_all`, called
                with an `_id` to load a document, and passed as
                `persistence` when a feature is loaded.  Its `event_log`
                attribute, when present, is what `listen` follows
            feature: the feature to index; must expose `key` and `version`
            version: replaces `feature.version` in the database file name
                when given
            path: directory in which the database file lives
            db_size_bytes: stored on the instance; not used elsewhere in
                this class
            listen: when true, start following the event log immediately
            writeonly: forwarded to every HammingDb this index opens
            **extra_data: name -> callable(doc, ts); the results are stored
                with each indexed slice and returned with search results
        """
        super(HammingIndex, self).__init__()
        self.document = document
        self.feature = feature
        self.db_size_bytes = db_size_bytes
        self.path = path
        self.extra_data = extra_data
        self.writeonly = writeonly

        version = version or self.feature.version
        self.hamming_db_path = os.path.join(
            self.path, 'index.{}.{}'.format(self.feature.key, version))

        # Documents without an event log are allowed; only listening
        # requires one.
        self.event_log = getattr(document, 'event_log', None)

        try:
            self.hamming_db = HammingDb(
                self.hamming_db_path,
                code_size=None,
                writeonly=self.writeonly)
        except ValueError:
            # Opening without a code size failed (presumably because no
            # database exists yet); creation is deferred until the first
            # code is available, see _init_hamming_db.
            self.hamming_db = None

        self.encoder = TimeSliceEncoder()
        self.decoder = TimeSliceDecoder()
        self.thread = None

        if listen:
            self.listen()

    def close(self):
        """
        Stop following the event log and close the database.

        This is best-effort: it also runs from `__del__` and `__exit__`, so
        ordinary errors while releasing either resource are suppressed.
        KeyboardInterrupt and SystemExit are deliberately not suppressed.
        """
        try:
            self.stop()
        except Exception:
            pass

        # getattr guards against an instance whose __init__ did not finish.
        db = getattr(self, 'hamming_db', None)
        if db is None:
            return
        try:
            db.close()
        except Exception:
            pass

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __len__(self):
        """Number of entries in the database; 0 when none is open yet."""
        if self.hamming_db is None:
            return 0
        return len(self.hamming_db)

    def stop(self):
        """Unsubscribe from the event log; a no-op when there is none."""
        event_log = getattr(self, 'event_log', None)
        if event_log is not None:
            event_log.unsubscribe()

    def listen(self):
        """Process event-log entries on a background daemon thread."""
        self.thread = threading.Thread(target=self._listen)
        self.thread.daemon = True
        self.thread.start()

    def _init_hamming_db(self, code=None):
        """
        Open the database if it is not open already.

        Args:
            code: an encoded code whose length is used as the code size;
                when empty or omitted the code size is left to HammingDb
        """
        if self.hamming_db is not None:
            return
        code_size = len(code) if code else None
        self.hamming_db = HammingDb(
            self.hamming_db_path,
            code_size=code_size,
            writeonly=self.writeonly)

    def _synchronously_process_events(self):
        """Run the event loop on the calling thread instead of `listen`."""
        self._listen(raise_when_empty=True)

    def add_all(self):
        """Index every document yielded by iterating the document class."""
        for doc in self.document:
            self.add(doc._id)

    def _collect_extra_data(self, _id, ts):
        """
        Evaluate every configured extra-data callable for one time slice.

        Returns:
            dict mapping each configured name to `func(doc, ts)`, or None
            when no extra-data callables were configured
        """
        if not self.extra_data:
            return None
        doc = self.document(_id)
        return {key: func(doc, ts) for key, func in self.extra_data.items()}

    def add(self, _id, timestamp=''):
        """
        Index every time slice of the feature belonging to document `_id`.

        Args:
            _id: identifier of the document whose feature is indexed
            timestamp: event-log position stored as the database's
                'timestamp' metadata, from which `_listen` later resumes
        """
        # load the feature from the feature database
        feature = self.feature(_id=_id, persistence=self.document)

        try:
            arr = ConstantRateTimeSeries(feature)
        except ValueError:
            # fall back to the feature itself when it cannot be wrapped
            arr = feature

        # extract codes and timeslices from the feature
        for ts, data in arr.iter_slices():
            code = self.encode_query(data)
            encoded_ts = dict(_id=_id, **self.encoder.dict(ts))
            extra_data = self._collect_extra_data(_id, ts)
            if extra_data:
                encoded_ts['extra_data'] = extra_data
            # the first code seen fixes the code size of a new database
            self._init_hamming_db(code)
            self.hamming_db.append(code, json.dumps(encoded_ts))
            # NOTE(review): the timestamp is recorded after every appended
            # slice - confirm this is intended rather than once per document
            self.hamming_db.set_metadata('timestamp', bytes(timestamp))

    def _listen(self, raise_when_empty=False):
        """
        Subscribe to the event log and index each matching event.

        The subscription resumes from the 'timestamp' metadata stored in
        the database, or from the beginning when there is none.

        Raises:
            ValueError: when the document has no event log configured
        """
        last_timestamp = ''
        if self.hamming_db is not None:
            last_timestamp = self.hamming_db.get_metadata('timestamp') or ''

        if not self.event_log:
            raise ValueError(
                '{} must have an event log configured'.format(self.document))

        subscription = self.event_log.subscribe(
            last_id=last_timestamp, raise_when_empty=raise_when_empty)

        for timestamp, data in subscription:
            # parse the data from the event stream
            event = json.loads(data)
            _id, name, version = event['_id'], event['name'], event['version']

            # ensure that it's about the feature we're subscribed to
            if name == self.feature.key and version == self.feature.version:
                self.add(_id, timestamp)

    def _parse_result(self, result):
        """
        Decode one JSON record stored by `add`.

        Returns:
            `(_id, time_slice)`, or `(_id, time_slice, extra_data)` when
            extra-data callables were configured on this index
        """
        record = json.loads(result)
        ts = TimeSlice(**self.decoder.kwargs(record))
        if not self.extra_data:
            return record['_id'], ts
        return record['_id'], ts, record['extra_data']

    def decode_query(self, binary_query):
        """Unpack a raw byte string into an array with one bit per item."""
        # frombuffer replaces the deprecated binary mode of np.fromstring;
        # unpackbits returns a new array, so the read-only view is harmless.
        packed = np.frombuffer(binary_query, dtype=np.uint8)
        return np.unpackbits(packed)

    def encode_query(self, feature):
        """
        Convert a feature into the raw byte string stored in the database.

        Args:
            feature: a raw byte string (returned unchanged), a uint64 array
                (its bytes are returned), or a uint8/bool array of
                individual bits (packed eight to a byte)

        Raises:
            ValueError: for an array of any other dtype
        """
        if isinstance(feature, (bytes, str)):
            return feature
        if feature.dtype == np.uint64:
            return feature.tobytes()
        # np.bool_ is the scalar type the removed np.bool alias stood for
        if feature.dtype == np.uint8 or feature.dtype == np.bool_:
            return np.packbits(feature).tobytes()
        raise ValueError(
            'feature must be a raw bit string, an already packed uint64 '
            'array, or an "unpacked" uint8 or bool array')

    def random_search(self, n_results, multithreaded=False, sort=False):
        """
        Search using a code chosen by the database.

        Returns:
            SearchResults built from the chosen code and a lazy generator
            of parsed results
        """
        self._init_hamming_db()
        code, raw_results = self.hamming_db.random_search(
            n_results, multithreaded, sort=sort)
        parsed_results = (self._parse_result(r) for r in raw_results)
        return SearchResults(code, parsed_results)

    def search(self, feature, n_results, multithreaded=False, sort=False):
        """
        Search for the entries nearest to `feature`.

        Args:
            feature: anything accepted by `encode_query`
            n_results: number of results requested from the database
            multithreaded: forwarded to HammingDb.search
            sort: forwarded to HammingDb.search

        Returns:
            SearchResults built from the encoded query and a lazy generator
            of parsed results
        """
        self._init_hamming_db()
        code = self.encode_query(feature)
        raw_results = self.hamming_db.search(
            code, n_results, multithreaded, sort=sort)
        parsed_results = (self._parse_result(r) for r in raw_results)
        return SearchResults(code, parsed_results)
def test_can_append_with_128_bits(self):
    """A single 16-byte code can be appended and is counted."""
    db = HammingDb(self._path, code_size=16)
    code = 'a' * 16
    db.append(code, 'some data')
    self.assertEqual(1, len(db))
def test_db_starts_with_correct_number_of_keys(self):
    """A handle opened after an append already reports that entry."""
    earlier = HammingDb(self._path, code_size=8)
    earlier.append('a' * 8, 'some data')
    # Opened only after the write above has happened.
    later = HammingDb(self._path, code_size=8)
    self.assertEqual(1, len(later))
def test_external_modifications_are_detected(self):
    """An append through one handle is visible to an already-open one."""
    observer = HammingDb(self._path, code_size=8)
    writer = HammingDb(self._path, code_size=8)
    writer.append('a' * 8, 'some data')
    # The observer was opened before the write and never appended itself.
    self.assertEqual(1, len(observer))
def test_cannot_append_wrong_code_size(self):
    """Appending a 7-byte code to an 8-byte database raises ValueError."""
    db = HammingDb(self._path, code_size=8)
    too_short = 'a' * 7
    with self.assertRaises(ValueError):
        db.append(too_short, 'some data')
def test_db_has_length_two_after_appending_twice(self):
    """Appending the same code and data twice yields two entries."""
    db = HammingDb(self._path, code_size=8)
    code = 'a' * 8
    for _ in (0, 1):
        db.append(code, 'some data')
    self.assertEqual(2, len(db))