def _bulk_rank_error(self, function):
    """Check that ``function`` rejects a batch containing a wrong-rank vector.

    A batch of eleven vectors is assembled: ten of rank ``self.hhenl.rank``
    followed by one of rank ``self.hhenl.rank - 1``.  The ten well-formed
    vectors are labelled up front through ``_bulk_label_chamber_ensemble``;
    ``function`` is then invoked with the full batch and the first eleven
    letters as ids, and is expected to raise ValueError.
    """
    rank = self.hhenl.rank
    # The short vector is drawn first, matching the order of random draws
    # the helper has always used.
    short_vec = HyperplaneHasher._random_vectors(1, rank - 1)
    batch = HyperplaneHasher._random_vectors(10, rank) + short_vec
    batch_ids = self.letters[:11]
    self.hhenl._bulk_label_chamber_ensemble(batch[:10], batch_ids[:10])
    with self.assertRaises(ValueError):
        function(batch, batch_ids)
def test_label_chamber_ensemble_1(self):
    """Labelling one new vector changes exactly one chamber per hasher.

    For each underlying HyperplaneHasher object, the label 'new_vec_id' is
    added to precisely one chamber.  The set of chamber ids hit by the
    known vectors is either unchanged, or enlarged by one element.
    """
    hashers = self.hhenl.hhs

    def snapshot(vecs):
        # Per hasher: the ids of the chambers hit by ``vecs`` and, in the
        # same iteration order, the labels held by each of those chambers.
        chamber_ids = {
            hh: set(hh.get_chamber_id(vec) for vec in vecs)
            for hh in hashers
        }
        chamber_labels = {
            hh: [hh.chamber_labels(ch_id) for ch_id in chamber_ids[hh]]
            for hh in hashers
        }
        return chamber_ids, chamber_labels

    old_chamber_ids, old_chamber_labels = snapshot(self.feature_vecs)
    new_vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
    self.hhenl._label_chamber_ensemble(new_vec, 'new_vec_id')
    # Build a fresh list instead of appending to self.feature_vecs, so the
    # shared fixture is not mutated by this test.
    new_chamber_ids, new_chamber_labels = snapshot(
        list(self.feature_vecs) + [new_vec])

    for hh in hashers:
        len_diff = len(new_chamber_ids[hh]) - len(old_chamber_ids[hh])
        self.assertIn(len_diff, [0, 1])
        if len_diff == 0:
            # 'new_vec' has landed in an existing chamber.  The set of
            # chamber ids thus remains unchanged, but exactly one chamber
            # has exactly one new label, namely 'new_vec_id'.
            self.assertEqual(old_chamber_ids[hh], new_chamber_ids[hh])
            unchanged = [
                old == new
                for old, new in zip(old_chamber_labels[hh],
                                    new_chamber_labels[hh])
            ]
            # Count the changed chambers of *this* hasher directly; the
            # previous set-of-booleans check was sized by the number of
            # hashers and could not tell one changed chamber from several.
            self.assertEqual(unchanged.count(False), 1)
            changed_at = unchanged.index(False)
            label_diff = new_chamber_labels[hh][changed_at].difference(
                old_chamber_labels[hh][changed_at])
            self.assertEqual(label_diff, set(['new_vec_id']))
        else:
            # len_diff == 1: 'new_vec' has landed in a new chamber.  The id
            # of the new chamber is that of the chamber to which 'new_vec'
            # belongs, and the new chamber is exactly set(['new_vec_id']).
            id_diff = new_chamber_ids[hh].difference(old_chamber_ids[hh])
            self.assertEqual(id_diff, set([hh.get_chamber_id(new_vec)]))
            labels_diff = [
                entry for entry in new_chamber_labels[hh]
                if entry not in old_chamber_labels[hh]
            ][0]
            self.assertEqual(labels_diff, set(['new_vec_id']))
def test_bulk_label_chamber_ensemble_5(self):
    """Two label lists applied to the same vectors end up in the same chambers.

    Let first = [first_0, ..., first_19] and second = [second_0, ...,
    second_19] be lists of labels, and vecs a list of 20 vectors.  After
    bulk-labelling (vecs, first) and then (vecs, second), every chamber C
    of every hh in self.hhenl.hhs satisfies: first_i in C iff second_i in C.
    """
    vecs = HyperplaneHasher._random_vectors(20, self.hhenl.rank)
    first_ex = re.compile(r'first_([\S]*)')
    second_ex = re.compile(r'second_([\S]*)')
    first = ['first_%i' % i for i in range(20)]
    second = ['second_%i' % i for i in range(20)]
    self.hhenl._bulk_label_chamber_ensemble(vecs, first)
    self.hhenl._bulk_label_chamber_ensemble(vecs, second)

    def captured_suffixes(pattern, labels):
        # Suffix captured by ``pattern`` for every label that matches it;
        # labels that do not match contribute an empty string and are dropped.
        joined = [''.join(pattern.findall(label)) for label in labels]
        return set(suffix for suffix in joined if len(suffix) > 0)

    for hh in self.hhenl.hhs:
        for ch_id in hh.get_chamber_ids():
            labels = hh.chamber_labels(ch_id)
            first_labels = captured_suffixes(first_ex, labels)
            second_labels = captured_suffixes(second_ex, labels)
            self.assertEqual(first_labels, second_labels)
def test_get_nn_candidates_1(self):
    """The candidates form a set of at least ``num_neighbours`` strings."""
    query = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
    num_neighbours = 10
    candidates = self.hhenl._get_nn_candidates(query, num_neighbours)
    self.assertIsInstance(candidates, set)
    for candidate in candidates:
        self.assertIsInstance(candidate, str)
    self.assertGreaterEqual(len(candidates), num_neighbours)
def setUp(self):
    """Build the lookup object under test and populate it with vectors.

    Creates NUM_VECS random feature vectors of rank RANK with ids
    '0', '1', ..., obtains a lookup object from ``self._create_hhenl()``
    and adds every (vector, id) pair to it.  According to the original
    description the lookup is backed by a DictionaryStore; that choice is
    made inside ``_create_hhenl`` and is not visible here.
    """
    self.letters = list(string.ascii_lowercase)
    self.feature_vecs = HyperplaneHasher._random_vectors(NUM_VECS, RANK)
    self.feature_vecs_ids = ['%i' % index for index in range(NUM_VECS)]
    self.hhenl = self._create_hhenl()
    for vec, vec_id in zip(self.feature_vecs, self.feature_vecs_ids):
        self.hhenl.add_vector(vec, vec_id)
def test_label_chamber_ensemble_1(self):
    """Labelling one new vector changes exactly one chamber per hasher.

    For each underlying HyperplaneHasher object, the label 'new_vec_id' is
    added to precisely one chamber.  The set of chamber ids hit by the
    known vectors is either unchanged, or enlarged by one element.
    """
    hashers = self.hhenl.hhs

    def snapshot(vecs):
        # Per hasher: the ids of the chambers hit by ``vecs`` and, in the
        # same iteration order, the labels held by each of those chambers.
        chamber_ids = {
            hh: set(hh.get_chamber_id(vec) for vec in vecs)
            for hh in hashers
        }
        chamber_labels = {
            hh: [hh.chamber_labels(ch_id) for ch_id in chamber_ids[hh]]
            for hh in hashers
        }
        return chamber_ids, chamber_labels

    old_chamber_ids, old_chamber_labels = snapshot(self.feature_vecs)
    new_vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
    self.hhenl._label_chamber_ensemble(new_vec, 'new_vec_id')
    # Build a fresh list instead of appending to self.feature_vecs, so the
    # shared fixture is not mutated by this test.
    new_chamber_ids, new_chamber_labels = snapshot(
        list(self.feature_vecs) + [new_vec])

    for hh in hashers:
        len_diff = len(new_chamber_ids[hh]) - len(old_chamber_ids[hh])
        self.assertIn(len_diff, [0, 1])
        if len_diff == 0:
            # 'new_vec' has landed in an existing chamber.  The set of
            # chamber ids thus remains unchanged, but exactly one chamber
            # has exactly one new label, namely 'new_vec_id'.
            self.assertEqual(old_chamber_ids[hh], new_chamber_ids[hh])
            unchanged = [
                old == new
                for old, new in zip(old_chamber_labels[hh],
                                    new_chamber_labels[hh])
            ]
            # Count the changed chambers of *this* hasher directly; the
            # previous set-of-booleans check was sized by the number of
            # hashers and could not tell one changed chamber from several.
            self.assertEqual(unchanged.count(False), 1)
            changed_at = unchanged.index(False)
            label_diff = new_chamber_labels[hh][changed_at].difference(
                old_chamber_labels[hh][changed_at])
            self.assertEqual(label_diff, set(['new_vec_id']))
        else:
            # len_diff == 1: 'new_vec' has landed in a new chamber.  The id
            # of the new chamber is that of the chamber to which 'new_vec'
            # belongs, and the new chamber is exactly set(['new_vec_id']).
            id_diff = new_chamber_ids[hh].difference(old_chamber_ids[hh])
            self.assertEqual(id_diff, set([hh.get_chamber_id(new_vec)]))
            labels_diff = [
                entry for entry in new_chamber_labels[hh]
                if entry not in old_chamber_labels[hh]
            ][0]
            self.assertEqual(labels_diff, set(['new_vec_id']))
def test_bulk_label_chamber_ensemble_3(self):
    """Bulk-labelling with unknown ids adds exactly those ids to every hasher.

    Ten random vectors are labelled with the first ten letters (assumed
    not to be in use as ids yet).  For each hh in self.hhenl.hhs, the set
    of labels returned by ``_get_all_hh_labels(hh)`` after the call, minus
    the set returned before it, must equal the ten ids.
    """
    vecs = HyperplaneHasher._random_vectors(10, self.hhenl.rank)
    vec_ids = self.letters[:10]
    hashers = self.hhenl.hhs
    labels_before = [self._get_all_hh_labels(hh) for hh in hashers]
    self.hhenl._bulk_label_chamber_ensemble(vecs, vec_ids)
    labels_after = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
    for before, after in zip(labels_before, labels_after):
        added = after.difference(before)
        self.assertEqual(added, set(vec_ids))
def test_bulk_label_chamber_ensemble_4(self):
    """Bulk-labelling with partially known ids adds only the unknown ones.

    24 random vectors are labelled with 11 ids taken from
    ``self.feature_vecs_ids`` followed by the first 13 letters.  For each
    hh in self.hhenl.hhs, the set of labels returned by
    ``_get_all_hh_labels(hh)`` after the call, minus the set returned
    before it, must equal the 13 letter ids.
    """
    vecs = HyperplaneHasher._random_vectors(24, self.hhenl.rank)
    known_ids = self.feature_vecs_ids[:11]
    fresh_ids = self.letters[:13]
    hashers = self.hhenl.hhs
    labels_before = [self._get_all_hh_labels(hh) for hh in hashers]
    self.hhenl._bulk_label_chamber_ensemble(vecs, known_ids + fresh_ids)
    labels_after = [self._get_all_hh_labels(hh) for hh in self.hhenl.hhs]
    for before, after in zip(labels_before, labels_after):
        added = after.difference(before)
        self.assertEqual(added, set(fresh_ids))
def test_add_vector_1(self):
    """An added vector is stored and labelled in its chamber of every hasher.

    After ``add_vector(vec, 'new')``:
    * ``get_vector('new')`` equals ``vec`` element-wise;
    * 'new' is listed by ``get_vector_ids()``;
    * for each hh in self.hhenl.hhs, 'new' is among the labels of the
      chamber that hh assigns to ``vec``.
    """
    new_vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
    new_id = 'new'
    self.hhenl.add_vector(new_vec, new_id)

    stored = self.hhenl.get_vector(new_id)
    self.assertTrue((stored == new_vec).all())

    known_ids = self.hhenl.get_vector_ids()
    self.assertIn(new_id, known_ids)

    for hasher in self.hhenl.hhs:
        home_chamber = hasher.get_chamber_id(new_vec)
        self.assertIn(new_id, hasher.chamber_labels(home_chamber))
def test_add_vector_1(self):
    """An added vector is stored and labelled in its chamber of every hasher.

    After ``add_vector(vec, 'new')``:
    * ``get_vector('new')`` equals ``vec`` element-wise;
    * 'new' is listed by ``get_vector_ids()``;
    * for each hh in self.hhenl.hhs, 'new' is among the labels of the
      chamber that hh assigns to ``vec``.
    """
    lookup = self.hhenl
    added_vec = HyperplaneHasher._random_vectors(1, lookup.rank)[0]
    added_id = 'new'
    lookup.add_vector(added_vec, added_id)

    matches = lookup.get_vector(added_id) == added_vec
    self.assertTrue(matches.all())
    self.assertIn(added_id, lookup.get_vector_ids())

    for hh in lookup.hhs:
        labels = hh.chamber_labels(hh.get_chamber_id(added_vec))
        self.assertIn(added_id, labels)
def test_find_neighbours_1(self):
    """find_neighbours returns ``num_neighbours`` ascending, non-negative values.

    The result must be a pandas Series of length 'num_neighbours' whose
    entries are non-negative real numbers in ascending order.

    Not exercised here: that the index keys are accepted by get_vector(),
    and that a query vector already known to the store comes back first
    with value 0.0 (the query used is a fresh random vector).
    """
    vec = HyperplaneHasher._random_vectors(1, self.hhenl.rank)[0]
    nn = 10
    neighbours = self.hhenl.find_neighbours(vec, nn)
    self.assertIsInstance(neighbours, pd.Series)
    self.assertEqual(len(neighbours), nn)
    # Compare plain values positionally.  Series.order() no longer exists in
    # pandas, and comparing two Series with '==' matches entries by label
    # rather than by position, which is not an ordering check.
    distances = neighbours.tolist()
    self.assertEqual(distances, sorted(distances))
    # Iterate the values directly instead of neighbours[i]: an integer key
    # on a label-indexed Series relies on a deprecated positional fallback.
    for distance in distances:
        self.assertGreaterEqual(distance, 0.0)
def test_bulk_label_chamber_ensemble_5(self):
    """Two label lists applied to the same vectors end up in the same chambers.

    Let first = [first_0, ..., first_19] and second = [second_0, ...,
    second_19] be lists of labels, and vecs a list of 20 vectors.  After
    bulk-labelling (vecs, first) and then (vecs, second), every chamber C
    of every hh in self.hhenl.hhs satisfies: first_i in C iff second_i in C.
    """
    vecs = HyperplaneHasher._random_vectors(20, self.hhenl.rank)
    first_ex = re.compile(r'first_([\S]*)')
    second_ex = re.compile(r'second_([\S]*)')
    first = ['first_%i' % i for i in range(20)]
    second = ['second_%i' % i for i in range(20)]
    self.hhenl._bulk_label_chamber_ensemble(vecs, first)
    self.hhenl._bulk_label_chamber_ensemble(vecs, second)

    for hh in self.hhenl.hhs:
        for ch_id in hh.get_chamber_ids():
            labels = hh.chamber_labels(ch_id)
            # Collect the suffix captured after each prefix; labels that do
            # not carry the prefix yield an empty string and are skipped.
            first_labels = set()
            for label in labels:
                suffix = ''.join(first_ex.findall(label))
                if len(suffix) > 0:
                    first_labels.add(suffix)
            second_labels = set()
            for label in labels:
                suffix = ''.join(second_ex.findall(label))
                if len(suffix) > 0:
                    second_labels.add(suffix)
            self.assertEqual(first_labels, second_labels)
def test_label_chamber_ensemble_2(self):
    """A vector one component shorter than the rank is rejected with ValueError."""
    short_rank = self.hhenl.rank - 1
    too_short = HyperplaneHasher._random_vectors(1, short_rank)[0]
    with self.assertRaises(ValueError):
        self.hhenl._label_chamber_ensemble(too_short, 'new_vec_short_id')
def _bulk_list_length_error(self, function):
    """Check that ``function`` raises ValueError when len(vecs) != len(vec_ids).

    ``function`` is called with ten random vectors of the correct rank but
    eleven ids, and is expected to raise ValueError.
    """
    ten_vecs = HyperplaneHasher._random_vectors(10, self.hhenl.rank)
    eleven_ids = self.letters[:11]
    with self.assertRaises(ValueError):
        function(ten_vecs, eleven_ids)
def test_label_chamber_ensemble_2(self):
    """A vector one component shorter than the rank is rejected with ValueError."""
    candidates = HyperplaneHasher._random_vectors(1, self.hhenl.rank - 1)
    undersized = candidates[0]
    label_one = self.hhenl._label_chamber_ensemble
    self.assertRaises(ValueError, label_one, undersized, 'new_vec_short_id')