def test_contains_id(self):
    """A freshly built index must not report id 0 as present."""
    index = SignIndex(Generator(100, 10))

    # nothing has been added yet, so no id can be registered
    unknown_id = 0
    self.assertFalse(index.contains_id(unknown_id))
def test_get(self):
    """The entry returned for an added sign is a RandomIndex of the configured dim."""
    n_dims, n_active = 100, 10
    index = SignIndex(Generator(n_dims, n_active))
    index.add("0")

    stored = index.get_ri("0")
    self.assertIsInstance(stored, RandomIndex)
    self.assertEqual(stored.dim, n_dims)
def test_contains(self):
    """`contains` follows additions and removals of a sign."""
    index = SignIndex(generator=Generator(100, 10))

    index.add("0")
    self.assertTrue(index.contains("0"))
    # a sign that was never added is not reported
    self.assertFalse(index.contains("1"))

    index.remove("0")
    self.assertFalse(index.contains("0"))
class MyTestCase(unittest.TestCase):
    """Smoke tests that feed random indexes to ``ris_to_sp_tensor_value``.

    Both tests only print the encoded result; they make no assertions on it.
    """

    def setUp(self):
        """Create a generator (dim=10, 4 active) and a sign index built on it."""
        self.generator = Generator(dim=10, num_active=4)
        self.sign_index = SignIndex(self.generator)

    def test_encode_sp_create(self):
        """Encode the random indexes of two signs with default settings."""
        tokens = ["A", "B"]
        for token in tokens:
            self.sign_index.add(token)

        random_indexes = [self.sign_index.get_ri(token) for token in tokens]
        encoded = ris_to_sp_tensor_value(random_indexes, self.sign_index.feature_dim())
        print(encoded)

    def test_encode_sp_positive(self):
        """Encode the random indexes of two signs with ``all_positive=True``."""
        tokens = ["A", "B"]
        for token in tokens:
            self.sign_index.add(token)

        random_indexes = [self.sign_index.get_ri(token) for token in tokens]
        encoded = ris_to_sp_tensor_value(
            random_indexes,
            self.sign_index.feature_dim(),
            all_positive=True,
        )
        print(encoded)
def setUp(self):
    """Create the generator (dim=10, 4 active) and the sign index used by each test."""
    self.generator = Generator(dim=10, num_active=4)
    self.sign_index = SignIndex(self.generator)
def setUp(self):
    """Create the generator, sign index and permutation generator shared by the tests."""
    n_dims = 10
    self.generator = Generator(dim=n_dims, num_active=2)
    self.sign_index = SignIndex(self.generator)
    # the permutation generator is built with the same dimension as the generator
    self.perm_generator = PermutationGenerator(dim=n_dims)
class TestEncode(unittest.TestCase):
    """Tests for the bag-of-words encoders ``enc.to_bow`` and ``enc.to_bow_dir``."""

    def setUp(self):
        """Create the generator, sign index and permutation generator for each test."""
        dim = 10
        act = 2
        self.generator = Generator(dim=dim, num_active=act)
        self.sign_index = SignIndex(self.generator)
        self.perm_generator = PermutationGenerator(dim=dim)

    def test_bow_create(self):
        """One vector is produced per window of the input sequence."""
        data = ["A", "B", "A", "C", "A", "B"]
        for s in data:
            self.sign_index.add(s)

        # duplicated signs are stored once
        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        # The result must not be bound to the name `windows`: assigning to that
        # name inside the method would make it local and the call itself would
        # raise UnboundLocalError.
        data_windows = windows(data, window_size=1)
        vectors = [enc.to_bow(w, self.sign_index) for w in data_windows]
        self.assertEqual(len(vectors), len(data_windows))

    def test_bow_normalise(self):
        """Check the maximum bow value with and without ``normalise``."""
        data = ["A", "A"]
        for s in data:
            self.sign_index.add(s)

        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        data_windows = windows(data, window_size=1)
        first_window = data_windows[0]

        norm_bow = enc.to_bow(first_window, self.sign_index,
                              normalise=True, include_target=True)
        self.assertEqual(np.max(norm_bow), 1)

        unorm_bow = enc.to_bow(first_window, self.sign_index,
                               normalise=False, include_target=True)
        self.assertEqual(np.max(unorm_bow), 2)

    def test_bow_ignore_order(self):
        """The bow vector of a window does not depend on the order of its signs."""
        data1 = ["A", "B"]
        data2 = ["B", "A"]

        # Walk both sequences in lockstep. The previous `in data1, data2` form
        # iterated over the two lists themselves and only worked because each
        # list happened to contain exactly two items.
        for s1, s2 in zip(data1, data2):
            self.sign_index.add(s1)
            self.sign_index.add(s2)

        windows1 = windows(data1, window_size=1)
        windows2 = windows(data2, window_size=1)

        v1 = enc.to_bow(windows1[0], self.sign_index)
        v2 = enc.to_bow(windows2[0], self.sign_index)
        np_test.assert_array_equal(v1, v2)

        # removing A's vector from the bow must leave exactly B's vector
        a_ri = self.sign_index.get_ri("A")
        b_ri = self.sign_index.get_ri("B")
        np_test.assert_array_equal(v1 - a_ri.to_vector(), b_ri.to_vector())

    def test_bow_dir_create(self):
        """Windows with the same right-hand signs in a different order encode equally."""
        data1 = ["A", "B", "C"]
        data2 = ["A", "C", "B"]

        for s1, s2 in zip(data1, data2):
            self.sign_index.add(s1)
            self.sign_index.add(s2)

        w1 = windows(data1, window_size=2)
        w2 = windows(data2, window_size=2)

        perm = self.perm_generator.matrix()
        v1 = enc.to_bow_dir(w1[0], sign_index=self.sign_index, perm_matrix=perm)
        v2 = enc.to_bow_dir(w2[0], sign_index=self.sign_index, perm_matrix=perm)

        # both first windows hold the same signs on the right, in a different order
        self.assertSetEqual(set(w1[0].right), set(w2[0].right))
        np_test.assert_array_equal(v1, v2)
# NOTE(review): the statements up to `return dataset` are the tail of a function
# whose header is outside this view; they are kept indented as a function body.
    result_path = home + "/data/results/"
    corpus_file = home + corpus_file
    print("Reading hdf5 dataset from: ", corpus_file)
    dataset_name = "sentences_lemmatised"
    # open hdf5 file and get the dataset
    # (the file handle is not closed here; the returned dataset is read from it)
    h5f = h5py.File(corpus_file, 'r')
    dataset = h5f[dataset_name]
    return dataset


# do something with the dataset
# Create Sign RI Index
ri_gen = Generator(dim=ri_dim, num_active=ri_num_active)
sign_index = SignIndex(ri_gen)
max_sentences = 200000


def load_spacy():
    """Build an ``English`` pipeline, print how long it took, and return it.

    Returns:
        The ``English`` instance created with entity recognition and vector
        loading disabled, and the parser and tagger enabled.
    """
    t0 = time.time()
    # load tokenizer only
    # NOTE(review): parser=True and tagger=True are passed despite the
    # "tokenizer only" remark above — confirm which one is intended.
    nlp = English(entity=False, load_vectors=False, parser=True, tagger=True)
    t1 = time.time()
    print("Done: {0:.2f} secs ".format(t1 - t0))
    return nlp


# built once at module level, as soon as this code runs
nlp = load_spacy()
def test_size(self):
    """`len` of the index follows additions, duplicate additions and removals."""
    index = SignIndex(generator=Generator(100, 10))

    # a new index is empty; the first sign brings the size to one
    self.assertEqual(len(index), 0)
    index.add("0")
    self.assertEqual(len(index), 1)
    # the next id to hand out is one past the id given to the first sign
    self.assertEqual(index.nextID, index.get_id("0") + 1)

    # adding a sign that is already present leaves the size unchanged
    index.add("0")
    self.assertEqual(len(index), 1)
    index.add("1")
    self.assertEqual(len(index), 2)

    # removing a sign shrinks the index by exactly one
    n_before = len(index)
    index.remove("0")
    self.assertEqual(len(index), n_before - 1)
from deepsign.data.corpora.pipe import BNCPipe
from deepsign.rp.encode import to_bow
from deepsign.rp.index import SignIndex, Generator
from deepsign.data.iterators import chunk_it, windows

# Script: stream sentences from the BNC HDF5 corpus, index every sign and
# encode each window of every sentence as a normalised bag-of-words vector.

home = os.getenv("HOME")
data_dir = home + "/data/gold_standards/"
corpus_file = data_dir + "bnc.hdf5"

n_rows = 1000

ri_gen = Generator(1000, 10)
index = SignIndex(ri_gen)

# The context manager closes the HDF5 file once the corpus has been consumed,
# even if processing fails part-way through.
with h5py.File(corpus_file, 'r') as corpus_hdf5:
    corpus_dataset = corpus_hdf5["sentences"]
    sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=100000)
    pipeline = BNCPipe(datagen=sentences, lemmas=True)

    for s in tqdm(pipeline, total=n_rows):
        index.add_all(s)

        # Keep the result under its own name: rebinding `windows` would replace
        # the imported function and break the call on the next sentence.
        sentence_windows = windows(s, window_size=2)

        # NOTE(review): the original layout of this loop was lost; encoding each
        # window inside the loop is assumed — confirm.
        for window in sentence_windows:
            # words = window.left + window.right
            # ris = [index.get_ri(word).to_vector() for word in words]
            bow = to_bow(window, index, include_target=False, normalise=True)