def get_window_stream(pipeline):
    if subsampling:
        windows_stream = (windows(list(filter(keep_token, tokens)), window_size)
                          for tokens in pipeline)
    else:
        windows_stream = (windows(tokens, window_size) for tokens in pipeline)
    return windows_stream
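# For reference, a minimal sketch of what a sliding-window helper like windows()
# is assumed to produce in this code: one window per token, carrying `left`,
# `target` and `right` attributes. This is an illustrative stand-in, not the
# actual implementation from deepsign.data.iterators.
from collections import namedtuple

WindowSketch = namedtuple("WindowSketch", ["left", "target", "right"])


def sliding_windows_sketch(tokens, window_size=1):
    """Return one window per token, with up to window_size neighbours on each side."""
    result = []
    for i, target in enumerate(tokens):
        left = tokens[max(0, i - window_size):i]
        right = tokens[i + 1:i + 1 + window_size]
        result.append(WindowSketch(left, target, right))
    return result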
def test_bow_dir_create(self):
    data1 = ["A", "B", "C"]
    data2 = ["A", "C", "B"]

    for i in range(len(data1)):
        self.sign_index.add(data1[i])
        self.sign_index.add(data2[i])

    w1 = windows(data1, window_size=2)
    w2 = windows(data2, window_size=2)

    perm = self.perm_generator.matrix()

    v1 = enc.to_bow_dir(w1[0], sign_index=self.sign_index, perm_matrix=perm)
    v2 = enc.to_bow_dir(w2[0], sign_index=self.sign_index, perm_matrix=perm)

    self.assertSetEqual(set(w1[0].right), set(w2[0].right))
    np_test.assert_array_equal(v1, v2)
def test_bow_create(self):
    data = ["A", "B", "A", "C", "A", "B"]
    for s in data:
        self.sign_index.add(s)

    unique_str = set(data)
    self.assertEqual(len(self.sign_index), len(unique_str))

    # use a distinct name to avoid shadowing the windows() helper
    s_windows = windows(data, window_size=1)
    vectors = [enc.to_bow(w, self.sign_index) for w in s_windows]
    self.assertEqual(len(vectors), len(s_windows))
def test_bow_ignore_order(self):
    data1 = ["A", "B"]
    data2 = ["B", "A"]

    for s1, s2 in zip(data1, data2):
        self.sign_index.add(s1)
        self.sign_index.add(s2)

    windows1 = windows(data1, window_size=1)
    windows2 = windows(data2, window_size=1)

    v1 = enc.to_bow(windows1[0], self.sign_index)
    v2 = enc.to_bow(windows2[0], self.sign_index)

    np_test.assert_array_equal(v1, v2)

    a_ri = self.sign_index.get_ri("A")
    b_ri = self.sign_index.get_ri("B")
    np_test.assert_array_equal(v1 - a_ri.to_vector(), b_ri.to_vector())
def test_bow_normalise(self):
    data = ["A", "A"]
    for s in data:
        self.sign_index.add(s)

    unique_str = set(data)
    self.assertEqual(len(self.sign_index), len(unique_str))

    # use a distinct name to avoid shadowing the windows() helper
    s_windows = windows(data, window_size=1)

    norm_bow = enc.to_bow(s_windows[0], self.sign_index, normalise=True, include_target=True)
    self.assertEqual(np.max(norm_bow), 1)

    unorm_bow = enc.to_bow(s_windows[0], self.sign_index, normalise=False, include_target=True)
    self.assertEqual(np.max(unorm_bow), 2)
def text_to_ri(args):
    (fname, data_slice, window_size) = args

    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=250)

    pbar = tqdm(total=len(data_slice))

    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)

    global sign_index
    ri_vectors = dict()

    for tokens in pipe:
        # get sliding windows of the given size
        s_windows = windows(tokens, window_size)

        # encode each window as a bag-of-words and accumulate the occurrences
        for window in s_windows:
            # pbar.write(str(window))
            # lock.acquire()
            bow_vector = to_bow(window, sign_index)
            # lock.release()
            bow_vector = np_to_sparse(bow_vector)
            sign_id = sign_index.get_id(window.target)

            if sign_id not in ri_vectors:
                ri_vectors[sign_id] = bow_vector
            else:
                current_vector = ri_vectors[sign_id]
                ri_vectors[sign_id] = bow_vector + current_vector

        pbar.update(1)

    return ri_vectors
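# Sketch only: text_to_ri takes a single args tuple and reads a global
# sign_index, which suggests it is intended to be dispatched to worker
# processes (e.g. via multiprocessing.Pool.map) over disjoint slices of the
# corpus. The driver code is not shown here; merge_ri_vectors_sketch below is
# a hypothetical helper illustrating how the per-worker dictionaries could be
# combined by summing the vectors that share a sign id.
def merge_ri_vectors_sketch(partial_results):
    merged = dict()
    for partial in partial_results:
        for sign_id, vector in partial.items():
            if sign_id in merged:
                merged[sign_id] = merged[sign_id] + vector
            else:
                merged[sign_id] = vector
    return merged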
# ======================================================================================
try:
    # init model variables
    tf_session.run(var_init)

    def keep_token(token):
        fw = freq[sign_index.get_id(token)]
        p = ss_prob(fw, total_freq)
        if np.random.rand() < p:
            return False
        return True

    if subsampling:
        windows_stream = (windows(list(filter(keep_token, tokens)), window_size)
                          for tokens in pipeline)
    else:
        windows_stream = (windows(tokens, window_size) for tokens in pipeline)

    i = 0
    x_samples = []
    c_samples = []

    # the stream yields one list of windows per sentence; use a distinct name
    # to avoid shadowing the windows() helper
    for window_list in tqdm(windows_stream, total=n_rows):
        if len(window_list) > 0:
            # list of (target, ctx)
            for window in window_list:
                target = sign_index.get_ri(window.target).to_vector()
                ctx = to_bow(window, sign_index, include_target=False, normalise=True)

                x_samples.append(target)
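# Sketch only: ss_prob() used by keep_token above is defined elsewhere in this
# code. Assuming a word2vec-style subsampling scheme (an assumption, not taken
# from this file), the discard probability for a token with corpus frequency fw
# would look roughly like the following, where t is a small threshold.
import numpy as np


def ss_prob_sketch(fw, total_freq, t=1e-5):
    f = fw / total_freq                     # relative frequency of the token
    return max(0.0, 1.0 - np.sqrt(t / f))   # frequent tokens are discarded more often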
# ===============================================
# TRAINING
# ===============================================
print(vocab_ids)

x_samples = []
y_samples = []
losses = []
epochs = 100

with tf.Session() as ss:
    ss.run(init)
    # print(ss.run(normalized_embeddings))

    for i, epoch in enumerate(repeat(sentences, epochs)):
        for sentence in epoch:
            # use a distinct name to avoid rebinding the windows() helper
            sentence_windows = windows(sentence, window_size=1)
            for window in sentence_windows:
                labels = window.left + window.right
                target = window.target
                for label in labels:
                    x_samples.append([vocab_ids[target]])
                    y_samples.append([vocab_ids[label]])

        # print(np.asmatrix(y_samples))
        _, current_loss = ss.run([train_step, loss], {
            in_placeholder: x_samples,
            label_placeholder: y_samples
        })
        losses.append(current_loss)
import os

import h5py
from tqdm import tqdm

from deepsign.data.corpora.pipe import BNCPipe
from deepsign.rp.encode import to_bow
from deepsign.rp.index import SignIndex, Generator
from deepsign.data.iterators import chunk_it, windows

home = os.getenv("HOME")
data_dir = home + "/data/gold_standards/"
corpus_file = data_dir + "bnc.hdf5"

corpus_hdf5 = h5py.File(corpus_file, 'r')
corpus_dataset = corpus_hdf5["sentences"]

n_rows = 1000
sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=100000)
pipeline = BNCPipe(datagen=sentences, lemmas=True)

ri_gen = Generator(1000, 10)
index = SignIndex(ri_gen)

for s in tqdm(pipeline, total=n_rows):
    index.add_all(s)

    # use a distinct name to avoid rebinding the windows() helper
    s_windows = windows(s, window_size=2)

    for window in s_windows:
        # words = window.left + window.right
        # ris = [index.get_ri(word).to_vector() for word in words]
        bow = to_bow(window, index, include_target=False, normalise=True)