Example #1
def get_window_stream(pipeline):
    # subsampling, keep_token, and window_size are expected to be defined in the
    # enclosing scope; pipeline yields one list of tokens per sentence
    if subsampling:
        windows_stream = (windows(list(filter(keep_token, tokens)),
                                  window_size) for tokens in pipeline)
    else:
        windows_stream = (windows(tokens, window_size) for tokens in pipeline)

    return windows_stream
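
Note: every example on this page treats the result of windows(tokens, window_size) as an indexable sequence of window objects with left, target, and right fields. The actual implementation in deepsign.data.iterators is not shown here; a minimal sketch consistent with that usage (the name sliding_windows is illustrative, not the library's) could look like this:

# Minimal sketch of a sliding-window helper compatible with how these examples
# use windows(); the real deepsign.data.iterators.windows may differ.
from collections import namedtuple

Window = namedtuple("Window", ["left", "target", "right"])

def sliding_windows(tokens, window_size=1):
    """Return one Window per token, with up to window_size neighbours on each side."""
    result = []
    for i, target in enumerate(tokens):
        left = tokens[max(0, i - window_size):i]
        right = tokens[i + 1:i + 1 + window_size]
        result.append(Window(left=left, target=target, right=right))
    return result

# sliding_windows(["A", "B", "C"], window_size=1) ->
# [Window(left=[], target='A', right=['B']),
#  Window(left=['A'], target='B', right=['C']),
#  Window(left=['B'], target='C', right=[])]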
Example #2
    def test_bow_dir_create(self):
        data1 = ["A", "B", "C"]
        data2 = ["A", "C", "B"]

        for i in range(len(data1)):
            self.sign_index.add(data1[i])
            self.sign_index.add(data2[i])

        w1 = windows(data1, window_size=2)
        w2 = windows(data2, window_size=2)

        perm = self.perm_generator.matrix()
        v1 = enc.to_bow_dir(w1[0], sign_index=self.sign_index, perm_matrix=perm)
        v2 = enc.to_bow_dir(w2[0], sign_index=self.sign_index, perm_matrix=perm)

        self.assertSetEqual(set(w1[0].right), set(w2[0].right))
        np_test.assert_array_equal(v1, v2)
Example #3
    def test_bow_create(self):
        data = ["A", "B", "A", "C", "A", "B"]

        for s in data:
            self.sign_index.add(s)

        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        word_windows = windows(data, window_size=1)
        vectors = [enc.to_bow(w, self.sign_index) for w in word_windows]
        self.assertEqual(len(vectors), len(word_windows))
Example #4
    def test_bow_ignore_order(self):
        data1 = ["A", "B"]
        data2 = ["B", "A"]

        for s1, s2 in zip(data1, data2):
            self.sign_index.add(s1)
            self.sign_index.add(s2)

        windows1 = windows(data1, window_size=1)
        windows2 = windows(data2, window_size=1)

        v1 = enc.to_bow(windows1[0], self.sign_index)
        v2 = enc.to_bow(windows2[0], self.sign_index)

        np_test.assert_array_equal(v1, v2)

        a_ri = self.sign_index.get_ri("A")
        b_ri = self.sign_index.get_ri("B")

        np_test.assert_array_equal(v1 - a_ri.to_vector(),
                                   b_ri.to_vector())
Example #5
    def test_bow_normalise(self):
        data = ["A", "A"]

        for s in data:
            self.sign_index.add(s)

        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        word_windows = windows(data, window_size=1)
        norm_bow = enc.to_bow(word_windows[0], self.sign_index, normalise=True, include_target=True)
        self.assertEqual(np.max(norm_bow), 1)

        unorm_bow = enc.to_bow(word_windows[0], self.sign_index, normalise=False, include_target=True)
        self.assertEqual(np.max(unorm_bow), 2)
Example #6
def text_to_ri(args):
    (fname, data_slice, window_size) = args

    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=250)

    pbar = tqdm(total=len(data_slice))

    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)

    global sign_index
    ri_vectors = dict()

    for tokens in pipe:
        # get sliding windows of given size
        s_windows = windows(tokens, window_size)

        # encode each window as a bag-of-words and add it to the occurrence counts
        for window in s_windows:
            # pbar.write(str(window))
            # lock.acquire()
            bow_vector = to_bow(window, sign_index)
            # lock.release()
            bow_vector = np_to_sparse(bow_vector)
            sign_id = sign_index.get_id(window.target)

            if sign_id not in ri_vectors:
                ri_vectors[sign_id] = bow_vector
            else:
                current_vector = ri_vectors[sign_id]
                ri_vectors[sign_id] = bow_vector + current_vector

        pbar.update(1)

    return ri_vectors
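
text_to_ri reads like a worker function: it unpacks an args tuple, reads a global sign_index, and returns one partial dictionary per data slice. A hypothetical reduce step, not part of the original source, that merges such partial results by summing the sparse vectors per sign id:

# Hypothetical reduce step (not in the original source): merge the per-slice
# dictionaries returned by text_to_ri, summing the sparse vectors per sign id.
def merge_ri_results(partial_results):
    merged = {}
    for partial in partial_results:
        for sign_id, vector in partial.items():
            if sign_id in merged:
                merged[sign_id] = merged[sign_id] + vector
            else:
                merged[sign_id] = vector
    return merged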
Example #7
# ======================================================================================
try:
    # init model variables
    tf_session.run(var_init)


    def keep_token(token):
        fw = freq[sign_index.get_id(token)]
        p = ss_prob(fw, total_freq)
        if np.random.rand() < p:
            return False
        return True


    if subsampling:
        windows_stream = (windows(list(filter(keep_token, tokens)), window_size) for tokens in pipeline)
    else:
        windows_stream = (windows(tokens, window_size) for tokens in pipeline)

    i = 0
    x_samples = []
    c_samples = []

    for sentence_windows in tqdm(windows_stream, total=n_rows):
        if len(sentence_windows) > 0:
            # one (target, ctx) sample per window
            for window in sentence_windows:
                target = sign_index.get_ri(window.target).to_vector()
                ctx = to_bow(window, sign_index, include_target=False, normalise=True)

                x_samples.append(target)
Example #8
# ===============================================
#                      TRAINING
# ===============================================
print(vocab_ids)
x_samples = []
y_samples = []

losses = []
epochs = 100
with tf.Session() as ss:
    ss.run(init)
    #print(ss.nrp(normalized_embeddings))
    for i, epoch in enumerate(repeat(sentences, epochs)):
        for sentence in epoch:
            sentence_windows = windows(sentence, window_size=1)
            for window in sentence_windows:
                labels = window.left + window.right
                target = window.target
                for label in labels:
                    # note: x_samples / y_samples accumulate across windows, so each
                    # step below trains on every (target, label) pair seen so far
                    x_samples.append([vocab_ids[target]])
                    y_samples.append([vocab_ids[label]])

                #print(np.asmatrix(y_samples))
                _, current_loss = ss.run([train_step, loss], {
                    in_placeholder: x_samples,
                    label_placeholder: y_samples
                })

                losses.append(current_loss)
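
For clarity, the loop above emits one (target, label) id pair per neighbouring token in each window. A small illustration, separate from the example, of the pairs this produces for a toy sentence, assuming one neighbour on each side and a hypothetical vocab_ids mapping:

# Illustration only: the (target, label) id pairs the loop above would feed to
# the model for a toy sentence with window_size=1.
sentence = ["the", "cat", "sat"]
vocab_ids = {"the": 0, "cat": 1, "sat": 2}

pairs = []
for i, target in enumerate(sentence):
    neighbours = sentence[max(0, i - 1):i] + sentence[i + 1:i + 2]
    for label in neighbours:
        pairs.append((vocab_ids[target], vocab_ids[label]))

print(pairs)  # [(0, 1), (1, 0), (1, 2), (2, 1)]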
Example #9
import os

import h5py
from tqdm import tqdm

from deepsign.data.corpora.pipe import BNCPipe
from deepsign.rp.encode import to_bow
from deepsign.rp.index import SignIndex, Generator
from deepsign.data.iterators import chunk_it, windows

home = os.getenv("HOME")

data_dir = home + "/data/gold_standards/"
corpus_file = data_dir + "bnc.hdf5"

corpus_hdf5 = h5py.File(corpus_file, 'r')
corpus_dataset = corpus_hdf5["sentences"]

n_rows = 1000
sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=100000)
pipeline = BNCPipe(datagen=sentences, lemmas=True)

ri_gen = Generator(1000, 10)
index = SignIndex(ri_gen)

for s in tqdm(pipeline, total=n_rows):
    index.add_all(s)

    sentence_windows = windows(s, window_size=2)

    for window in sentence_windows:
        #words = window.left + window.right
        #ris = [index.get_ri(word).to_vector() for word in words]
        bow = to_bow(window, index, include_target=False, normalise=True)