Example #1
    def test_chunk_it(self):
        n_rows = 100

        data = np.arange(n_rows)
        it = chunk_it(data, n_rows, 3)

        for i in range(len(data)):
            data_j = next(it)
            self.assertEqual(data[i], data_j)
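
These examples treat chunk_it as a flat iterator over the rows of a dataset that internally reads the data in chunks. The sketch below is a hypothetical stand-in (chunk_it_sketch is not the library's implementation) that mimics the (data, n_rows, chunk_size) call pattern used here:

import numpy as np

def chunk_it_sketch(data, n_rows=None, chunk_size=1):
    # hypothetical stand-in: yield rows one by one while reading `data` in chunks
    n_rows = len(data) if n_rows is None else n_rows
    for start in range(0, n_rows, chunk_size):
        chunk = data[start:start + chunk_size]  # one bulk read per chunk
        for row in chunk:
            yield row

# usage mirroring the test above
data = np.arange(100)
assert all(a == b for a, b in zip(data, chunk_it_sketch(data, 100, 3)))
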
Example #2
def build_vocabulary(corpus_file, output_file=None, max_sentences=0):
    input_hdf5 = h5py.File(corpus_file, 'r')
    #dataset_name = "sentences_lemmatised"
    dataset_name = "sentences"

    dataset = input_hdf5[dataset_name]

    if max_sentences > 0:
        num_sentences = min(max_sentences, len(dataset))
    else:
        num_sentences = len(dataset)

    gen = chunk_it(dataset, num_sentences, chunk_size=250)
    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)
    freq = Counter()

    for tokens in tqdm(pipe, total=num_sentences):
        #tqdm.write(str(tokens))
        for token in tokens:
            normal_token = token.lower()
            freq[normal_token] += 1

    input_hdf5.close()
    tqdm.write("{0} unique words".format(len(freq)))
    # order by frequency
    freq = freq.most_common()

    for i in range(10):
        (w, f) = freq[i]
        print("{0}:{1}".format(w, f))

    if output_file is not None:
        output_hdf5 = h5py.File(output_file, 'w')
        word_ids = range(len(freq))

        # encode each word explicitly as UTF-8 bytes so that h5py can store the array as variable-length strings
        vocabulary = np.array(
            [freq[i][0].encode("utf8") for i in range(len(freq))])

        dt = h5py.special_dtype(vlen=str)
        output_hdf5.create_dataset("vocabulary",
                                   data=vocabulary,
                                   dtype=dt,
                                   compression="gzip")
        print("vocabulary written")

        freq = np.array([freq[i][1] for i in range(len(freq))])
        output_hdf5.create_dataset("frequencies",
                                   data=freq,
                                   compression="gzip")
        print("frequencies written")

        output_hdf5.close()
        print("done")
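
The vocabulary and frequency datasets written above can be read back with plain h5py. A minimal sketch, assuming the same "vocabulary" and "frequencies" dataset names and UTF-8 encoding used in build_vocabulary (load_vocabulary is an illustrative helper name, not part of the library):

import h5py

def load_vocabulary(vocab_file):
    # read the datasets written by build_vocabulary back into (word, count) pairs
    with h5py.File(vocab_file, 'r') as f:
        words = [w.decode("utf8") if isinstance(w, bytes) else w
                 for w in f["vocabulary"][:]]
        counts = [int(c) for c in f["frequencies"][:]]
    return list(zip(words, counts))
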
Example #3
    def test_repeat_fn_exhaust(self):
        n_samples = 4
        repeat = 2
        v = np.random.uniform(0, 1, [n_samples, 1])
        data_it = chunk_it(v, chunk_size=2)

        def it_fn(x): return iter(x)

        # data_it is exhausted after the first pass, so repeat_apply cannot repeat it
        data_it = repeat_apply(it_fn, data_it, repeat)

        # only return 4 items
        self.assertEqual(len(list(data_it)), n_samples)
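
The comment above relies on a basic iterator property: once an iterator is consumed it yields nothing on further passes, so re-applying a function to the same exhausted iterator cannot produce more items. A standalone illustration in plain Python:

it = iter(range(4))
first = list(it)   # consumes the iterator -> [0, 1, 2, 3]
second = list(it)  # already exhausted -> []
assert len(first) == 4 and len(second) == 0
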
Example #4
    def test_repeat_chunk_it(self):
        n_samples = 4
        repeat = 2
        v = np.random.uniform(0, 1, [n_samples, 1])
        data_it = chunk_it(v, chunk_size=2)

        def chunk_fn(x): return chunk_it(x, chunk_size=2)

        # for chunk in data_it:
        #    print(chunk)
        # print(data_it)
        data_it = repeat_apply(chunk_fn, v, repeat)

        self.assertEqual(len(list(data_it)), n_samples * repeat)
Example #5
    def test_chain_shuffle(self):
        n_samples = 4
        repeat = 2
        v = np.arange(0, n_samples, 1)
        data_it = chunk_it(v, chunk_size=2)

        def chunk_fn(x): return chunk_it(x, chunk_size=2)

        # the first stream is in order, the second is a shuffled version of the same two repetitions
        data_it = repeat_apply(chunk_fn, v, repeat)

        data_it = chain_it(data_it, shuffle_it(repeat_apply(chunk_fn, v, repeat), buffer_size=8))

        data = list(data_it)

        unique_data = np.unique(data)
        counts = np.unique(np.bincount(data))

        self.assertEqual(len(unique_data), 4)
        self.assertEqual(len(counts), 1)
        self.assertEqual(counts[0], 4)
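
shuffle_it is used here with a buffer_size, which suggests a buffered shuffle: items are collected into a bounded buffer, shuffled, and emitted, so the stream never has to be fully materialised. The sketch below illustrates that idea only (shuffle_it_sketch is hypothetical, not the library's shuffle_it):

import random

def shuffle_it_sketch(iterable, buffer_size):
    # buffered shuffle: fill a buffer of up to buffer_size items, shuffle it, emit it, repeat
    buffer = []
    for item in iterable:
        buffer.append(item)
        if len(buffer) == buffer_size:
            random.shuffle(buffer)
            yield from buffer
            buffer = []
    random.shuffle(buffer)
    yield from buffer
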
Example #6
    def test_batch_it(self):
        num_samples = 6
        v = np.random.uniform(-1, 1, [num_samples, 2])
        padding = np.zeros([2])

        c_it = chunk_it(v, 6, chunk_size=3)
        print(v)

        batch_size = 4
        b_it = batch_it(c_it, batch_size, padding=True, padding_elem=padding)

        for b in b_it:
            self.assertEqual(len(b), batch_size)
            print(np.array(b))

        b_it = batch_it(v, batch_size)
        last_batch = None
        try:
            for b in b_it:
                last_batch = b
                self.assertEqual(len(b), batch_size)

        except AssertionError:
            self.assertEqual(len(last_batch), num_samples % batch_size)
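
In the padded case above, the test expects every batch to have length 4, so with 6 samples the second batch must hold the 2 remaining rows plus 2 copies of the padding element. A minimal sketch of that pad-to-batch behaviour (batch_it_sketch is hypothetical, not the library's batch_it):

def batch_it_sketch(iterable, batch_size, padding_elem=None):
    # group items into fixed-size batches, padding the last batch when a padding element is given
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        if padding_elem is not None:
            batch.extend([padding_elem] * (batch_size - len(batch)))
        yield batch
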
Example #7
def chunk_fn(x):
    return chunk_it(x, chunk_size=batch_size * 1000)
Example #8
model_file = result_dir + "model_bnc"


# ======================================================================================
# Load Corpus
# ======================================================================================
data_dir = home + "/data/gold_standards/"
corpus_file = data_dir + "wacky_6M.hdf5"

corpus_hdf5 = h5py.File(corpus_file, 'r')
corpus_dataset = corpus_hdf5["sentences"]
# iterates over sentences one at a time, but loads them from HDF5 in chunks
#n_rows = 100000
#sentences = chunk_it(corpus_dataset,n_rows=n_rows, chunk_size=20000)
n_rows = len(corpus_dataset)
sentences = chunk_it(corpus_dataset, chunk_size=100000)

pipeline = WaCKyPipe(datagen=sentences)
# ======================================================================================
# Load Vocabulary
# ======================================================================================
vocab_file = data_dir + "wacky_vocab_6M_spacy.hdf5"
vocab_hdf5 = h5py.File(vocab_file, 'r')

ri_gen = Generator(dim=k, num_active=s)
print("Loading Vocabulary...")
sign_index = TrieSignIndex(ri_gen, list(vocab_hdf5["vocabulary"][:]), pregen_indexes=False)

if subsampling:
    freq = TrieSignIndex.map_frequencies(list(vocab_hdf5["vocabulary"][:]),
                                         list(vocab_hdf5["frequencies"][:]),
Example #9
    return True


def get_window_stream(pipeline):

    if subsampling:
        windows_stream = (windows(list(filter(keep_token, tokens)),
                                  window_size) for tokens in pipeline)
    else:
        windows_stream = (windows(tokens, window_size) for tokens in pipeline)

    return windows_stream


try:
    sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=100000)
    pipeline = BNCPipe(datagen=sentences, lemmas=args.lemmas)

    for epoch in range(args.epochs):
        print("epoch ", epoch + 1)
        i = 0
        x_samples = []
        y_samples = []

        # restart sentence iterator
        sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=10000)
        pipeline.reload(sentences)
        window_stream = get_window_stream(pipeline)

        for windows in tqdm(window_stream, total=n_rows):
            if len(windows) > 0:
Example #10
def get_ngrams():
    for ngram in chunk_it(data, chunk_size=batch_size * 100):
        yield ngram
Example #11
        def chunk_fn(x): return chunk_it(x, chunk_size=2)

        # the first stream is in order, the second is a shuffled version of the same two repetitions
        data_it = repeat_apply(chunk_fn, v, repeat)
Example #12
        def chunk_fn(x): return chunk_it(x, chunk_size=2)

        # for chunk in data_it:
        #    print(chunk)
        # print(data_it)
        data_it = repeat_apply(chunk_fn, v, repeat)