Example #1
import h5py
from collections import Counter
from tqdm import tqdm

# subset_chunk_it and BNCPipe are project-local helpers (see the other examples)

def word_frequencies(args):
    """
    :param args: tuple (fname, data_slice, lemmatize) where fname is the name
        of the hdf5 file containing the corpus, data_slice is a range with the
        subset of the file to be read, and lemmatize toggles lemmatisation
    :return: a Counter with the frequency of the tokens found
    """
    fname, data_slice, lemmatize = args
    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=100)

    pbar = tqdm(total=len(data_slice))

    pipe = BNCPipe(gen, lemmas=lemmatize)
    freq = Counter()

    for tokens in pipe:
        for token in tokens:
            # normalise case so counts are case-insensitive
            normal_token = token.lower()
            freq[normal_token] += 1
        pbar.update(1)

    pbar.close()
    input_hdf5.close()
    return freq
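
Because word_frequencies takes a single args tuple, it drops straight into multiprocessing.Pool.map. A minimal usage sketch, assuming a hypothetical corpus file name and evenly split, non-overlapping slices; the per-worker Counters merge by summation:

from multiprocessing import Pool

if __name__ == '__main__':
    fname = "corpus.hdf5"          # hypothetical file name
    n_rows, n_workers = 100000, 4  # assumed corpus size and worker count
    step = n_rows // n_workers
    slices = [range(i, min(i + step, n_rows)) for i in range(0, n_rows, step)]

    with Pool(n_workers) as pool:
        partials = pool.map(word_frequencies, [(fname, s, True) for s in slices])

    # Counter supports +, so the partial counts reduce to one total
    total = sum(partials, Counter())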
Example #2
    def test_subset_chunk_it(self):
        n_rows = 100

        data = np.arange(n_rows)
        subset = range(50, 100)

        # a chunk size that does not divide the subset evenly exercises
        # the boundary handling of the iterator
        it = subset_chunk_it(data, subset, chunk_size=4)

        for i in subset:
            data_j = next(it)
            self.assertEqual(data[i], data_j)
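
The test pins down the contract: subset_chunk_it(dataset, data_slice, chunk_size) yields dataset[i] for every i in data_slice, one element at a time, while touching the underlying storage one chunk at a time. A minimal sketch that satisfies the test for contiguous ranges (which is how every example here calls it); the project's real implementation may differ:

def subset_chunk_it(dataset, data_slice, chunk_size=100):
    """Yield dataset[i] for each i in data_slice, reading chunk_size rows at once."""
    indices = list(data_slice)
    for start in range(0, len(indices), chunk_size):
        block = indices[start:start + chunk_size]
        # one bulk read per block keeps the number of hdf5 accesses low
        chunk = dataset[block[0]:block[-1] + 1]
        for i in block:
            yield chunk[i - block[0]]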
Example #3
import h5py
from tqdm import tqdm

# subset_chunk_it, WaCKyPipe, Tokenizer, windows, to_bow and np_to_sparse
# are project-local helpers; sign_index is shared module-level state

def text_to_ri(args):
    """
    :param args: tuple (fname, data_slice, window_size)
    :return: dict mapping each target sign id to its accumulated sparse
        bag-of-words vector
    """
    fname, data_slice, window_size = args

    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=250)

    pbar = tqdm(total=len(data_slice))

    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)

    # module-level index shared by every call in this process
    global sign_index
    ri_vectors = dict()

    for tokens in pipe:
        # get sliding windows of the given size
        s_windows = windows(tokens, window_size)

        # encode each window as a bag-of-words and add it to the occurrences
        for window in s_windows:
            bow_vector = to_bow(window, sign_index)
            bow_vector = np_to_sparse(bow_vector)
            sign_id = sign_index.get_id(window.target)

            if sign_id not in ri_vectors:
                ri_vectors[sign_id] = bow_vector
            else:
                # accumulate sparse vectors for targets seen before
                ri_vectors[sign_id] = bow_vector + ri_vectors[sign_id]

        pbar.update(1)

    pbar.close()
    input_hdf5.close()
    return ri_vectors
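
Each worker returns its own ri_vectors dict, so the parent process still has to merge the partial results; vectors that share a sign_id are summed exactly as inside the loop above. A sketch of that reduction (merge_ri_vectors is a hypothetical helper, not part of the project's API):

def merge_ri_vectors(partial_dicts):
    """Merge per-worker {sign_id: sparse vector} dicts, summing duplicates."""
    merged = {}
    for partial in partial_dicts:
        for sign_id, vector in partial.items():
            if sign_id in merged:
                merged[sign_id] = merged[sign_id] + vector
            else:
                merged[sign_id] = vector
    return merged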
Example #4
        else:
            # ignore any other message from the workers
            pass

    print("All Done in root!")

# ======================================================================================
# Slave Node
# ======================================================================================
else:
    subset_slice = comm.recv(source=0)
    print("Node %d: Processing slice: %s" % (comm.rank, subset_slice))

    # open hdf5 file and get the dataset
    corpus_hdf5 = h5py.File(corpus_file, 'r')
    corpus_dataset = corpus_hdf5["sentences"]

    sentences = subset_chunk_it(corpus_dataset, subset_slice, chunk_size=1000)
    pipeline = BNCPipe(datagen=sentences, lemmas=True)

    # consume the pipeline; tqdm reports progress over the assigned slice
    for sentence in tqdm(pipeline, total=len(subset_slice)):
        pass

    # placeholder payload; a real run would send the computed statistics
    dummy_results = np.arange(10, dtype='i')
    comm.send(dummy_results, dest=0, tag=Tags.RESULT)
    time.sleep(2)
    comm.send("Done", dest=0, tag=Tags.FINISHED)

    corpus_hdf5.close()
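
The fragment above starts midway through the root's receive loop, so the step that hands out the slices is not shown. It would look roughly like this sketch, run in the root branch before the loop (comm and the corpus size n_rows are assumed to be defined earlier in the script):

# Root node: carve the corpus into one contiguous slice per worker
n_workers = comm.size - 1
step = n_rows // n_workers
for rank in range(1, comm.size):
    start = (rank - 1) * step
    stop = n_rows if rank == comm.size - 1 else start + step
    comm.send(range(start, stop), dest=rank)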