import h5py
from collections import Counter
from tqdm import tqdm


def word_frequencies(args):
    """Count token frequencies over a slice of an hdf5 corpus.

    :param args: tuple (fname, data_slice, lemmatize) where
        fname: name of the hdf5 file containing the corpus
        data_slice: a range with the subset of the file to be read
        lemmatize: whether the pipeline should lemmatize the tokens
    :return: a Counter with the frequency of the tokens found
    """
    fname, data_slice, lemmatize = args

    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=100)

    pbar = tqdm(total=len(data_slice))

    pipe = BNCPipe(gen, lemmas=lemmatize)
    freq = Counter()

    for tokens in pipe:
        # normalise to lowercase so counts are case-insensitive
        for token in tokens:
            normal_token = token.lower()
            freq[normal_token] += 1
        pbar.update(1)

    input_hdf5.close()
    return freq
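Because word_frequencies takes a single argument tuple, it maps directly over a pool of worker processes, and the per-slice Counters merge by addition. A minimal driver sketch, assuming a hypothetical corpus file name and row count (corpus.hdf5, n_rows); the project's actual driver may differ:

from collections import Counter
from multiprocessing import Pool

corpus_file = "corpus.hdf5"   # hypothetical file name
n_rows = 100000               # hypothetical corpus size
n_workers = 4

# split the row range into one contiguous slice per worker
step = n_rows // n_workers
slices = [range(i * step, min((i + 1) * step, n_rows))
          for i in range(n_workers)]

with Pool(n_workers) as pool:
    partial_counts = pool.map(word_frequencies,
                              [(corpus_file, s, True) for s in slices])

# Counter supports +, so the partial counts fold into one table
total_freq = sum(partial_counts, Counter())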
def test_subset_chunk_it(self):
    n_rows = 100
    data = np.arange(n_rows)
    subset = range(50, 100)

    # the iterator must yield exactly the rows indexed by subset, in order,
    # regardless of the chunk size used to read them
    it = subset_chunk_it(data, subset, chunk_size=4)
    for i in subset:
        data_j = next(it)
        self.assertEqual(data[i], data_j)
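The test pins down the contract of subset_chunk_it: yield exactly the rows indexed by subset, in order, while reading the underlying dataset at most chunk_size rows per access. A minimal sketch satisfying that contract for contiguous ranges (an assumption; the project's helper may handle arbitrary index sets and hdf5-specific slicing differently):

def subset_chunk_it(dataset, subset, chunk_size=1):
    """Yield the rows of dataset indexed by subset, one at a time,
    reading at most chunk_size rows per dataset access."""
    indices = list(subset)
    for start in range(0, len(indices), chunk_size):
        chunk_indices = indices[start:start + chunk_size]
        # a single slice read per chunk keeps the number of
        # (potentially slow) hdf5 accesses low
        first = chunk_indices[0]
        chunk = dataset[first:chunk_indices[-1] + 1]
        for i in chunk_indices:
            yield chunk[i - first]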
import h5py
from tqdm import tqdm


def text_to_ri(args):
    """Accumulate bag-of-words context vectors over a slice of an hdf5 corpus.

    :param args: tuple (fname, data_slice, window_size)
    :return: a dictionary mapping each sign id to the sum of the sparse
        bag-of-words vectors of the windows centred on that sign
    """
    fname, data_slice, window_size = args

    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=250)

    pbar = tqdm(total=len(data_slice))

    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)

    global sign_index
    ri_vectors = dict()

    for tokens in pipe:
        # get sliding windows of the given size
        s_windows = windows(tokens, window_size)

        # encode each window as a bag-of-words and add it to the occurrences
        for window in s_windows:
            bow_vector = to_bow(window, sign_index)
            bow_vector = np_to_sparse(bow_vector)
            sign_id = sign_index.get_id(window.target)

            if sign_id not in ri_vectors:
                ri_vectors[sign_id] = bow_vector
            else:
                ri_vectors[sign_id] = ri_vectors[sign_id] + bow_vector

        pbar.update(1)

    input_hdf5.close()
    return ri_vectors
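Like word_frequencies, text_to_ri returns a per-slice partial result, so running it over several slices in parallel only needs a reduction step: partial dictionaries merge by summing the sparse vectors that share a sign id. A minimal merge sketch (assuming, as np_to_sparse suggests, that the vectors support +, as scipy sparse matrices do):

def merge_ri_vectors(partial_results):
    """Merge per-slice {sign_id: sparse_vector} dicts by summing the
    vectors accumulated for the same sign id."""
    merged = dict()
    for partial in partial_results:
        for sign_id, vector in partial.items():
            if sign_id in merged:
                merged[sign_id] = merged[sign_id] + vector
            else:
                merged[sign_id] = vector
    return merged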
        else:
            pass
            # print("Received from %d: " % source, data)

    print("All Done in root!")

# ======================================================================================
# Slave Node
# ======================================================================================
else:
    subset_slice = comm.recv(source=0)
    print("Node %d: Processing slice: " % comm.rank, str(subset_slice))

    # open the hdf5 corpus file and get the sentence dataset
    corpus_hdf5 = h5py.File(corpus_file, 'r')
    corpus_dataset = corpus_hdf5["sentences"]
    sentences = subset_chunk_it(corpus_dataset, subset_slice, chunk_size=1000)
    pipeline = BNCPipe(datagen=sentences, lemmas=True)

    # consume the assigned slice (processing placeholder)
    for sentence in tqdm(pipeline, total=len(subset_slice)):
        pass

    # send placeholder results back to the root node
    dummy_results = np.arange(10, dtype='i')
    comm.send(dummy_results, dest=0, tag=Tags.RESULT)
    time.sleep(2)
    comm.send("Done", dest=0, tag=Tags.FINISHED)
    corpus_hdf5.close()
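For context, the root side of this exchange (only partially shown above) has to hand each worker its slice and then drain messages until every worker reports FINISHED. A minimal sketch of that loop with mpi4py, reusing the Tags constants from the slave code; the slice layout and result handling are assumptions, not the project's actual root logic:

from mpi4py import MPI

comm = MPI.COMM_WORLD

if comm.rank == 0:
    n_workers = comm.size - 1
    n_rows = 100000              # hypothetical corpus size, known to the root
    step = n_rows // n_workers

    # hand one contiguous slice to each worker node
    for worker in range(1, comm.size):
        subset_slice = range((worker - 1) * step, worker * step)
        comm.send(subset_slice, dest=worker)

    # drain messages until every worker has reported FINISHED
    finished = 0
    status = MPI.Status()
    while finished < n_workers:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        if status.Get_tag() == Tags.FINISHED:
            finished += 1
        elif status.Get_tag() == Tags.RESULT:
            # accumulate the partial results received from status.Get_source()
            pass
    print("All Done in root!")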