def build(self, corpus_path, tokenizer, workers_num=1, min_count=1):
    """ Build vocabulary from the given corpus. """
    print("Start %d workers for building vocabulary..." % workers_num)
    lines_num = count_lines(corpus_path)
    pool = Pool(workers_num)
    vocab_list = []
    for i in range(workers_num):
        # Each worker counts words over a contiguous slice of the corpus lines.
        start = i * lines_num // workers_num
        end = (i + 1) * lines_num // workers_num
        vocab_list.append(pool.apply_async(func=self.worker, args=[corpus_path, tokenizer, start, end]))
    pool.close()
    pool.join()

    # Union vocab in all workers.
    w2i, i2w, w2c = self.union(vocab_list)
    # Sort w2c according to word count.
    sorted_w2c = sorted(w2c.items(), key=lambda item: item[1], reverse=True)

    # Add special symbols and remove low frequency words.
    with open(self.reserved_vocab_path, mode="r", encoding="utf-8") as reader:
        # Reserved symbols (special tokens) always occupy the first indices.
        self.i2w = [line.strip().split()[0] for line in reader]
    for i, w in enumerate(self.i2w):
        self.w2i[w] = i
        self.w2c[w] = -1
    for w, c in sorted_w2c:
        # Counts are in descending order, so everything after this is below min_count.
        if c < min_count:
            break
        if w not in self.w2i:
            self.w2i[w], self.w2c[w] = len(self.i2w), c
            self.i2w.append(w)
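# NOTE (illustrative sketch, not part of the original source): the entries of
# vocab_list above are multiprocessing AsyncResult handles, and self.union
# (not shown in this section) is what resolves them. Assuming each worker
# returns a (w2i, i2w, w2c) triple (word-to-index dict, index-to-word list,
# word-to-count dict), a minimal merge could look like the following. It also
# assumes `from multiprocessing import Pool` and a count_lines(path) helper
# exist at module level.
def union(self, vocab_list):
    """ Union the vocabularies built by all workers (sketch). """
    w2i, i2w, w2c = {}, [], {}
    for async_result in vocab_list:
        # .get() blocks until the worker finishes and returns its triple.
        w2i_p, i2w_p, w2c_p = async_result.get()
        for w in i2w_p:
            if w not in w2i:
                w2i[w] = len(i2w)
                i2w.append(w)
                w2c[w] = w2c_p[w]
            else:
                # The same word can appear in several corpus slices.
                w2c[w] += w2c_p[w]
    return w2i, i2w, w2c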
def build_and_save(self, workers_num):
    """
    Build dataset from the given corpus.
    Start workers_num processes and each process deals with a part of data.
    """
    lines_num = count_lines(self.corpus_path)
    print("Starting %d workers for building datasets ... " % workers_num)
    assert workers_num >= 1
    if workers_num == 1:
        # Single-process path: build the whole dataset in this process.
        self.worker(0, 0, lines_num)
    else:
        pool = Pool(workers_num)
        for i in range(workers_num):
            # Each worker writes its own dataset shard for a contiguous slice of lines.
            start = i * lines_num // workers_num
            end = (i + 1) * lines_num // workers_num
            pool.apply_async(func=self.worker, args=[i, start, end])
        pool.close()
        pool.join()

    # Merge datasets produced by the workers.
    merge_dataset(self.dataset_path, workers_num)
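# NOTE (illustrative sketch, not part of the original source): merge_dataset
# is assumed to concatenate the per-worker output files into a single file at
# dataset_path. The shard naming scheme below (dataset_path + "-" + str(i))
# is an assumption for illustration; the real helper may name and clean up
# its intermediate files differently.
import os

def merge_dataset(dataset_path, workers_num):
    """ Concatenate per-worker shards into one dataset file (sketch). """
    with open(dataset_path, "wb") as writer:
        for i in range(workers_num):
            shard_path = dataset_path + "-" + str(i)  # assumed shard name
            with open(shard_path, "rb") as reader:
                while True:
                    chunk = reader.read(1 << 20)  # copy in 1 MiB chunks
                    if not chunk:
                        break
                    writer.write(chunk)
            os.remove(shard_path)  # drop the shard once merged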