import argparse
import logging
import os
from pathlib import Path

from tqdm import tqdm

# Project-local modules; exact import paths are assumed here.
from corpus import Corpus
from condensed_corpus import CondensedCorpus

# Reconstructed from the options used in main(); the original parser
# definition is not shown in this file.
parser = argparse.ArgumentParser(description='Condense a corpus subset.')
parser.add_argument('filename')
parser.add_argument('min_rowid', nargs='?', type=int, default=None)
parser.add_argument('max_rowid', nargs='?', type=int, default=None)


def main():
    args = parser.parse_args()
    corpus = Corpus.connect_to(args.filename)
    min_rowid = args.min_rowid if args.min_rowid is not None else 1
    max_rowid = int(args.max_rowid) if args.max_rowid is not None else len(corpus)
    assert min_rowid <= max_rowid

    # Name the output after this process's PID so concurrent runs never
    # write to the same database.
    dest_filename = Path('vector-corpus-{}.sqlite3'.format(os.getpid()))
    assert not dest_filename.exists(), dest_filename
    destination = CondensedCorpus.connect_to(dest_filename)

    # Insert every file in the given subset.
    files = corpus.iterate(min_rowid=min_rowid, max_rowid=max_rowid,
                           with_hash=True)
    progress_bar = tqdm(files, initial=min_rowid, total=max_rowid)
    for file_hash, tokens in progress_bar:
        try:
            if len(tokens) == 0:
                logging.warning('Skipping empty file: %s', file_hash)
            else:
                progress_bar.set_description('Processing %s' % (file_hash,))
                destination.insert(file_hash, tokens)
        except KeyboardInterrupt:
            # Record which file we were on, so the run can be resumed.
            logging.exception('Last file before interrupt: %s', file_hash)
            break


if __name__ == '__main__':
    main()
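# A hedged usage sketch, not part of the original script: with the parser
# reconstructed above, condensing rows 1 through 1000 of a corpus database
# would look like
#
#     python condense_corpus.py java-corpus.sqlite3 1 1000
#
# (both filenames here are illustrative). The PID-based output name means
# several such invocations can run in parallel over disjoint rowid ranges
# without clobbering one another's output.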
import sys

import numpy as np
from tqdm import tqdm

from corpus import Corpus  # project-local module; import path assumed


def main(len=len):
    # len is bound as a default argument purely as a local-lookup
    # micro-optimization for the hot loop below.
    _, filename = sys.argv
    corpus = Corpus.connect_to(filename)
    total = len(corpus)
    array = np.empty(total, dtype=np.uint32)
    MAX = 2 ** 32 - 1  # largest value a uint32 can hold
    for i, tokens in enumerate(tqdm(corpus, total=total)):
        n_tokens = len(tokens)
        assert n_tokens <= MAX
        array[i] = n_tokens
        del tokens  # drop the token list promptly to keep memory flat
    np.save('token_lengths', array)


if __name__ == '__main__':
    main()
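# A small sketch of consuming the output (np.save appends the .npy suffix);
# the statistics chosen here are illustrative, not from the original code:
#
#     import numpy as np
#     lengths = np.load('token_lengths.npy')
#     print('mean:', lengths.mean(),
#           '99th percentile:', np.percentile(lengths, 99))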
from pprint import pprint

from blessings import Terminal  # assumed source of `t` below
from tqdm import tqdm

# Project-local modules; exact import paths are assumed here.
from corpus import Corpus
from vocabulary import START_TOKEN, END_TOKEN, stringify_token

t = Terminal()


def summarize(vocab, write_to_file=True):
    # The signature and the first two lines of the body are reconstructed:
    # the original file is truncated just before the `elif`, and the
    # branches colour-code the vocabulary size.
    size = len(vocab)
    if size < 128:
        size = t.green(str(size))
    elif 128 <= size < 256:
        size = t.yellow(str(size))
    else:
        size = t.red(str(size))
    print("The size of vocabulary is", size)

    total_vocab = [START_TOKEN] + sorted(list(vocab)) + [END_TOKEN]
    if not write_to_file:
        return

    filename = 'autogenerated_vocabulary.py'
    with open(filename, 'wt', encoding='utf-8') as vocab_file:
        vocab_file.write('VOCAB = ')
        pprint(total_vocab, stream=vocab_file)


if __name__ == '__main__':
    import sys
    _, filename = sys.argv
    corpus = Corpus.connect_to(filename)
    vocab = set()
    for file_tokens in tqdm(corpus, total=len(corpus)):
        for token in file_tokens:
            vocab.add(stringify_token(token))
    summarize(vocab)
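# Since the vocabulary is written out as a Python module, it can be imported
# straight back; a hedged sketch (the index map is illustrative, not part of
# the original):
#
#     from autogenerated_vocabulary import VOCAB
#     index_of = {token: i for i, token in enumerate(VOCAB)}
#
# By construction, VOCAB[0] is START_TOKEN and VOCAB[-1] is END_TOKEN, with
# the real tokens sorted in between.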