import logging
import os
from pathlib import Path

from tqdm import tqdm

# Corpus and CondensedCorpus, as well as the argparse parser, are
# defined elsewhere in the source module.


def main():
    args = parser.parse_args()
    corpus = Corpus.connect_to(args.filename)

    min_rowid = args.min_rowid if args.min_rowid is not None else 1
    max_rowid = args.max_rowid if args.max_rowid is not None else len(corpus)
    assert min_rowid <= max_rowid

    dest_filename = Path('vector-corpus-{}.sqlite3'.format(os.getpid()))
    assert not dest_filename.exists(), dest_filename
    destination = CondensedCorpus.connect_to(dest_filename)

    # Insert every file in the given subset.
    files = corpus.iterate(min_rowid=min_rowid,
                           max_rowid=max_rowid,
                           with_hash=True)
    progress_bar = tqdm(files, initial=min_rowid, total=max_rowid)

    for file_hash, tokens in progress_bar:
        try:
            if len(tokens) == 0:
                logging.warning('Skipping empty file: %s', file_hash)
            else:
                progress_bar.set_description('Processing %s' % (file_hash, ))
                destination.insert(file_hash, tokens)
        except KeyboardInterrupt:
            logging.exception('Last file before interrupt: %s', file_hash)
            break
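
The snippet reads a module-level `parser` that the page does not show. Below is a minimal argparse sketch consistent with the three attributes the code uses (`filename`, `min_rowid`, `max_rowid`); the help strings are assumptions, not taken from the original:

import argparse

# Hypothetical reconstruction: only the attributes read above are defined.
parser = argparse.ArgumentParser()
parser.add_argument('filename', help='path to the source corpus database')
parser.add_argument('--min-rowid', type=int, default=None,
                    help='first rowid to copy (defaults to 1)')
parser.add_argument('--max-rowid', type=int, default=None,
                    help='last rowid to copy (defaults to len(corpus))')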
Example #2
import sys

import numpy as np
from tqdm import tqdm


def main(len=len):
    # Binding len as a keyword default makes it a local variable,
    # a micro-optimization for the tight loop below.
    _, filename = sys.argv
    corpus = Corpus.connect_to(filename)

    total = len(corpus)
    # One 32-bit token count per file in the corpus.
    array = np.empty(total, dtype=np.uint32)

    MAX = 2**32 - 1  # the largest value a uint32 can hold

    for i, tokens in enumerate(tqdm(corpus, total=total)):
        n_tokens = len(tokens)
        assert n_tokens <= MAX
        array[i] = n_tokens
        del tokens

    np.save('token_lengths', array)
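
Since np.save appends the .npy extension, the counts land in token_lengths.npy and can be inspected afterwards. A short sketch of such a follow-up (the statistics chosen here are illustrative, not part of the original):

import numpy as np

# Load the per-file token counts written by the script above.
lengths = np.load('token_lengths.npy')
print('files counted:', lengths.size)
print('median tokens per file:', np.median(lengths))
print('95th percentile:', np.percentile(lengths, 95))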
Example #3
from pprint import pprint

from tqdm import tqdm


def summarize(vocab, write_to_file=True):
    # t is assumed to be a terminal-coloring helper (e.g., a blessings
    # Terminal) defined, like Corpus and stringify_token, elsewhere in
    # the source module.
    size = len(vocab)
    if size < 128:
        size = t.green(str(size))
    elif 128 <= size < 256:
        size = t.yellow(str(size))
    else:
        size = t.red(str(size))

    print("The size of vocabulary is", size)
    total_vocab = [START_TOKEN] + sorted(vocab) + [END_TOKEN]

    if not write_to_file:
        return

    filename = 'autogenerated_vocabulary.py'
    with open(filename, 'wt', encoding='utf-8') as vocab_file:
        vocab_file.write('VOCAB = ')
        pprint(total_vocab, stream=vocab_file)


if __name__ == '__main__':
    import sys
    _, filename = sys.argv
    corpus = Corpus.connect_to(filename)

    vocab = set()

    for file_tokens in tqdm(corpus, total=len(corpus)):
        for token in file_tokens:
            vocab.add(stringify_token(token))

    summarize(vocab)
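
Because the output file is a single VOCAB = assignment followed by a pretty-printed list, it is itself valid Python. Assuming the script above has been run in the current directory, the vocabulary can be loaded back with a plain import:

# Reads autogenerated_vocabulary.py, written by summarize() above.
from autogenerated_vocabulary import VOCAB

print(len(VOCAB), 'entries, including the start and end tokens')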