# Full preprocessing pipeline for the German corpus: tokenize -> POS tag ->
# lemmatize -> lowercase -> clean, then persist state, build a DTM and fit LDA.
preproc = TMPreproc(corpus.docs, language=u'german')

print('tokenizing...')
preproc.tokenize()

print('POS tagging...')
preproc.pos_tag()

print('lemmatization...')
preproc.lemmatize()

print('lowercase transform...')
preproc.tokens_to_lowercase()

print('cleaning...')
preproc.clean_tokens()

# report elapsed wall-clock time since `start_time` (set earlier in the script)
proc_time = time.time() - start_time
print('-- processing took %f sec. so far' % proc_time)

# persist the preprocessing state so it can be restored without re-running the pipeline
preproc.save_state('data/read_preproc_lda_de_state.pickle')

# show a random sample of 10 (token, POS) pairs per document as a sanity check
print('token samples:')
for doc_label, doc_tokens in preproc.tokens_with_pos_tags.items():
    print("> %s:" % doc_label)
    print(">>", sample(doc_tokens, 10))

print('generating DTM...')
doc_labels, vocab, dtm = preproc.get_dtm()

print("saving DTM data to pickle file '%s'..." % DTM_PICKLE)
save_dtm_to_pickle(dtm, vocab, doc_labels, DTM_PICKLE)

# fit a 30-topic LDA model on the document-term matrix
print("running LDA...")
model = lda.LDA(n_topics=30, n_iter=500)
model.fit(dtm)
# Benchmark individual TMPreproc operations, recording a checkpoint after each
# via add_timing().
preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

# time a deep copy of the preprocessor (workers shut down immediately; the
# copy itself is what is being measured)
preproc_copy = preproc.copy()
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('copy')

# time serializing the state to a temporary pickle file ...
fd, statepickle = mkstemp('.pickle')
preproc.save_state(statepickle)
add_timing('save_state')

# ... and restoring a fresh instance from it
preproc_copy = TMPreproc.from_state(statepickle)
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('from_state')

# time reconstruction from the tokens-with-metadata representation
preproc_copy = TMPreproc.from_tokens(preproc.tokens_with_metadata, language='en')
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('from_tokens')

# time reconstruction from the datatable representation
# NOTE(review): chunk appears cut off here — the matching shutdown/add_timing
# for this instance presumably follows in unseen code
preproc_copy = TMPreproc.from_tokens_datatable(preproc.tokens_datatable, language='en')