def test_corpus_pass_tmpreproc():
    """Pass a ``Corpus`` instance directly to ``TMPreproc`` and check tokenization."""
    corp = Corpus()
    corp['doc1'] = 'A simple example in simple English.'
    corp['doc2'] = 'It contains only three very simple documents.'
    corp['doc3'] = 'Simply written documents are very brief.'

    tokens = TMPreproc(corp).tokenize().tokens

    # document labels must survive preprocessing unchanged
    assert set(tokens.keys()) == set(corp.keys())
    # 'A simple example in simple English.' -> 6 words plus the final period
    assert len(tokens['doc1']) == 7
# Demo: German-language preprocessing pipeline with TMPreproc.
# Tokenizes, POS-tags and lemmatizes three tiny German documents and prints
# the intermediate results after each step.
from pprint import pprint   # fix: pprint() is called below but was never imported

import pandas as pd

from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.utils import pickle_data

if __name__ == '__main__':   # this is necessary for multiprocessing on Windows!
    corpus = {
        u'doc1': u'Ein einfaches Beispiel in einfachem Deutsch.',
        u'doc2': u'Es enthält nur drei sehr einfache Dokumente.',
        u'doc3': u'Die Dokumente sind sehr kurz.',
    }

    preproc = TMPreproc(corpus, language='german')

    print('tokenized:')
    preproc.tokenize()
    pprint(preproc.tokens)

    # optional stemming step, disabled in this demo:
    # preproc.stem()
    # pprint(preproc.tokens)

    print('POS tagged:')
    preproc.pos_tag()
    pprint(preproc.tokens_with_pos_tags)

    print('lemmatized:')
    preproc.lemmatize()
    pprint(preproc.tokens_with_pos_tags)

    print('lowercase:')
    preproc.tokens_to_lowercase()
# Split the corpus into paragraphs, then show the first five Werther
# paragraphs at each preprocessing stage (raw, tokenized, bigrams).

def _show_first_werther_pars(doc_getter):
    """Print paragraphs 1-5 of 'werther1', fetched via ``doc_getter(label)``."""
    for i in range(1, 6):
        label = u'werther-goethe_werther1-%d' % i
        print(u"par%d (document label '%s'):" % (i, label))
        print(doc_getter(label))

print("-----")
corpus.split_by_paragraphs()
print("documents split into paragraphs")
print(corpus.docs.keys())

print("-----")
print("first 5 paragraphs of Werther:")
_show_first_werther_pars(lambda lbl: corpus.docs[lbl])

print("-----")
preproc = TMPreproc(corpus.docs, language=u'german')
preproc.tokenize().tokens_to_lowercase()

print("tokenized first 5 paragraphs of Werther:")
_show_first_werther_pars(lambda lbl: preproc.tokens[lbl])

preproc.generate_ngrams(2, join=False).use_ngrams_as_tokens(join=True)

print("bigrams from first 5 paragraphs of Werther:")
_show_first_werther_pars(lambda lbl: preproc.tokens[lbl])
Script that generates "eval_table/eval_table.csv" from text samples in folder "eval_texts". This table is later used to manually add correct lemmata. Markus Konrad <*****@*****.**>, Wissenschaftszentrum Berlin für Sozialforschung January 2019 """ import pandas as pd from tmtoolkit.corpus import Corpus from tmtoolkit.preprocess import TMPreproc corpus = Corpus.from_folder('eval_texts') preproc = TMPreproc(corpus.docs, language='german') postagged = preproc.tokenize().pos_tag() postagged = postagged.filter_for_pos({'N', 'V', 'ADJ', 'ADV'}) tok_pos_df = pd.DataFrame() for doc_id, tok_pos in postagged.tokens_with_pos_tags.items(): tok, pos = zip(*tok_pos) tok_pos_df = tok_pos_df.append(pd.DataFrame({ 'doc_id': doc_id, 'token': tok, 'pos': pos }), ignore_index=True) tok_pos_df.drop_duplicates(['token', 'pos'], inplace=True) tok_pos_df.to_csv('eval_table/eval_table.csv')
# Plotting helper for the evaluation results computed below.
from tmtoolkit.topicmod.visualize import plot_eval_results

# Enable INFO-level logging so tmtoolkit reports progress during the
# (potentially long-running) topic model evaluation.
# NOTE(review): `logging`, `pd`, `Corpus`, `TMPreproc`, `gensim` and
# `tm_gensim` must be imported earlier in this file — not visible in this chunk.
logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

print('loading data...')
# pickled pandas DataFrame with columns `sitzung`, `sequence`, `text`
bt18 = pd.read_pickle('data/bt18_sample_1000.pickle')
print('loaded %d documents' % len(bt18))

# document label = "<sitzung>_<sequence>" for each speech record
doc_labels = [u'%s_%s' % info for info in zip(bt18.sitzung, bt18.sequence)]

print('preprocessing data...')
bt18corp = Corpus(dict(zip(doc_labels, bt18.text)))
preproc = TMPreproc(bt18corp, language='german')
# tokenize, stem and clean (stopword/punctuation removal) in one pipeline
preproc.tokenize().stem().clean_tokens()

# re-read labels/texts from the preprocessing result so both lists stay aligned
doc_labels = list(preproc.tokens.keys())
texts = list(preproc.tokens.values())

print('creating gensim corpus...')
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)
gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts]

# evaluate topic models with different parameters
const_params = dict(update_every=0, passes=10)
# topic counts: 10..130 in steps of 10, then 140..180 in steps of 20
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
# common heuristic: alpha = 1/k for each candidate number of topics k
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
# NOTE(review): this call is truncated in the visible chunk — its argument
# list continues beyond this excerpt.
eval_results = tm_gensim.evaluate_topic_models(
""" from pprint import pprint from tmtoolkit.preprocess import TMPreproc import pandas as pd if __name__ == '__main__': # this is necessary for multiprocessing on Windows! corpus = { 'doc1': u'A simple example in simple English.', 'doc2': u'It contains only three very simple documents.', 'doc3': u'Simply written documents are very brief.', } preproc = TMPreproc(corpus, language='english') print('input corpus:') pprint(corpus) print('running preprocessing pipeline...') preproc.tokenize().pos_tag().lemmatize().tokens_to_lowercase().clean_tokens() print('final tokens:') pprint(preproc.tokens) print('DTM:') doc_labels, vocab, dtm = preproc.get_dtm() # using pandas just for a nice tabular output print(pd.DataFrame(dtm.todense(), columns=vocab, index=doc_labels))