-
Notifications
You must be signed in to change notification settings - Fork 0
/
ldaTrain.py
32 lines (24 loc) · 901 Bytes
/
ldaTrain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
__author__ = 'thomas'
import os
from corpus.textExtractor import TextExtractor
from corpus.bowBuilder import BowBuilder
from lda.ldaCalc import LdaCalc
def _announce(banner):
    """Print a stage banner exactly as given."""
    print(banner)


# Every stage shares one cache directory: "cache" under the current working
# directory of the process that runs this script.
cache_root = os.path.join(os.getcwd(), "cache")

# Stage 1 -- extract the texts from the corpus.
_announce("\nextracting text from corpus ------------------------------------\n")
extractor = TextExtractor(cache_root)
extractor.get_texts()
# extractor.save()  # left commented out: the extractor is not saved here

# Stage 2 -- build BOW vectors from the extracted texts and save the builder.
_announce("\nbuilding BOW vectors from corpus ------------------------------------\n")
bowBuilder = BowBuilder(
    docs=extractor.texts,
    cache_dir=cache_root,
)
bowBuilder.generate_bows()
bowBuilder.save()

# Stage 3 -- run LDA on the builder's `bowVectorCorpus` and `id2word`.
_announce("\ntraining LDA model -----------------------------------------------\n")
lda = LdaCalc(
    bowBuilder.bowVectorCorpus,
    bowBuilder.id2word,
    cache_root,
)
lda.run_lda()

# Stage 4 -- save the model first, then print its topics.
_announce("\nsaving LDA model -----------------------------------------------\n")
lda.save()
lda.print_topics()