"""
calls someBrandFiltering to get dict of
Brand ->list of threads associated
then computes dict
flters dit using stop list from files
should save the postlist to out folder for control?
builds corpus
runs LDA
saves corpus, lda model and dict ans alldocs
"""
from __future__ import print_function

import logging
import pickle
import time

import gensim

# local project modules
import genSimLDAlib as gslib
import someBrandFiltering as bf
def main(outdir=r'Z:\ermunds\results\2005 20t unbranded',
         num_passes=2,
         n_repeat=10,
         num_topics=20,
         threadChoseStr='',
         modelTag='2005+',
         time_low_cutoff=time.strptime("1 Jan 2005", "%d %b %Y"),
         time_hi_cutoff=time.strptime("1 Jan 2006", "%d %b %Y"),
         ):
    '''
    time_low_cutoff, time_hi_cutoff -- only posts between these two dates are kept
    threadChoseStr -- filter thread names by this phrase
    '''
    dTr = bf.notMain(threadChoseStr)
    modelName = modelTag + str(num_topics) + 'topics'
    dirs = gslib.LDAdirs(modelName, outdir)
    # write the pickle in binary mode ('wb'); a text-mode append could mangle the stream
    with open(dirs.dataFileName, 'wb') as file1:
        pickle.dump(dTr, file1)
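    # The saved thread dict can be restored later with, e.g.:
    #   with open(dirs.dataFileName, 'rb') as f:
    #       dTr = pickle.load(f)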
    ## set up logging to file and console
logger = logging.getLogger('')
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler(dirs.logFileName)
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)-12s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
logger.addHandler(ch)
logger.addHandler(fh)
    ## get threads, extract post texts, and save them to a single file
    # (roughly 7 min per 1 GB of input)
    logging.info("building doc list")
    lineCounter = 0
    with open(dirs.allDocsFileName, 'a') as docDumpFile:
        for Trlist in dTr.values():
            for Tr in Trlist:
                for p in Tr.getPosts():
                    # keep only posts that fall between the two date cutoffs
                    if time_low_cutoff < p.msgTime < time_hi_cutoff:
                        doc = gslib.textCleanUp(p.msgTitle) + gslib.textCleanUp(p.msgText)
                        lineCounter += 1
                        print(doc, file=docDumpFile)
    logging.info("total {} docs".format(lineCounter))
    ## build the dictionary (roughly 1.5 h per 1 GB of input)
    dict1 = gslib.build_dict(dirs)
    dict1.save(dirs.dictFileName)
    # to reuse a previously saved dictionary instead:
    # dict1 = gensim.corpora.dictionary.Dictionary.load(dirs.dictFileName)
    ## pipe the doc file into a gensim corpus
    # fixme - corpusAdapter is missing a len() property
    corpus = gslib.corpusAdapter(dirs.allDocsFileName, id2word=dict1)
    # serializing to Matrix Market format and reloading yields a corpus that
    # supports len() and repeated iteration, which the multi-pass LDA below needs
    gensim.corpora.MmCorpus.serialize(fname=dirs.corpusFname, corpus=corpus, id2word=dict1)
    mm = gensim.corpora.MmCorpus(dirs.corpusFname)
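    # A hedged sketch for the fixme above: if one wanted to feed corpusAdapter
    # to gensim directly and skip the MmCorpus round trip, a thin wrapper could
    # supply the missing len(). LenCorpus and n_docs are illustrative names,
    # not part of genSimLDAlib, and the wrapper is not used below.
    class LenCorpus(object):
        """Wrap an iterable bag-of-words corpus and report a known doc count."""
        def __init__(self, base_corpus, n_docs):
            self.base_corpus = base_corpus
            self.n_docs = n_docs

        def __iter__(self):
            return iter(self.base_corpus)

        def __len__(self):
            return self.n_docs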
    ## run the LDA (roughly 2 h per update on 2M posts)
    # first fit an initial model, then update it n_repeat-1 more times,
    # saving the result to disk after every round
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dict1,
                                          num_topics=num_topics,
                                          update_every=0, passes=num_passes)
    lda.save(dirs.modelFname + "_0")
    for i in range(n_repeat - 1):
        lda.update(mm)
        # save intermediate result
        lda.save(dirs.modelFname + "_" + str(i + 1))
        for t in lda.show_topics(-1):
            logging.info("all topics here " + str(t))
    lda.save(dirs.modelFname)
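    # The final model can be reloaded later with the standard gensim call, e.g.:
    #   lda = gensim.models.ldamodel.LdaModel.load(dirs.modelFname)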
    logger.removeHandler(ch)
    logger.removeHandler(fh)
    return modelName


if __name__ == '__main__':
    main()