forked from ajbc/lda-svi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_to_tmv_db.py
executable file
·102 lines (79 loc) · 2.73 KB
/
process_to_tmv_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/python
# Copyright (C) 2014 Allison Chaney
import cPickle, sys
from os.path import join
import onlineldavb
import generalrandom
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description = \
'Fit LDA to a set of documents with online VB.')
parser.add_argument('fit_path', type=str, \
help = 'path to the fit directory')
parser.add_argument('tmv_path', type=str, \
help = 'path to tmv source template (e.g. \'../tmv/BasicBrowser\')')
# parse the arguments
args = parser.parse_args()
sys.path.append(args.tmv_path)
import db
# load model settings: vocab, K, docgen
print "loading model settings"
f = open(join(args.fit_path, 'settings.pickle'))
vocab, K, docgen = cPickle.load(f)
f.close()
# load model itself, the olda object
print "loading model"
f = open(join(args.fit_path, 'olda.pickle'))
olda = cPickle.load(f)
f.close()
# Add terms and topics to the DB
print "initializing db"
db.init()
print "adding vocab terms"
db.add_terms(vocab)
print "adding",K,"topics"
db.add_topics(K)
# write out the final topics to the db
print "writing out final topics to tmv db"
for topic in range(len(olda._lambda)):
topic_terms_array = []
lambda_sum = sum(olda._lambda[topic])
for term in range(len(olda._lambda[topic])):
topic_terms_array.append((term, \
olda._lambda[topic][term]/lambda_sum))
db.update_topic_terms(topic, topic_terms_array)
# do a final pass over all documents
print "doing a final E step over all documents"
per_time = dict()
i = 0
import time
s = time.time()
D = 1850000 #TODO: this should be read in from settings
for filename, alltxt, title, subtitle in docgen:
length = 0
for word in alltxt.split():
if word in vocab:
length += 1
# TODO: this should be done less hackishly
t = int(filename.split('/')[6])
if length == 0:
continue
db.add_doc(title, subtitle, length, filename, t)
(gamma, ss) = olda.do_e_step(alltxt)
if t not in per_time:
per_time[t] = ss
else:
per_time[t] += ss
db.add_doc_topics(filename, gamma.tolist()[0])
if i % 100 == 0:
tn = (time.time() - s) / 3600
rem = D - i
time_rem = rem * (tn) / (i+1)
print "doc %d (%d)" % (i, t), tn, str(time_rem)+'h', \
str(time_rem/24)+'d'
i += 1
# slice up topics by time
print "calculating time-slice topics"
for t in per_time:
per_time[t] += olda._eta
db.add_time_topics(t, per_time[t])