forked from nOkuda/activetm
/
active_demo.py
122 lines (105 loc) · 3.85 KB
/
active_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Runs a demo of the active learning experiment"""
import argparse
import time
import datetime
import numpy
import scipy.sparse
import random
import ankura
from ankura import tokenize
from activetm.active import evaluate
from activetm.active import select
from activetm.tech.sampler import slda
# Corpus and stopword locations (cluster-specific absolute paths)
SOTU_GLOB = '/net/roi/okuda/state_of_the_union/quarter/*'
ENGL_STOP = '/net/roi/okuda/data/stopwords.txt'
# Ankura preprocessing pipeline, applied in order by ankura.run_pipeline:
# read the corpus, remove stopwords, filter very rare (< 5 docs) and very
# common (> 1500 docs) words, drop documents with fewer than 5 tokens,
# and pregenerate per-document token lists.
PIPELINE = [(ankura.read_glob, SOTU_GLOB, tokenize.simple),
            (ankura.filter_stopwords, ENGL_STOP),
            (ankura.filter_rarewords, 5),
            (ankura.filter_commonwords, 1500),
            (ankura.filter_smalldocs, 5),
            (ankura.pregenerate_doc_tokens,)]
# File mapping document title -> numeric label (timestamp) per line
SOTU_LABELS = '/net/roi/okuda/state_of_the_union/ankura_quarter_timestamps.data'
# Number of unlabeled candidates sampled per active-learning round
CAND_SIZE = 500
# Seed for the Python-side random number generator (the C-side seed is
# supplied on the command line; see demo)
SEED = 531
# sLDA hyperparameters
NUM_TOPICS = 20
ALPHA = 0.1
BETA = 0.01
VAR = 0.1
# Gibbs sampling schedule for training and prediction
NUM_TRAIN = 5
NUM_SAMPLES_TRAIN = 5
TRAIN_BURN = 50
TRAIN_LAG = 50
NUM_SAMPLES_PREDICT = 5
PREDICT_BURN = 10
PREDICT_LAG = 5
# Active-learning budget: start with START_LABELED labeled documents and
# add LABEL_INCREMENT per round until END_LABELED is reached
START_LABELED = 50
END_LABELED = 100
LABEL_INCREMENT = 10
# Number of held-out documents used for evaluation
TEST_SIZE = 200
# Active-learning selection strategy; alternatives kept for experimentation
# SELECT_METHOD = select.factory['jsd_toptopic_centroid']
SELECT_METHOD = select.factory['random']
# SELECT_METHOD = select.factory['jsd_toptopic_balanced']
def demo(C_SEED):
    """Run a demo of the active learning simulation with sLDA via sampling.

    Args:
        C_SEED: integer seed for the random number generator of the C
            sampling code; the Python-side rng is seeded from SEED.

    Prints a (labeled-count, pR2 metric, elapsed-time) line after the
    initial training pass and after each active-learning round.
    """
    start = time.time()
    rng = random.Random(SEED)
    slda.set_seed(C_SEED)
    dataset = ankura.run_pipeline(PIPELINE)
    # Map document title -> label from the labels file (title and value
    # are the first two whitespace-separated fields on each line)
    pre_labels = {}
    with open(SOTU_LABELS) as ifh:
        for line in ifh:
            data = line.strip().split()
            pre_labels[data[0]] = float(data[1])
    # Align labels with the dataset's document ordering
    labels = [pre_labels[dataset.titles[doc_id]]
              for doc_id in range(dataset.num_docs)]
    print('Import took:', datetime.timedelta(seconds=time.time() - start))
    print()

    start = time.time()
    # Partition documents into test set, initially-labeled set, and
    # unlabeled pool via a single shuffle
    shuffled_doc_ids = list(range(dataset.num_docs))
    rng.shuffle(shuffled_doc_ids)
    test_doc_ids = shuffled_doc_ids[:TEST_SIZE]
    test_labels = [labels[t] for t in test_doc_ids]
    test_words = [dataset.doc_tokens(t) for t in test_doc_ids]
    test_labels_mean = numpy.mean(test_labels)
    labeled_doc_ids = shuffled_doc_ids[TEST_SIZE:TEST_SIZE + START_LABELED]
    known_labels = [labels[t] for t in labeled_doc_ids]
    unlabeled_doc_ids = set(shuffled_doc_ids[TEST_SIZE + START_LABELED:])
    model = slda.SamplingSLDA(rng, NUM_TOPICS, ALPHA, BETA, VAR,
                              NUM_TRAIN, NUM_SAMPLES_TRAIN,
                              TRAIN_BURN, TRAIN_LAG,
                              NUM_SAMPLES_PREDICT, PREDICT_BURN, PREDICT_LAG)
    # Initial training on the seed labeled set, then evaluate
    model.train(dataset, labeled_doc_ids, known_labels)
    metric = evaluate.pR2(model, test_words, test_labels, test_labels_mean)
    print(len(labeled_doc_ids), metric,
          datetime.timedelta(seconds=time.time() - start))
    # Active-learning loop: sample candidates, select documents to label,
    # reveal their labels, retrain, and re-evaluate
    while len(labeled_doc_ids) < END_LABELED and unlabeled_doc_ids:
        candidates = select.reservoir(list(unlabeled_doc_ids), rng, CAND_SIZE)
        chosen = SELECT_METHOD(dataset, labeled_doc_ids, candidates, model,
                               rng, LABEL_INCREMENT)
        for doc in chosen:
            known_labels.append(labels[doc])
            labeled_doc_ids.append(doc)
            unlabeled_doc_ids.remove(doc)
        # NOTE(review): the trailing True presumably requests incremental
        # (continued) training — confirm against SamplingSLDA.train
        model.train(dataset, labeled_doc_ids, known_labels, True)
        metric = evaluate.pR2(model, test_words, test_labels,
                              test_labels_mean)
        print(len(labeled_doc_ids), metric,
              datetime.timedelta(seconds=time.time() - start))
    model.cleanup()
    print()
    print('Total simulation time:',
          datetime.timedelta(seconds=time.time() - start))
    print()
if __name__ == '__main__':
    # Command-line entry point: the single positional argument seeds the
    # C-side sampler (the Python-side rng is seeded from SEED).
    parser = argparse.ArgumentParser()
    # The original help text used a backslash continuation inside the
    # string literal, which embedded a newline and indentation into the
    # --help output; implicit concatenation keeps the text clean.
    parser.add_argument('cseed', type=int,
                        help='an integer for seeding the random number '
                             'generator of the C code')
    args = parser.parse_args()
    demo(args.cseed)