'''
###############################################
learner.py
Byron C Wallace
Tufts Medical Center
Amendments made by:
Paul E. Soto
Universitat Pompeu Fabra
This module represents a learner. Includes active learning.
###############################################
'''
import random

import numpy as np
import pandas as pd

import svm
from svm import *
import svmc

import machine_learning
def evaluate_learner(learner, include_labeled_data_in_metrics=True):
'''
Returns a dictionary containing various metrics for learner performance, as measured over the
    examples in the test_datasets belonging to the learner.
'''
results = {}
# first we count the number of true positives and true negatives discovered in learning. this is so we do not
# unfairly penalize active learning strategies for finding lots of the minority class during training.
if include_labeled_data_in_metrics:
tps = learner.labeled_datasets.number_of_minority_examples()
tns = learner.labeled_datasets.number_of_majority_examples()
else:
tps = 0
tns = 0
results["npos"] = tps
print "positives found during learning: %s\nnegatives found during learning: %s" % (tps, tns)
print "evaluating learner over %s instances." % learner.test_datasets.data.shape[0]
fps, fns = 0, 0
# get the raw points out for prediction
point_sets = [learner.test_datasets.get_samples().values]
# the labels are assumed to be the same; thus we only use the labels for the first dataset
true_labels = learner.test_datasets.get_labels().values
    if len(true_labels) == 0:
        return {}
    # loop over all of the examples, and feed the corresponding point in each
    # feature-space to the "cautious_predict" method
predictions = []
if learner.nbc:
ml_class = machine_learning.NaiveBayes(learner.labeled_datasets.data,1,learner.labeled_datasets.classLabel)
ml_class.testing = learner.test_datasets.data.drop(learner.test_datasets.origText,1)
ml_class.predictProbabilities('Gaussian')
ml_class.getPredictions()
predictions = ml_class.bestLabel
probs = ml_class.testingProbs
scores = 'The training is done using Naive Bayes'
elif learner.models[0].probability:
probs = []
for example_index in range(len(point_sets[0])):
prediction = learner.models[0].predict_probability(point_sets[0][example_index])
probs.append(prediction)
predictions.append(prediction[0])
scores = 'This is a probability model'
else:
scores = []
for example_index in range(len(point_sets[0])):
prediction = learner.cautious_predict([point_sets[feature_space_index][example_index] for feature_space_index in range(len(point_sets))])
predictions.append(prediction)
score = learner.models[0].predict_values(point_sets[0][example_index])
scores.append(score)
probs = "This is not a probability model"
conf_mat = svm.evaluate_predictions(predictions, true_labels)
#
# evaluate_predictions does not include the instances found during training!
#
conf_mat["tp"]+= tps
conf_mat["tn"]+= tns
print "confusion matrix:"
print conf_mat
results['probabilities'] = probs
results['scores'] = scores
results["confusion_matrix"] = conf_mat
results["accuracy"] = float (conf_mat["tp"] + conf_mat["tn"]) / float(sum([conf_mat[key] for key in conf_mat.keys()]))
if float(conf_mat["tp"]) == 0:
results["sensitivity"] = 0
else:
results["sensitivity"] = float(conf_mat["tp"]) / float(conf_mat["tp"] + conf_mat["fn"])
return results
def get_model_params(model):
"""
    Extract the dual coefficients, the index permutation, and the bias term rho
    from the SVM model (see the decision-value sketch below).
    returns (coefs, perm, rho)
"""
rho = svmc.svm_get_model_rho(model.model)
n = svmc.svm_get_model_num_coefs(model.model)
coefs_dblarr = svmc.new_double(n)
perm_intarr = svmc.new_int(n)
try:
svmc.svm_get_model_coefs(model.model,coefs_dblarr)
svmc.svm_get_model_perm(model.model,perm_intarr)
coefs = np.zeros(n,dtype=float)
perm = np.zeros(n,dtype=int)
for i in range(n):
coefs[i] = svmc.double_getitem(coefs_dblarr,i)
perm[i] = svmc.int_getitem(perm_intarr,i)
finally:
svmc.delete_double(coefs_dblarr)
svmc.delete_int(perm_intarr)
return (coefs, perm, rho)
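
# A minimal sketch (not part of the original module) of how the triple returned
# by get_model_params defines the SVM decision value:
#     f(x) = sum_i coefs[i] * K(sv_i, x) - rho
# where the sv_i are the model's support vectors. The kernel function and the
# support-vector list below are hypothetical placeholders, not calls into the
# svmc bindings.
def _decision_value_sketch(coefs, rho, support_vectors, x, kernel):
    total = 0.0
    for coef, sv in zip(coefs, support_vectors):
        total += coef * kernel(sv, x)
    return total - rho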
class learner:
    def __init__(self, unlabeled_datasets=None, test_datasets=None, models=None, probability=0, NBC=False):
        # unlabeled_datasets and test_datasets are expected to be
        # machine_learning.ActiveLearningDataset instances
        # just using default svm parameters for now
        self.params = svm_parameter(weight=[1, 1000], probability=probability)
self.unlabeled_datasets = unlabeled_datasets
self.test_datasets = test_datasets
# initialize empty labeled datasets (i.e., all data is unlabeled to begin with)
self.labeled_datasets = machine_learning.ActiveLearningDataset(pd.DataFrame(columns=unlabeled_datasets.data.columns))
self.models = models
self.test_results = []
self.nbc = NBC
def active_learn(self, num_examples_to_label, query_function = None, num_to_label_at_each_iteration=10, rebuild_models_at_each_iter=True):
        '''
        Active learning loop. Uses the provided query function (query_function) to select a number of examples
        (num_to_label_at_each_iteration) to label at each step, until the total number of examples requested
        (num_examples_to_label) has been labeled. The models will be updated at each iteration.
        (A minimal alternative query function, random_query, is sketched after this method.)
        '''
if not query_function:
query_function = self.SIMPLE
labeled_so_far = 0
while labeled_so_far < num_examples_to_label:
print "labeled %s out of %s" % (labeled_so_far, num_examples_to_label)
example_ids_to_label = query_function(num_to_label_at_each_iteration)
# now remove the selected examples from the unlabeled sets and put them in the labeled sets.
self.label_instances_in_all_datasets(example_ids_to_label)
if rebuild_models_at_each_iter:
self.rebuild_models()
print "models rebuilt with %s labeled examples" % self.labeled_datasets.data.shape[0]
labeled_so_far += num_to_label_at_each_iteration
if not rebuild_models_at_each_iter:
self.rebuild_models()
print "active learning loop completed; models rebuilt."
def label_instances_in_all_datasets(self, inst_ids):
'''
Removes the instances in inst_ids (a list of instance numbers to 'label') from the unlabeled dataset(s) and places
them in the labeled dataset(s). These will subsequently be used in training models, thus this simulates 'labeling'
the instances.
'''
to_label = self.unlabeled_datasets.remove_instances(inst_ids)
for instance in to_label.index:
print '-------------------------'
valid = False
while not valid:
try:
print to_label.loc[instance].origText
var = raw_input("Please enter label for the above point: \n"+
'Please choose from '+str(self.models[0].labels) + "\nLabel: ")
                    label = eval(var)  # coerce the typed response to the label's type (e.g., int)
                    if label in self.models[0].labels:
                        to_label.loc[instance, self.unlabeled_datasets.classLabel] = label
                        valid = True
else:
print 'Please choose from '+str(self.models[0].labels)
except Exception as e:
print e
valid = False
print '-------------------------'
self.labeled_datasets.add_data(to_label)
    def cautious_predict(self, X):
        '''
        Predicts with each model over its corresponding feature space and
        returns the maximum (most positive) prediction.
        '''
        if self.models and len(self.models):
            return max([m.predict(x) for m, x in zip(self.models, X)])
        else:
            raise Exception("No models have been initialized.")
def pick_initial_training_set(self, k, build_models=True):
'''
Select a set of training examples from the dataset(s) at random. This set will be used
        to build the initial model. The same training examples will be selected from each dataset.
'''
self.label_at_random(k)
if build_models:
print "building models..."
self.rebuild_models()
print "done."
def undersample_labeled_datasets(self, k=None):
'''
Undersamples the current labeled datasets
'''
if self.labeled_datasets.data.shape[0]>0:
if not k:
print "undersampling majority class to equal that of the minority examples"
k = self.labeled_datasets.number_of_majority_examples() - self.labeled_datasets.number_of_minority_examples()
# we copy the datasets rather than mutate the class members.
copied_dataset = machine_learning.ActiveLearningDataset(self.labeled_datasets.copy())
print "removing %s majority instances" % k
            copied_dataset.undersample(k)
        else:
            raise Exception("No labeled data has been provided!")
        return copied_dataset
def label_maximally_diverse_set(self, k, label_one_initially=True):
'''
Returns the instance numbers for the k most diverse examples (selected greedily)
'''
# first, label one example at random
if label_one_initially:
self.label_at_random(1)
self.rebuild_models()
# just use the first dataset for now....
# TODO implement coin flip, etc
model = self.models[0]
# diversity function
div_function = lambda x: sum([model.compute_cos_between_examples(x, y) for y in self.labeled_datasets.get_samples().values])
        for step in range(k - 1):
            if not step % 100:
                print "on step %s of %s" % (step, k - 1)
            # add examples iteratively, selecting at each step the most diverse
            # w.r.t. the examples already selected
# first compute diversity scores for all unlabeled instances
x = self.unlabeled_datasets.data.index[0]
most_diverse_id = x
most_diverse_score = div_function(self.unlabeled_datasets.get_samples().loc[x].values)
for x in self.unlabeled_datasets.data.index[1:]:
# now iterate over the remaining unlabeled examples
cur_div_score = div_function(self.unlabeled_datasets.get_samples().loc[x].values)
if cur_div_score > most_diverse_score:
most_diverse_score = cur_div_score
most_diverse_id = x
# now label the most diverse example
self.label_instances_in_all_datasets([most_diverse_id])
print "building models..."
self.rebuild_models()
print "done."
def label_at_random(self, k):
'''
Select and 'label' a set of k examples from the (unlabeled) dataset(s) at random.
'''
if len(self.unlabeled_datasets.data)>0:
# remove a random subset of instances from one of our datasets (it doesn't matter which one)
removed_instances = self.unlabeled_datasets.get_and_remove_random_subset(k)
# add this set to the labeled data
self.labeled_datasets.add_data(removed_instances)
else:
raise Exception, "No datasets have been provided!"
def get_random_unlabeled_ids(self, k):
'''
Returns a random set of k instance ids
'''
        ids = self.unlabeled_datasets.get_instance_ids()
        # sample k distinct ids without replacement
        return random.sample(ids, k)
def SIMPLE(self, k):
'''
        Returns the instance numbers for the k unlabeled instances closest to the hyperplane.
        (An equivalent sort-based sketch, _simple_by_sorting, follows this method.)
'''
# just use the first dataset for now....
# TODO implement coin flip, etc
model = self.models[0]
# initially assume k first examples are closest
k_ids_to_distances = {}
for x in self.unlabeled_datasets.data.index[:k]:
k_ids_to_distances[x] = model.distance_to_hyperplane(self.unlabeled_datasets.get_samples().loc[x].values)
# now iterate over the rest
for x in self.unlabeled_datasets.data.index[k:]:
cur_max_id, cur_max_dist = self._get_max_val_key_tuple(k_ids_to_distances)
x_dist = model.distance_to_hyperplane(self.unlabeled_datasets.get_samples().loc[x].values)
if x_dist < cur_max_dist:
# then x is closer to the hyperplane than the farthest currently observed
# remove current max entry from the dictionary
k_ids_to_distances.pop(cur_max_id)
k_ids_to_distances[x] = x_dist
return k_ids_to_distances.keys()
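
    # A minimal alternative sketch (added for illustration, not part of the
    # original module) of the same SIMPLE selection: compute every unlabeled
    # point's distance to the hyperplane, sort, and keep the k smallest.
    # O(n log n) instead of the running-max dictionary's O(n*k), at the cost
    # of materializing all distances.
    def _simple_by_sorting(self, k):
        model = self.models[0]
        samples = self.unlabeled_datasets.get_samples()
        dists = [(model.distance_to_hyperplane(samples.loc[x].values), x)
                 for x in self.unlabeled_datasets.data.index]
        dists.sort()
        return [x for (_, x) in dists[:k]]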
    def _get_max_val_key_tuple(self, d):
        # returns the (key, value) pair with the largest value
        max_key = max(d, key=d.get)
        return (max_key, d[max_key])
def rebuild_models(self, undersample_first=False):
'''
Rebuilds all models over the current labeled datasets.
'''
if undersample_first:
print "undersampling before building models.."
dataset = self.undersample_labeled_datasets()
print "done."
else:
dataset = self.labeled_datasets
print "training model(s) on %s instances" % dataset.data.shape[0]
self.models = []
problem = svm_problem(dataset.get_labels().values, dataset.get_samples().values)
# find C, gamma parameters for each model
print "finding optimal C, gamma parameters..."
self.params.C, self.params.gamma = grid_search(problem, self.params)
print "C:%s; gamma:%s" % (self.params.C, self.params.gamma)
self.models.append(svm_model(problem, self.params))
self.test_results = evaluate_learner(self,include_labeled_data_in_metrics=False)
print "done."