/
Classificator.py
72 lines (62 loc) · 2.79 KB
/
Classificator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from sklearn.svm import LinearSVC
import numpy as np
import scipy
from Blob import Blob
import logging
import time
import IAlgorithm
__author__ = 'simon'
class Classificator(IAlgorithm.IAlgorithm):
def __init__(self, classificator, use_sparse = None):
''' Trains a classificator in training phase and predicts labels in test phase
Params:
@classificator The sklearn classificator having a fit and predict function.
@use_sparse Whether to store the individual features as sparse matrix.
If None, sparsity will be used if less space is consumed in sparse format.
'''
self.model = classificator
logging.debug('Using sparse: %s'%use_sparse)
self.use_sparse = use_sparse
def _compute(self, blob_generator):
for blob in blob_generator:
blob.data = self.model.decision_function(blob.data.reshape(1,-1))
yield blob
def _train(self, blob_generator):
# First, collect all elements of the input
data = []
labels = []
metas = []
for blob in blob_generator:
if self.use_sparse is None:
# Determine automatically by comparing size
sparse_vec = scipy.sparse.csr_matrix(blob.data.ravel())
sparse_memory_req = sparse_vec.data.nbytes + sparse_vec.indptr.nbytes + sparse_vec.indices.nbytes
self.use_sparse = sparse_memory_req < blob.data.nbytes
logging.debug('Using sparse format for collecting features: %s'%self.use_sparse)
logging.debug('Blob data needs %i'%blob.data.nbytes)
logging.debug('%i with sparse vs %i with dense'%(sparse_memory_req,blob.data.nbytes))
if self.use_sparse:
data.append(scipy.sparse.csr_matrix(blob.data.ravel()))
else:
data.append(blob.data.ravel())
labels.append(blob.meta.label)
metas.append(blob.meta)
# Stack data to matrix explicitly here, as both fit and predict
# would to this stacking otherwise
try:
if self.use_sparse:
data = scipy.sparse.vstack(data)
data = data.astype(np.float64)
else:
data = np.array(data, dtype=np.float64)
except ValueError:
logging.error("Length of all feature vectors need to be the same for Classificator training.")
raise Exception
logging.warning('Training the model with feature dim %i, this might take a while'%data.shape[1])
self.model.fit(data, labels)
logging.warning('Finished')
for (d,m) in zip(self.model.decision_function(data),metas):
b = Blob()
b.data = d
b.meta = m
yield b