import gc
import numpy as np
from abc import abstractmethod

# The framework helpers used below (LoggerStreamable, LoggerRawDictStreamer,
# RawDictCnv, checkForUnusedVars, npCurrent) are assumed to come from the
# project's RingerCore package; PreProcChain, PrepObj and
# tensor_frobenius_argmin are assumed to be defined earlier in this module.


# Converter __init__ (method of a RawDictCnv subclass): registers the
# attributes below as protected when reading raw dicts
def __init__(self, **kw):
  RawDictCnv.__init__( self,
                       toProtectedAttrs = { '_nSorts', '_nBoxes', '_nTrain',
                                            '_nValid', '_nTest', '_method',
                                            '_sort_boxes_list'
                                          } | kw.pop('toProtectedAttrs', set()),
                       **kw )
class Subset( LoggerStreamable ):
  # There is only need to change the version if a property is added
  _streamerObj = LoggerRawDictStreamer( toPublicAttrs = {'_ppChain'} )
  _cnvObj      = RawDictCnv( toProtectedAttrs = {'_ppChain'} )

  def __init__(self, d=None, **kw):
    d = dict(d) if d is not None else {}  # avoid the mutable-default-argument pitfall
    d.update( kw )
    self._ppChain    = d.pop('ppChain' , PreProcChain( PrepObj() ))
    self._range      = d.pop('binRange', None)
    self._patternIdx = d.pop('pattern' , 0)
    LoggerStreamable.__init__(self, d)

  def __call__(self, data):
    return self._apply(data)

  @abstractmethod
  def _apply(self, data):
    """
      Overload this method to apply the pre-processing.
    """
    return self._ppChain.takeParams(data)

  def isRevertible(self):
    # Subset selection discards events, so it is not possible to revert it
    return False

  def getBin(self):
    return self._range

  def setPatternIndex(self, idx):
    self._patternIdx = idx

  def checkPatternIndex(self, idx):
    return idx == self._patternIdx

  def getPatternIndex(self):
    return self._patternIdx
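# Minimal sketch of a concrete subset (illustrative only; `TransformSubset` is
# a hypothetical example, not part of this module). Subclasses only need to
# override _apply(); this one just fits the chain and feeds the events through.
class TransformSubset( Subset ):
  def _apply(self, data):
    self._ppChain.takeParams(data)  # fit the pre-processing chain on the data
    return self._ppChain(data)      # return the transformed events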
class SomCluster( Subset ):
  # There is only need to change the version if a property is added
  _streamerObj = LoggerRawDictStreamer( toPublicAttrs = {'_code_book', '_w'} )
  _cnvObj      = RawDictCnv( toProtectedAttrs = {'_code_book', '_w'} )

  def __init__(self, d=None, **kw):
    """
      Cluster finder class based on these parameters:
        code_book: centroids of the clusters, given by any algorithm (e.g. kmeans);
        w        : weight factors that multiply the size of each cluster, e.g. if a
                   cluster holds 100 events and its w factor is 2, the events in
                   that cluster are duplicated to 200;
        matrix   : projection applied to the centroids;
        p_cluster: cluster target for each neuron of the map.
    """
    d = dict(d) if d is not None else {}
    d.update( kw ); del kw
    Subset.__init__(self, d)
    self._code_book = d.pop('code_book', [])
    self._p_cluster = d.pop('p_cluster', [])
    self._w         = d.pop('w'        , 1 )
    checkForUnusedVars(d, self._warning )
    del d
    # Some protections before we start
    if isinstance(self._code_book, list):
      self._code_book = npCurrent.array(self._code_book)
    # If the weight factor is an integer, expand it to an array of factors with
    # the same size as the centroids
    if isinstance(self._w, int):
      self._w = npCurrent.int_array([self._w] * self._code_book.shape[0])
    # Transform to np.array if needed
    if isinstance(self._w, list):
      self._w = npCurrent.int_array(self._w)
    # When a list of weights is passed, weights and centroids must have the
    # same length
    if self._w.shape[0] != self._code_book.shape[0]:
      raise ValueError("Weight factor must be an int, a list or an np.array "
                       "with the same size as the code book param")
  #__init__ end

  def __call__(self, data):
    return self._apply(data)

  def _apply(self, data):
    """
      This function is slower than the C version but works for all input types.
      If the inputs have the wrong types for the C versions of the function,
      this one is called as a last resort. It is about 20 times slower than
      the C version.
    """
    # Take params and apply the pre-processing; `data` keeps the unprocessed events
    self._ppChain.takeParams(data)
    tdata = self._ppChain(data)
    # n = number of observations
    # d = number of features
    if np.ndim(tdata) == 1:
      if not np.ndim(tdata) == np.ndim(self._code_book):
        raise ValueError("Observation and code_book should have the same rank")
    else:
      (n, d) = tdata.shape
      # Code book and observations should have the same number of features and shape
      if not np.ndim(tdata) == np.ndim(self._code_book):
        raise ValueError("Observation and code_book should have the same rank")
      elif not d == self._code_book.shape[1]:
        raise ValueError("Code book(%d) and obs(%d) should have the same "
                         "number of features (e.g. columns)" %
                         (self._code_book.shape[1], d))
    bmus = tensor_frobenius_argmin(tdata, self._code_book, 10000, self._logger)
    # Fix the matlab (1-based) index in a local copy, so that repeated calls
    # to _apply do not keep shifting self._p_cluster
    p_cluster = np.asarray(self._p_cluster) - 1
    # Map each best-matching unit to its cluster target
    code = p_cluster[bmus]
    # Release memory
    del tdata
    gc.collect()
    # Join all clusters into a list of clusters
    cpattern = []
    for target in range(p_cluster.max() + 1):
      cpattern.append(data[np.where(code == target)[0], :])
    # Resize each cluster by its weight factor
    for i, c in enumerate(cpattern):
      cpattern[i] = np.repeat(cpattern[i], self._w[i], axis=0)
      self._info('Cluster %d and factor %d with %d events and %d features',
                 i, self._w[i], cpattern[i].shape[0], cpattern[i].shape[1])
    return cpattern
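# Usage sketch (illustrative; the kmeans code book, the 1-based p_cluster
# mapping and the weights below are assumptions, not values from this module):
#
#   from scipy.cluster.vq import kmeans
#   code_book, _ = kmeans(tdata, 4)                 # 4 centroids, shape 4 X d
#   som = SomCluster( code_book = code_book,
#                     p_cluster = [1, 1, 2, 2],     # matlab-style 1-based targets
#                     w         = [2, 1, 1, 1] )    # duplicate cluster 0 events
#   cpattern = som(data)   # one (weighted) event array per cluster target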
class GMMCluster( Cluster ):  # `Cluster` is the base cluster class, defined elsewhere in this module
  # There is only need to change the version if a property is added
  _streamerObj = LoggerRawDictStreamer( toPublicAttrs = {'_sigma'} )
  _cnvObj      = RawDictCnv( toProtectedAttrs = {'_sigma'} )

  def __init__(self, d=None, **kw):
    """
      Cluster finder class based on these parameters:
        code_book: centroids of the clusters, given by any algorithm (e.g. kmeans);
        w        : weight factors that multiply the size of each cluster, e.g. if a
                   cluster holds 100 events and its w factor is 2, the events in
                   that cluster are duplicated to 200;
        matrix   : projection applied to the centroids;
        sigma    : variance params of the gaussians; the likelihood of each
                   centroid is computed as:
                     lh[i] = np.exp(-np.power((data - centroid[i]) / sigma[i], 2))
    """
    d = dict(d) if d is not None else {}
    d.update( kw ); del kw
    self._sigma = d.pop('sigma', npCurrent.array([]) )
    Cluster.__init__(self, d)
    del d
    # Checking the sigma type
    if isinstance(self._sigma, list):
      self._sigma = npCurrent.array(self._sigma)
    if not self._sigma.shape == self._code_book.shape:
      raise ValueError("Code book and sigma matrix should have the same shape")
  #__init__ end

  def _apply(self, data):
    """
      This function is slower than the C version but works for all input types.
      If the inputs have the wrong types for the C versions of the function,
      this one is called as a last resort. It is about 20 times slower than
      the C version.
    """
    # Take params and apply the pre-processing; `data` keeps the unprocessed events
    self._ppChain.takeParams(data)
    tdata = self._ppChain(data)
    # n = number of observations
    # d = number of features
    if np.ndim(tdata) == 1:
      if not np.ndim(tdata) == np.ndim(self._code_book):
        raise ValueError("Observation and code_book should have the same rank")
    else:
      (n, d) = tdata.shape
      # Code book and observations should have the same number of features and shape
      if not np.ndim(tdata) == np.ndim(self._code_book):
        raise ValueError("Observation and code_book should have the same rank")
      elif not d == self._code_book.shape[1]:
        raise ValueError("Code book(%d) and obs(%d) should have the same "
                         "number of features (e.g. columns)" %
                         (self._code_book.shape[1], d))
    # Likelihood finder:
    #   tdata     is n X d
    #   code_book is m X d, where m is the number of clusters
    #   sigma     is m X d
    # Broadcasting reference:
    #   http://scipy.github.io/old-wiki/pages/EricsBroadcastingDoc
    # Note the minus sign: the nearest centroid must yield the largest
    # gaussian likelihood
    lh = np.exp(-np.power((tdata[:, np.newaxis] - self._code_book) /
                          self._sigma[np.newaxis, :], 2))
    code = np.argmax(np.sum(lh, axis=-1), axis=1)
    # Release memory
    del tdata
    gc.collect()
    # Join all clusters into a list of clusters
    cpattern = []
    for target in range(self._code_book.shape[0]):
      cpattern.append(data[np.where(code == target)[0], :])
    # Resize each cluster by its weight factor
    for i, c in enumerate(cpattern):
      cpattern[i] = np.repeat(cpattern[i], self._w[i], axis=0)
      self._info('Cluster %d and factor %d with %d events and %d features',
                 i, self._w[i], cpattern[i].shape[0], cpattern[i].shape[1])
    return cpattern
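# Usage sketch (illustrative; code_book and sigma below are assumed to come
# from a previously fitted gaussian-mixture model, m centroids X d features):
#
#   gmm = GMMCluster( code_book = code_book,   # m X d centroids
#                     sigma     = sigma,       # m X d variances
#                     w         = 1 )          # keep every cluster size as-is
#   cpattern = gmm(data)   # one (weighted) event array per centroid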