def __init__(self, params):
    """
    Construct generic factorization model.

    Every entry of ``params`` becomes an instance attribute. When ``V`` or ``H``
    is passed as a tuple it is split into the pair (``V``, ``V1``) or
    (``H``, ``H1``) respectively. All target and factor matrices that are
    present are then converted to double precision.

    :param params: MF runtime and algorithm parameters and options. For detailed
       explanation of the general model parameters see :mod:`mf_run`. For
       algorithm specific model options see documentation of chosen
       factorization method.
    :type params: `dict`
    :raises utils.MFError: If ``V`` or ``H`` is a tuple that does not hold exactly
       two matrices, or if ``W`` is passed as a tuple.
    """
    def as_double(matrix):
        # Sparse input becomes CSR with double entries; anything else is viewed
        # as a double precision ``np.matrix``.
        if sp.isspmatrix(matrix):
            return matrix.tocsr().astype('d')
        return np.asmatrix(matrix, dtype='d')

    self.__dict__.update(params)
    # check if tuples of target and factor matrices are passed
    if isinstance(self.V, tuple):
        # Anything but a pair is rejected up front; a shorter tuple would
        # otherwise fail later with an unrelated IndexError.
        if len(self.V) != 2:
            raise utils.MFError("Multiple NMF uses two target matrices.")
        self.V, self.V1 = self.V
    if isinstance(self.H, tuple):
        if len(self.H) != 2:
            raise utils.MFError("Multiple NMF uses two mixture matrices.")
        self.H, self.H1 = self.H
    if isinstance(self.W, tuple):
        raise utils.MFError("Multiple NMF uses one basis matrix.")
    self.V = as_double(self.V)
    # The remaining matrices are optional. Identity checks against None are
    # required here: ``!= None`` on a matrix is evaluated elementwise.
    for name in ("V1", "W", "H", "H1"):
        value = getattr(self, name, None)
        if value is not None:
            setattr(self, name, as_double(value))
def __is_smdefined(self):
    """
    Check if MF and seeding methods are well defined.

    A seeding method given by name is looked up in ``seeding.methods`` and
    replaced by a new instance of the registered entry. For any other value the
    lower-cased string form must be a key of ``seeding.methods``.

    :raises utils.MFError: If the seeding method is not recognized.
    """
    if isinstance(self.seed, str):
        if self.seed not in seeding.methods:
            raise utils.MFError("Unrecognized seeding method.")
        self.seed = seeding.methods[self.seed]()
    elif str(self.seed).lower() not in seeding.methods:
        raise utils.MFError("Unrecognized seeding method.")
def purity(self, membership=None, idx=None):
    """
    Compute the purity given a priori known groups of samples [Park2007]_.

    The purity is a measure of performance of a clustering method in recovering
    classes defined by a list a priori known (true class labels).

    Return the real number in [0,1]. The larger the purity, the better the
    clustering performance.

    :param membership: Specify known class membership for each sample. Any
       sequence of hashable labels is accepted, including a NumPy array.
    :type membership: `list`
    :param idx: Used in the multiple NMF model. In factorizations following
       standard NMF model or nonsmooth NMF model ``idx`` is always None.
    :type idx: None or `str` with values 'coef' or 'coef1' (`int` value of 0 or 1,
       respectively)
    :raises utils.MFError: If ``membership`` is None or empty.
    """
    V = self.target(idx)
    # Explicit None/length test: plain truthiness is ambiguous for NumPy arrays.
    if membership is None or len(membership) == 0:
        raise utils.MFError(
            "Known class membership for each sample is not specified.")
    n = V.shape[1]
    # ravel() keeps a one-dimensional result even for a single sample, where
    # squeeze() would collapse it to a 0-d array without a length.
    predicted = np.asarray(
        self.predict(what="samples", prob=False, idx=idx)).ravel()
    clusters, classes = {}, {}
    for sample, label in enumerate(predicted.tolist()):
        clusters.setdefault(label, set()).add(sample)
    for sample, label in enumerate(membership):
        classes.setdefault(label, set()).add(sample)
    # For every predicted cluster count the samples of its best matching class.
    matched = sum(
        max(len(members & known) for known in classes.values())
        for members in clusters.values())
    return 1. / n * matched
def _check_compatibility(self):
    """
    Check if chosen seeding method is compatible with chosen factorization
    method or fixed initialization is passed.

    When neither a seeding method nor any factor matrix is given, the seed is
    left as None if "none" is listed in ``self.aseeds`` and set to "random"
    otherwise. When both the basis and the first mixture matrix are given, the
    seed is replaced by a fixed seeding object holding the passed matrices.

    :raises utils.MFError: If factor matrices are passed together with a seeding
       method other than "fixed".
    """
    self.check_V()
    W = self.basis()
    H = self.coef(0)
    # Only the multiple NMF model ('mm') has a second mixture matrix.
    H1 = self.coef(1) if self.model_name == 'mm' else None
    if self.seed is None and W is None and H is None and H1 is None:
        self.seed = None if "none" in self.aseeds else "random"
    if W is not None and H is not None:
        # Compare by value: identity comparison against a string literal
        # depends on interning and is not a reliable equality test.
        if self.seed is not None and self.seed != "fixed":
            raise utils.MFError("Initial factorization is fixed.")
        self.seed = seeding.fixed.Fixed()
        self.seed._set_fixed(W=W, H=H, H1=H1)
    self.__is_smdefined()
    self.__compatibility()
def predict(self, what='samples', prob=False, idx=None):
    """
    Compute the dominant basis components. The dominant basis component is
    computed as the row index for which the entry is the maximum within the
    column.

    If :param:`prob` is not specified, list is returned which contains computed
    index for each sample (feature). Otherwise tuple is returned where first
    element is a list as specified before and second element is a list of
    associated probabilities, relative contribution of the maximum entry within
    each column.

    :param what: Specify target for dominant basis components computation. Two
       values are possible, 'samples' or 'features'. When what='samples' is
       specified, dominant basis component for each sample is determined based
       on its associated entries in the mixture coefficient matrix (H). When
       what='features' computation is performed on the transposed basis
       matrix (W.T).
    :type what: `str`
    :param prob: Specify dominant basis components probability inclusion.
    :type prob: `bool` equivalent
    :param idx: Used in the multiple NMF model. In factorizations following
       standard NMF model or nonsmooth NMF model :param:`idx` is always None.
    :type idx: None or `str` with values 'coef' or 'coef1' (`int` value of 0 or 1,
       respectively)
    :raises utils.MFError: If ``what`` is neither 'samples' nor 'features', or the
       selected matrix is None.
    """
    if what == "samples":
        X = self.coef(idx)
    elif what == "features":
        X = self.basis().T
    else:
        X = None
    # Identity test is required: ``== None`` on a matrix compares elementwise.
    if X is None:
        raise utils.MFError(
            "Dominant basis components can be computed for samples or features."
        )
    eX, idxX = argmax(X, axis=0)
    if not prob:
        return idxX
    sums = X.sum(axis=0)
    # Relative contribution of each column maximum to its column sum.
    probs = [e / sums[0, s] for e, s in zip(eX, range(X.shape[1]))]
    return idxX, probs
def __init__(self, params):
    """
    Construct the SMF model.

    Every entry of ``params`` becomes an instance attribute; ``V1`` and ``H1``
    are then reset to None. The target matrix is converted to double precision
    and the model compatibility check is run.

    :param params: MF runtime and algorithm parameters and options.
    :type params: `dict`
    :raises utils.MFError: If a basis or mixture matrix is passed.
    """
    self.model_name = "smf"
    self.__dict__.update(params)
    self.V1 = None
    self.H1 = None
    # do not copy target and factor matrices into the program
    target = self.V
    if sp.isspmatrix(target):
        self.V = target.tocsr().astype('d')
    elif target.dtype == np.dtype(float):
        self.V = np.asmatrix(target)
    else:
        self.V = np.asmatrix(target, dtype='d')
    if not (self.W is None and self.H is None and self.H1 is None):
        raise utils.MFError(
            "Fixed initialized is not supported by SMF model.")
    self._compatibility()
def distance(self, metric='euclidean', idx=None):
    """
    Return the loss function value.

    For 'euclidean' this is the sum of squared entries of ``V - W H``. For 'kl'
    the value is assembled from ``V``, the estimate ``W H`` and the logarithm of
    their ratio (presumably elementwise -- confirm against the ``sop`` and
    ``elop`` helpers).

    :param metric: Specify distance metric to be used. Possible are Euclidean
       and Kullback-Leibler (KL) divergence. Strictly, KL is not a metric. The
       name is matched case-insensitively.
    :type metric: `str` with values 'euclidean' or 'kl'
    :param idx: Used in the multiple MF model. In standard MF ``idx`` is always
       None.
    :type idx: None
    :raises utils.MFError: If ``metric`` names an unknown distance.
    """
    name = metric.lower()
    if name == 'euclidean':
        residual = self.V - dot(self.W, self.H)
        return power(residual, 2).sum()
    if name == 'kl':
        estimate = dot(self.W, self.H)
        log_ratio = sop(elop(self.V, estimate, div), op=log)
        return (multiply(self.V, log_ratio) - self.V + estimate).sum()
    raise utils.MFError("Unknown distance metric.")
def __compatibility(self):
    """
    Check if MF model is compatible with the seeding method.

    The lower-cased string form of ``self.seed`` must be contained in
    ``self.aseeds``.

    :raises utils.MFError: If the seeding method is not among ``self.aseeds``.
    """
    seed_name = str(self.seed).lower()
    if seed_name in self.aseeds:
        return
    raise utils.MFError(
        "MF model is incompatible with the seeding method.")