def learnModel(self, X):
    """
    Learn X using a matrix factorisation method. If self.rank is an integer
    then we factorise with that rank. If it is an array then we compute the
    complete regularisation path and return a list of matrices.
    """
    if isinstance(self.rank, int):
        model = nimfa.mf(X, method=self.method, max_iter=self.maxIter, rank=self.rank)
        fit = nimfa.mf_run(model)
        W = fit.basis()
        H = fit.coef()
        predX = W.dot(H)
        return predX
    else:
        predXList = []
        model = nimfa.mf(X, method=self.method, max_iter=self.maxIter, rank=self.rank[0])
        fit = nimfa.mf_run(model)
        W = fit.basis()
        H = fit.coef()
        predXList.append(W.dot(H))

        # Warm-start each subsequent rank with the factors from the previous one
        for i in range(1, self.rank.shape[0]):
            model = nimfa.mf(X, method=self.method, max_iter=self.maxIter,
                             rank=self.rank[i], W=W, H=H)
            fit = nimfa.mf_run(model)
            W = fit.basis()
            H = fit.coef()
            predXList.append(W.dot(H))

        return predXList
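# Usage sketch for learnModel above, assuming a hypothetical wrapper class
# (here called SomeFactoriser) that stores `method`, `maxIter` and `rank` as
# attributes: switching `rank` between an int and a numpy array selects a
# single-rank factorisation or the full regularisation path.
import numpy as np

mfw = SomeFactoriser(method='nmf', maxIter=50)   # hypothetical class exposing learnModel
X = np.abs(np.random.rand(20, 10))               # nonnegative target matrix
mfw.rank = 5
predX = mfw.learnModel(X)                        # one reconstruction W.dot(H)
mfw.rank = np.array([2, 4, 6])
predXList = mfw.learnModel(X)                    # one reconstruction per rank, warm-started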
def run_nmf(V):
    """
    Run standard nonnegative matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    # Euclidean
    rank = 10
    model = nimfa.mf(V, seed="random_vcol", rank=rank, method="nmf", max_iter=12,
                     initialize_only=True, update='euclidean', objective='fro')
    fit = nimfa.mf_run(model)
    print_info(fit)

    # divergence
    model = nimfa.mf(V, seed="random_vcol", rank=rank, method="nmf", max_iter=12,
                     initialize_only=True, update='divergence', objective='div')
    fit = nimfa.mf_run(model)
    print_info(fit)
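# The run_* snippets in this collection call a `print_info` helper that is not
# defined here. A minimal sketch of such a helper, assuming only the fit-object
# calls already used elsewhere in these examples (fit.fit, fit.fit.n_iter,
# fit.distance, fit.coef); the real helper may print more measures.
def print_info(fit, idx=None):
    print("Factorization method: %s" % fit.fit)
    print("Initialization method: %s" % fit.fit.seed)
    print("Iterations performed: %d" % fit.fit.n_iter)
    print("Euclidean distance: %5.3f" % fit.distance(metric='euclidean'))
    if idx is not None:
        # For multiple NMF (e.g. SNMNMF), idx selects which mixture matrix to inspect
        print("Mixture matrix %d shape: %s" % (idx, str(fit.coef(idx).shape)))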
def nmf(Xtrn, Xtst):
    # Init matrices
    Xtrn_n = np.shape(Xtrn)[0]
    Xtst_n = np.shape(Xtst)[0]
    Xtrn_nmf = np.zeros((Xtrn_n, my_rank))
    Xtst_nmf = np.zeros((Xtst_n, my_rank))

    print(file_name + ': Running non-negative matrix factorization w/ rank = ' + str(my_rank))
    # Xtrn_fctr = nimfa.mf(Xtrn, method='nmf', seed="fixed", max_iter=iters,
    #                      rank=my_rank, update='euclidean', objective='fro')

    print(file_name + ': \t on training...')
    for i in range(Xtrn_n):
        Xtrn_fctr = nimfa.mf(Xtrn[i, :], method='lsnmf', max_iter=iters, rank=my_rank)
        Xtrn_res = nimfa.mf_run(Xtrn_fctr)
        Xtrn_nmf[i, :] = Xtrn_res.basis()
        if i % 10000 == 0:
            print(file_name + ': \t iter ' + str(i))

    print(file_name + ': \t on testing...')
    for i in range(Xtst_n):
        Xtst_fctr = nimfa.mf(Xtst[i, :], method='lsnmf', max_iter=iters, rank=my_rank)
        # Bug fix: the original ran mf_run on Xtrn_fctr here, re-using the last
        # training model for every test row.
        Xtst_res = nimfa.mf_run(Xtst_fctr)
        Xtst_nmf[i, :] = Xtst_res.basis()
        if i % 10000 == 0:
            print(file_name + ': \t iter ' + str(i))

    """
    Xtrn_sm = Xtrn_res.summary()
    Xtst_sm = Xtst_res.summary()
    print(file_name + ': \t\t RSS \t Explained Var \t Iters')
    print(file_name + ': Xtrn: \t' + str(Xtrn_sm['rss']) + '\t' + str(Xtrn_sm['evar']) + '\t' + str(Xtrn_sm['n_iter']))
    print(file_name + ': Xtst: ' + str(Xtst_sm['rss']) + '\t' + str(Xtst_sm['evar']) + '\t' + str(Xtst_sm['n_iter']))
    """
    return (Xtrn_nmf, Xtst_nmf)
def factorize(V):
    """
    Perform SNMF/R factorization on the sparse MovieLens data matrix.

    Return basis and mixture matrices of the fitted factorization model.

    :param V: The MovieLens data matrix.
    :type V: `scipy.sparse.csr_matrix`
    """
    model = nimfa.mf(V, seed="random_vcol", rank=12, method="snmf", max_iter=15,
                     initialize_only=True, version='r', eta=1., beta=1e-4,
                     i_conv=10, w_min_change=0)
    print("Performing %s %s %d factorization ..." % (model, model.seed, model.rank))
    fit = nimfa.mf_run(model)
    print("... Finished")
    sparse_w, sparse_h = fit.fit.sparseness()
    print("""Stats:
    - iterations: %d
    - Euclidean distance: %5.3f
    - Sparseness basis: %5.3f, mixture: %5.3f""" % (fit.fit.n_iter,
                                                    fit.distance(metric='euclidean'),
                                                    sparse_w, sparse_h))
    return fit.basis(), fit.coef()
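# A quick way to exercise factorize above without the MovieLens files, assuming
# numpy/scipy are available; the real script builds V from the ratings data.
import numpy as np
import scipy.sparse as sps

V = sps.csr_matrix(np.abs(np.random.rand(50, 30)))
W, H = factorize(V)
print(W.shape, H.shape)   # expected: (50, 12) and (12, 30)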
def run(self, seed='random_vcol', method='nmf', rank=3, max_iter=65,
        display_N_tokens=5, display_N_documents=3):
    # Re-initialise clusters
    if self.clusters != []:
        self.clusters = []
    # We cannot perform PCA with NMF because we only want non-negative vectors
    self.construct_term_doc_matrix(pca=False)
    V = self.td_matrix
    model = nimfa.mf(V, seed=seed, method=method, rank=rank, max_iter=max_iter)
    fitted = nimfa.mf_run(model)
    w = fitted.basis()
    h = fitted.coef()
    self.split_documents(w, h, self.document_dict, self.attributes,
                         display_N_tokens=display_N_tokens,
                         display_N_documents=display_N_documents)
    # Just testing, remove it
    self.showfeatures(w, h,
                      [self.document_dict.values()[i]["raw"]
                       for i in range(numpy.shape(w)[0])],
                      self.attributes)
def decompose_nmf(spectrum_array, n_spectra):
    # `n` is numpy, imported as n in the original module
    print('\nDecomposing spectra using NMF...')
    fctr = nimfa.mf(spectrum_array, method="nmf", max_iter=10000, rank=n_spectra,
                    update='divergence', objective='div')
    fctr_res = nimfa.mf_run(fctr)
    a = n.transpose(n.array(fctr_res.basis()))
    coeffs = n.array(fctr_res.coef())
    return a, coeffs
def factorization(V, rank=4):
    """
    Use NMF to factorize V.

    :rtype: (1) the projection matrix (2) the feature vector of V
    """
    fctr = nimfa.mf(
        V,
        method="nmf",
        max_iter=30,
        rank=rank,
        update="divergence",
        objective="div",
        callback_init=init_info,
        callback=init_info,
    )
    fctr_res = nimfa.mf_run(fctr)
    print("calculate generalized inverse")
    projection = pinv(fctr_res.basis().todense())
    print("inverse finished")
    return {
        "projection": projection,
        "feature": (projection * V),
        "basis": fctr_res.basis(),
        "coef": fctr_res.coef().todense(),
    }
def factorize(V):
    """
    Perform LSNMF factorization on the ORL faces data matrix.

    Return basis and mixture matrices of the fitted factorization model.

    :param V: The ORL faces data matrix.
    :type V: `numpy.matrix`
    """
    model = nimfa.mf(V, seed="random_vcol", rank=25, method="lsnmf", max_iter=50,
                     initialize_only=True, sub_iter=10, inner_sub_iter=10,
                     beta=0.1, min_residuals=1e-8)
    print("Performing %s %s %d factorization ..." % (model, model.seed, model.rank))
    fit = nimfa.mf_run(model)
    print("... Finished")
    print("""Stats:
    - iterations: %d
    - final projected gradients norm: %5.3f
    - Euclidean distance: %5.3f""" % (fit.fit.n_iter, fit.distance(),
                                      fit.distance(metric='euclidean')))
    return fit.basis(), fit.coef()
def run(self, **params):
    if not self.dataConsolided:
        print("NIMFA_SNMNMF: preparing data")
        self.consolideTheData()
        self.dataConsolided = True
    print("NIMFA_SNMNMF: starting")

    V = self.miRNA.as_matrix()
    V1 = self.mRNA.as_matrix()
    A = csr_matrix(self.gene2gene)
    B = csr_matrix(self.miRNA2gene)
    fctr = nimfa.mf(target=(V, V1),
                    seed=params['seed'],          # e.g., "random_c"
                    rank=params['rank'],          # e.g., 50
                    method="snmnmf",
                    max_iter=params['max_iter'],  # e.g., 500
                    initialize_only=True,
                    A=A, B=B,
                    n_run=3,
                    gamma=self.g1, gamma_1=self.g2,
                    lamb=self.l1, lamb_1=self.l2)
    fctr_res = nimfa.mf_run(fctr)
    print("NIMFA_SNMNMF: done")

    # extract the results
    self.W = DataFrame(fctr_res.basis(), index=self.miRNA.index)
    self.H1_miRNA = DataFrame(fctr_res.coef(0), columns=self.miRNA.columns)
    self.H2_genes = DataFrame(fctr_res.coef(1), columns=self.mRNA.columns)
    self.performance = NIMFA_SNMNMFPerformance(fctr_res)
def nmfMatrix(self, V):
    print("---")
    print("NMF")
    print("---")
    V = np.array(V)
    print("Target matrix")
    print(V)
    fctr = nimfa.mf(V, seed='random_vcol', method='lsnmf', rank=40, max_iter=10)
    fctr_res = nimfa.mf_run(fctr)
    W = fctr_res.basis()
    print("Basis matrix")
    print(W)
    H = fctr_res.coef()
    print("Coef")
    print(H)
    print("Estimate")
    print(np.dot(W, H))
    print('Rss: %5.4f' % fctr_res.fit.rss())
    print('Evar: %5.4f' % fctr_res.fit.evar())
    print('K-L divergence: %5.4f' % fctr_res.distance(metric='kl'))
    print('Sparseness, W: %5.4f, H: %5.4f' % fctr_res.fit.sparseness())
    return W, H
def factorize(V):
    """
    Perform NMF - Divergence factorization on the sparse Medlars data matrix.

    Return basis and mixture matrices of the fitted factorization model.

    :param V: The Medlars data matrix.
    :type V: `scipy.sparse.csr_matrix`
    """
    model = nimfa.mf(V, seed="random_vcol", rank=12, method="nmf", max_iter=15,
                     initialize_only=True, update='divergence', objective='div')
    print("Performing %s %s %d factorization ..." % (model, model.seed, model.rank))
    fit = nimfa.mf_run(model)
    print("... Finished")
    sparse_w, sparse_h = fit.fit.sparseness()
    print("""Stats:
    - iterations: %d
    - KL Divergence: %5.3f
    - Euclidean distance: %5.3f
    - Sparseness basis: %5.3f, mixture: %5.3f""" % (fit.fit.n_iter, fit.distance(),
                                                    fit.distance(metric='euclidean'),
                                                    sparse_w, sparse_h))
    return fit.basis(), fit.coef()
def run_snmnmf(V, V1):
    """
    Run sparse network-regularized multiple NMF.

    :param V: First target matrix to estimate.
    :type V: :class:`numpy.matrix`
    :param V1: Second target matrix to estimate.
    :type V1: :class:`numpy.matrix`
    """
    rank = 10
    model = nimfa.mf(target=(V, V1),
                     seed="random_c",
                     rank=rank,
                     method="snmnmf",
                     max_iter=12,
                     initialize_only=True,
                     A=abs(sp.rand(V1.shape[1], V1.shape[1], density=0.7, format='csr')),
                     B=abs(sp.rand(V.shape[1], V1.shape[1], density=0.7, format='csr')),
                     gamma=0.01, gamma_1=0.01, lamb=0.01, lamb_1=0.01)
    fit = nimfa.mf_run(model)
    # print all quality measures concerning the first target and mixture matrix
    print_info(fit, idx=0)
    # print all quality measures concerning the second target and mixture matrix
    print_info(fit, idx=1)
def factorize(V):
    """
    Perform LSNMF factorization on the CBCL faces data matrix.

    Return basis and mixture matrices of the fitted factorization model.

    :param V: The CBCL faces data matrix.
    :type V: `numpy.matrix`
    """
    model = nimfa.mf(V, seed="random_vcol", rank=49, method="lsnmf", max_iter=50,
                     initialize_only=True, sub_iter=10, inner_sub_iter=10,
                     beta=0.1, min_residuals=1e-8)
    print("Performing %s %s %d factorization ..." % (model, model.seed, model.rank))
    fit = nimfa.mf_run(model)
    print("... Finished")
    sparse_w, sparse_h = fit.fit.sparseness()
    print("""Stats:
    - iterations: %d
    - final projected gradients norm: %5.3f
    - Euclidean distance: %5.3f
    - Sparseness basis: %5.3f, mixture: %5.3f""" % (fit.fit.n_iter, fit.distance(),
                                                    fit.distance(metric='euclidean'),
                                                    sparse_w, sparse_h))
    return fit.basis(), fit.coef()
def run_bd(V):
    """
    Run Bayesian decomposition.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    model = nimfa.mf(V, seed="random_c", rank=rank, method="bd", max_iter=12,
                     initialize_only=True,
                     alpha=np.mat(np.zeros((V.shape[0], rank))),
                     beta=np.mat(np.zeros((rank, V.shape[1]))),
                     theta=.0, k=.0, sigma=1., skip=100, stride=1,
                     n_w=np.mat(np.zeros((rank, 1))),
                     n_h=np.mat(np.zeros((rank, 1))),
                     n_sigma=False)
    fit = nimfa.mf_run(model)
    print_info(fit)
def run(self, **params):
    if not self.dataConsolided:
        print("NIMFA_SNMNMF: preparing data")
        self.consolideTheData()
        self.dataConsolided = True
    print("NIMFA_SNMNMF: starting")

    V = self.miRNA.as_matrix()
    V1 = self.mRNA.as_matrix()
    A = csr_matrix(self.gene2gene)
    B = csr_matrix(self.miRNA2gene)
    fctr = nimfa.mf(target=(V, V1),
                    seed=params['seed'],          # e.g., "random_c"
                    rank=params['rank'],          # e.g., 50
                    method="snmnmf",
                    max_iter=params['max_iter'],  # e.g., 500
                    initialize_only=True,
                    A=A, B=B,
                    n_run=1,
                    gamma=self.g1, gamma_1=self.g2,
                    lamb=self.l1, lamb_1=self.l2)
    fctr_res = nimfa.mf_run(fctr)
    print("NIMFA_SNMNMF: done")

    # extract the results
    self.W = DataFrame(fctr_res.basis(), index=self.miRNA.index)
    self.H1_miRNA = DataFrame(fctr_res.coef(0), columns=self.miRNA.columns)
    self.H2_genes = DataFrame(fctr_res.coef(1), columns=self.mRNA.columns)
    self.performance = NIMFA_SNMNMFPerformance(fctr_res)
def nmf(matrix, k=c_K):
    fctr = nimfa.mf(matrix, seed='random_vcol', method='lsnmf', rank=k,
                    max_iter=c_NMF_MAXITR)
    fctr_result = nimfa.mf_run(fctr)
    return fctr_result.basis(), fctr_result.coef()
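# Usage sketch for the nmf() wrapper above. c_K and c_NMF_MAXITR are
# module-level constants in the original code; the values below are
# illustrative assumptions.
import numpy as np

c_K = 10
c_NMF_MAXITR = 50
W, H = nmf(np.abs(np.random.rand(100, 40)))
print(W.shape, H.shape)   # (100, 10) and (10, 40)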
def nmfMatrix(self, V, method, rank, maxIter):
    print("---")
    print("NMF")
    print("---")
    V = np.array(V)
    print("Target matrix")
    print(V.shape[0])
    print(V.shape[1])
    print(V)
    # X = sp.rand(V.shape[0], V.shape[1], density=1).tocsr()

    # Settings for NMF: number of basis vectors and iterations
    # rank = 8
    # maxIter = 2000
    # method = "snmf"

    # init2arizer = nimfa.methods.seeding.random_vcol.Random_vcol()
    initializer = nimfa.methods.seeding.random.Random()
    initW, initH = initializer.initialize(V, rank, {})

    fctr = nimfa.mf(V, seed='random_vcol', method=method, rank=rank, max_iter=maxIter)
    # fctr = nimfa.mf(V, method="lsnmf", rank=rank, max_iter=maxIter, W=initW, H=initH)
    fctr_res = nimfa.mf_run(fctr)

    W = fctr_res.basis()
    print("Basis matrix")
    print(W.shape[0])
    print(W.shape[1])
    print(W)
    H = fctr_res.coef()
    print("Coef")
    print(H.shape[0])
    print(H.shape[1])
    print(H)
    print("Estimate")
    print(np.dot(W, H))
    print('Rss: %5.4f' % fctr_res.fit.rss())
    print('Evar: %5.4f' % fctr_res.fit.evar())
    print('K-L divergence: %5.4f' % fctr_res.distance(metric='kl'))
    print('Sparseness, W: %5.4f, H: %5.4f' % fctr_res.fit.sparseness())

    sm = fctr_res.summary()
    print(type(sm))
    # print("Rss: %8.3f" % sm['rss'])
    # # Print explained variance.
    # print("Evar: %8.3f" % sm['evar'])
    # # Print actual number of iterations performed
    # print("Iterations: %d" % sm['n_iter'])

    # Convert to numpy.ndarray, since np.matrix causes problems when plotting
    NW = np.asarray(W)
    NH = np.asarray(H)
    return NW, NH, sm
def _NIMFA_NMF(self, X, nBases):
    model = nimfa.mf(X, seed="nndsvd", rank=nBases, method="nmf",
                     initialize_only=True)
    fit = nimfa.mf_run(model)
    W = fit.basis()
    H = fit.coef()
    self.W = W.todense()
    self.H = H.todense()
    return (self.W, self.H)
def cluster_nmf(vectors, num_clusters):
    """
    Takes in vectors and clusters them using Non Negative Matrix Factorization.

    Inputs:
    vectors -- matrix containing rows of vectors
    num_clusters -- number of clusters to create
    """
    print("Starting NMF clustering")
    start_time = time.time()

    # Run NMF: after transposing, columns are the data points
    vectors_matrix = numpy.matrix(vectors)
    vectors_matrix = vectors_matrix.transpose()
    print("Created vectors_matrix")

    # Generate random matrix factors which we will pass as fixed factors to nimfa
    init_W = numpy.random.rand(vectors_matrix.shape[0], num_clusters)
    init_H = numpy.random.rand(num_clusters, vectors_matrix.shape[1])
    print("Generated random matrix factors")

    fctr = nimfa.mf(vectors_matrix, method="nmf", seed="fixed", W=init_W,
                    H=init_H, rank=num_clusters)
    fctr_res = nimfa.mf_run(fctr)
    print("NIMFA")

    # Basis matrix
    W = fctr_res.basis()
    # Mixture matrix
    H = fctr_res.coef()
    print("Extracted Basis and Mixture matrices")

    # Assign each data point to the cluster with the largest mixture weight
    assignment = []
    for index in range(H.shape[1]):
        column = list(H[:, index])
        assignment.append(column.index(max(column)))
    print("Assignments extracted")

    # Print the loss function (Euclidean distance between target matrix and its estimate)
    print("Euclidean distance: %5.3e" % fctr_res.distance(metric="euclidean"))

    end_time = time.time()
    print("Clustering required", (end_time - start_time), "seconds")
    return assignment
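# Usage sketch for cluster_nmf above (synthetic data): each row of `vectors` is
# one data point; the function transposes so that H's columns correspond to
# points and assigns each point to the cluster with the largest mixture weight.
import numpy

vectors = numpy.abs(numpy.random.rand(200, 30))
labels = cluster_nmf(vectors, num_clusters=4)
print(len(labels))   # 200, one cluster index per input vector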
def max_guess_select(ratings, users, rank=9, user=None):
    matrix = sp.dok_matrix((len(users), len(users)))
    for k, v in ratings.items():
        matrix[users[k[0]], users[k[1]]] = v
        matrix[users[k[1]], users[k[0]]] = v

    # Run sparse matrix factorisation
    factor = nimfa.mf(matrix, seed="random_c", rank=rank, method="snmf",
                      max_iter=12, initialize_only=True, version='r', eta=1.,
                      beta=1e-4, i_conv=10, w_min_change=0)
    result = nimfa.mf_run(factor)

    if user is None:
        # Pick a user to expand
        user = min(users, key=lambda u: len([i for i in ratings if u in i]))

    recommendations = result.fitted()
    rval = max([i for i in users
                if (i, user) not in ratings and (user, i) not in ratings],
               key=lambda x: recommendations[users[user], users[x]])
    return user, rval
def fit(self, k=100, max_iter=15, method='lsnmf'):
    if self.recommender_data.preference_matrix.shape[1] < k:
        k = self.recommender_data.preference_matrix.shape[1]
    model = nimfa.mf(self.recommender_data.preference_matrix,
                     seed="random_vcol", rank=k, method=method,
                     max_iter=max_iter)
    fit = nimfa.mf_run(model)
    self.user_matrix = fit.basis().todense()
    self.item_matrix = fit.coef().todense()
def _factorize(matrix):
    "Factorize the matrix to get pc"
    # Build the model
    model = mf(matrix,
               seed="random_vcol",
               rank=15,
               method="nmf",
               max_iter=15,
               initialize_only=True,
               update='divergence',
               objective='div')
    # Then fit it
    fit = mf_run(model)
    return fit.basis(), fit.coef()
def run_nmf():
    file_name = inspect.getfile(inspect.currentframe())

    # Read in pre-processed matrices
    print(file_name + ': Reading train/test matrix w/ dim = ' + f_in_trn)
    Xtrn = ensure_dim(np.loadtxt(open(f_in_trn, 'rb'), delimiter=',', skiprows=0))
    Xtst = ensure_dim(np.loadtxt(open(f_in_tst, 'rb'), delimiter=',', skiprows=0))

    # Run nmf (the original only built the model and never ran it; mf_run added here)
    print(file_name + ': Running non-negative matrix factorization w/ rank = ' + str(my_rank))
    nmf = nimfa.mf(Xtrn, method='nmf', max_iter=iters, rank=my_rank)
    fit = nimfa.mf_run(nmf)

    # Output submission (Ytst is assumed to be assembled elsewhere from the fit)
    print(file_name + ': Saving csv to ' + f_out)
    colfmt = ['%i'] + ['%f'] * (Ytst.shape[1] - 1)
    np.savetxt(f_out, Ytst, delimiter=',', fmt=colfmt)
def run_snmf(V):
    """
    Run sparse nonnegative matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    # SNMF/R
    rank = 10
    model = nimfa.mf(V, seed="random_c", rank=rank, method="snmf", max_iter=12,
                     initialize_only=True, version='r', eta=1., beta=1e-4,
                     i_conv=10, w_min_change=0)
    fit = nimfa.mf_run(model)
    print_info(fit)

    # SNMF/L
    model = nimfa.mf(V, seed="random_vcol", rank=rank, method="snmf", max_iter=12,
                     initialize_only=True, version='l', eta=1., beta=1e-4,
                     i_conv=10, w_min_change=0)
    fit = nimfa.mf_run(model)
    print_info(fit)
def factor_eval(data, ranks, nrun=40, method="nmf", max_iter=2000):
    coefs = []
    for rank in ranks:
        fctr = nimfa.mf(data, method=method, max_iter=max_iter, rank=rank,
                        n_run=nrun, track_factor=True)
        fctr_res = nimfa.mf_run(fctr)
        sm = fctr_res.summary()
        coef = sm['cophenetic']
        print(coef)
        coefs.append(coef)
    return coefs
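# Rank-selection sketch using factor_eval above: the cophenetic correlation
# coefficient measures the stability of the clustering across runs, and a
# common heuristic (Brunet et al., 2004) is to prefer the largest rank before
# the coefficient drops sharply. Toy data and parameters below are illustrative.
import numpy as np

data = np.abs(np.random.rand(40, 20))
ranks = [2, 3, 4, 5]
coefs = factor_eval(data, ranks, nrun=10, max_iter=200)
for rank, coef in zip(ranks, coefs):
    print("rank %d -> cophenetic %.3f" % (rank, coef))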
def run_one(V, rank):
    """
    Run standard NMF on the leukemia data set. 50 runs of Standard NMF are
    performed and the obtained consensus matrix averages all 50 connectivity
    matrices.

    :param V: Target matrix with gene expression data.
    :type V: `numpy.matrix` (of course it could be any format of scipy.sparse,
        but we will use numpy here)
    :param rank: Factorization rank.
    :type rank: `int`
    """
    print("================= Rank = %d =================" % rank)
    consensus = np.mat(np.zeros((V.shape[1], V.shape[1])))
    for i in range(50):
        # Standard NMF with Euclidean update equations is used. For initialization
        # the random Vcol method is used. The objective function is the number of
        # consecutive iterations in which the connectivity matrix has not changed.
        # We demand that factorization does not terminate before 40 consecutive
        # iterations in which the connectivity matrix does not change. As a backup
        # we also specify the maximum number of iterations. Note that satisfying
        # one stopping criterion terminates the run (there is no chance of
        # divergence).
        model = nimfa.mf(V,
                         method="nmf",
                         rank=rank,
                         seed="random_vcol",
                         max_iter=200,
                         update="euclidean",
                         objective="conn",
                         conn_change=40,
                         initialize_only=True)
        fit = nimfa.mf_run(model)
        print("%2d / 50 :: %s - init: %s ran with ... %3d / 200 iters ..." % (
            i + 1, fit.fit, fit.fit.seed, fit.fit.n_iter))
        # Compute the connectivity matrix of the factorization. Alternatively, we
        # could use the multiple-runs support of the nimfa library, track the
        # factorization model across 50 runs and then just call fit.consensus().
        consensus += fit.fit.connectivity()

    # average the connectivity matrices
    consensus /= 50.
    # reorder the consensus matrix
    p_consensus = reorder(consensus)
    # plot the reordered consensus matrix
    plot(p_consensus, rank)
def run_nsnmf(V):
    """
    Run nonsmooth nonnegative matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    model = nimfa.mf(V, seed="random", rank=rank, method="nsnmf", max_iter=12,
                     initialize_only=True, theta=0.5)
    fit = nimfa.mf_run(model)
    print_info(fit)
def run_pmf(V):
    """
    Run probabilistic matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    model = nimfa.mf(V, seed="random_vcol", rank=rank, method="pmf", max_iter=12,
                     initialize_only=True, rel_error=1e-5)
    fit = nimfa.mf_run(model)
    print_info(fit)
def run_bmf(V):
    """
    Run binary matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    model = nimfa.mf(V, seed="random_vcol", rank=rank, method="bmf", max_iter=12,
                     initialize_only=True, lambda_w=1.1, lambda_h=1.1)
    fit = nimfa.mf_run(model)
    print_info(fit)
def run_psmf(V):
    """
    Run probabilistic sparse matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    prng = np.random.RandomState()
    model = nimfa.mf(V, seed=None, rank=rank, method="psmf", max_iter=12,
                     initialize_only=True,
                     prior=prng.uniform(low=0., high=1., size=10))
    fit = nimfa.mf_run(model)
    print_info(fit)
def nmf(X, method='sklearn', **nmfparams):
    """
    Calculates the non-negative matrix factorization of an input matrix
    """
    # TODO: Documentation
    if method == 'sklearn':
        model = NMF(**nmfparams)
        # Bug fix: fit_transform returns the basis W; components_ is the mixture H
        W = model.fit_transform(X)
        H = model.components_
    elif method == 'nimfa':
        model_tmp = nimfa.mf(X, **nmfparams)
        model = nimfa.mf_run(model_tmp)
        H = model.coef()
        W = model.basis()
    return (H, W, model)
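# Usage sketch for the dual-backend nmf() wrapper above: keyword arguments
# after `method` are forwarded unchanged to the chosen backend, so they must
# match that backend's API (sklearn's NMF takes n_components; nimfa.mf takes
# rank/max_iter). Data below is illustrative.
import numpy as np

X = np.abs(np.random.rand(60, 20))
H, W, model = nmf(X, method='sklearn', n_components=5)
H2, W2, model2 = nmf(X, method='nimfa', rank=5, max_iter=30)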
def run_lfnmf(V):
    """
    Run local fisher nonnegative matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    prng = np.random.RandomState()
    model = nimfa.mf(V, seed=None,
                     W=abs(prng.randn(V.shape[0], rank)),
                     H=abs(prng.randn(rank, V.shape[1])),
                     rank=rank, method="lfnmf", max_iter=12,
                     initialize_only=True, alpha=0.01)
    fit = nimfa.mf_run(model)
    print_info(fit)
def run_lsnmf(V):
    """
    Run least squares nonnegative matrix factorization.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    model = nimfa.mf(V, seed="random_vcol", rank=rank, method="lsnmf", max_iter=12,
                     initialize_only=True, sub_iter=10, inner_sub_iter=10,
                     beta=0.1, min_residuals=1e-5)
    fit = nimfa.mf_run(model)
    print_info(fit)
def run_factorization(data):
    fctr = nimfa.mf(data, seed="random_c", rank=15, method="snmf", max_iter=50,
                    initialize_only=True, version='r', eta=1., beta=1e-4,
                    i_conv=10, w_min_change=0)
    fctr_res = nimfa.mf_run(fctr)

    np.set_printoptions(precision=3)
    np.set_printoptions(suppress=True)

    # Basis matrix. It is sparse, as the input data was sparse as well.
    W = fctr_res.basis()
    # print("Basis matrix")
    # print(W.todense())
    # print(W.shape)

    # Mixture matrix. We print this tiny matrix in dense format.
    H = fctr_res.coef()
    # print("Coef")
    # print(H.todense())
    # print(H.shape)

    # Report the loss function according to Kullback-Leibler divergence
    # (by default the Euclidean metric is used).
    print("Distance Kullback-Leibler: %5.3e" % fctr_res.distance(metric="kl"))

    # Compute a generic set of measures to evaluate the quality of the factorization
    sm = fctr_res.summary()
    # Print sparseness (Hoyer, 2004) of basis and mixture matrix
    print("Sparseness Basis: %5.3f  Mixture: %5.3f" % (sm['sparseness'][0], sm['sparseness'][1]))
    # Print actual number of iterations performed
    print("Iterations: %d" % sm['n_iter'])

    # Compare the estimate against the observed entries of the target matrix.
    # Note: this accumulates the sum of squared errors over observed entries;
    # the original labelled it RMSE without taking the mean or the square root.
    data_fact = np.dot(W.todense(), H.todense())
    sse = 0.0
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            if data[i, j] > 0:
                print(data[i, j], data_fact[i, j])
                sse += (data[i, j] - data_fact[i, j]) ** 2
    print("Sum of squared errors:", sse)
    print(data, data_fact)
def factorize(data):
    """
    Perform factorization on the S. cerevisiae FunCat annotated sequence data
    set (D1 FC seq).

    Return factorized data, that is, the matrix factors resulting from the
    factorization (basis and mixture matrix).

    :param data: Transformed data set containing attributes' values, class
        information and possibly additional meta information.
    :type data: `tuple`
    """
    V = data['attr']
    # model = nimfa.mf(V, seed="random_vcol", rank=40, method="nmf", max_iter=75,
    #                  initialize_only=True, update='euclidean', objective='fro')
    model = nimfa.mf(V, seed="random_vcol", rank=40, method="snmf", max_iter=5,
                     initialize_only=True, version='l', eta=1., beta=1e-4,
                     i_conv=10, w_min_change=0)
    print("Performing %s %s %d factorization ..." % (model, model.seed, model.rank))
    fit = nimfa.mf_run(model)
    print("... Finished")
    sparse_w, sparse_h = fit.fit.sparseness()
    print("""Stats:
    - iterations: %d
    - KL Divergence: %5.3f
    - Euclidean distance: %5.3f
    - Sparseness basis: %5.3f, mixture: %5.3f""" % (fit.fit.n_iter, fit.distance(),
                                                    fit.distance(metric='euclidean'),
                                                    sparse_w, sparse_h))
    data['W'] = fit.basis()
    data['H'] = fit.coef()
    return data
def run_one(V, rank):
    """
    Run standard NMF on the medulloblastoma data set. 50 runs of Standard NMF
    are performed and the obtained consensus matrix averages all 50
    connectivity matrices.

    :param V: Target matrix with gene expression data.
    :type V: `numpy.matrix` (of course it could be any format of scipy.sparse,
        but we will use numpy here)
    :param rank: Factorization rank.
    :type rank: `int`
    """
    print("================= Rank = %d =================" % rank)
    consensus = np.mat(np.zeros((V.shape[1], V.shape[1])))
    for i in range(50):
        # Standard NMF with Euclidean update equations is used. For initialization
        # the random Vcol method is used. The objective function is the number of
        # consecutive iterations in which the connectivity matrix has not changed.
        # We demand that factorization does not terminate before 40 consecutive
        # iterations in which the connectivity matrix does not change. As a backup
        # we also specify the maximum number of iterations. Note that satisfying
        # one stopping criterion terminates the run (there is no chance of
        # divergence).
        model = nimfa.mf(V, method="nmf", rank=rank, seed="random_vcol",
                         max_iter=200, update='euclidean', objective='conn',
                         conn_change=40, initialize_only=True)
        fit = nimfa.mf_run(model)
        print("%2d / 50 :: %s - init: %s ran with ... %3d / 200 iters ..." % (
            i + 1, fit.fit, fit.fit.seed, fit.fit.n_iter))
        # Compute the connectivity matrix of the factorization. Alternatively, we
        # could use the multiple-runs support of the nimfa library, track the
        # factorization model across 50 runs and then just call fit.consensus().
        consensus += fit.fit.connectivity()

    # average the connectivity matrices
    consensus /= 50.
    # reorder the consensus matrix
    p_consensus = reorder(consensus)
    # plot the reordered consensus matrix
    plot(p_consensus, rank)
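# `reorder` and `plot` are used by run_one above but not defined in this
# collection. A minimal sketch of a reorder helper, assuming the usual approach
# of permuting the consensus matrix by hierarchical clustering of 1 - C
# (scipy); the original may differ in linkage method or ordering direction.
import numpy as np
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import squareform

def reorder(C):
    """Reorder a consensus matrix by average-linkage hierarchical clustering."""
    Y = 1 - np.asarray(C)
    Z = linkage(squareform(Y, checks=False), method='average')
    ivl = leaves_list(Z)
    return np.asarray(C)[ivl, :][:, ivl]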
def run(self):
    # TODO: estimate rank
    self.mask_bed()
    NMF_Run.run(self)
    import nimfa
    print(repr(self.masked_matrix))
    print(repr(self.masked_matrix.shape))
    self.fctr = nimfa.mf(numpy.matrix(self.masked_matrix),
                         seed="nndsvd",
                         rank=self.nmf_rank,
                         method="bmf",
                         max_iter=self.max_iter,
                         initialize_only=True,
                         lambda_w=1.1,
                         lambda_h=1.1)
    self.fctr_res = nimfa.mf_run(self.fctr)
    print('Rss: %5.4f' % self.fctr_res.fit.rss())
    print('Evar: %5.4f' % self.fctr_res.fit.evar())
    print('K-L divergence: %5.4f' % self.fctr_res.distance(metric='kl'))
    print('Sparseness, W: %5.4f, H: %5.4f' % self.fctr_res.fit.sparseness())
    print('Iteration: %d' % self.fctr_res.n_iter)
def probability_select(ratings, users, rank=9, user=None):
    matrix = sp.dok_matrix((len(users), len(users)))
    for k, v in ratings.items():
        matrix[users[k[0]], users[k[1]]] = v
        matrix[users[k[1]], users[k[0]]] = v

    # Run sparse matrix factorisation
    factor = nimfa.mf(matrix, seed="random_c", rank=rank, method="snmf",
                      max_iter=12, initialize_only=True, version='r', eta=1.,
                      beta=1e-4, i_conv=10, w_min_change=0)
    result = nimfa.mf_run(factor)

    if len(ratings) >= len(users) ** 2:
        return  # all items expanded
    if user is None:
        # Pick a user to expand
        user = min(users, key=lambda u: len([i for i in ratings if u in i]))

    # Clusters (F)
    clusters = result.basis()
    # Matrix (M)
    recommendations = result.fitted()
    # All rated users (U)
    user_rated = {i[0]: ratings[i] for i in ratings if user == i[1]}
    user_rated.update({i[1]: ratings[i] for i in ratings if user == i[0]})
    # Affiliations (A)
    caff = [(sum(r * clusters[users[u], x] for u, r in user_rated.items()) + 1) /
            (len(user_rated) + 1) for x in range(rank)]
    # Confidence (d)
    conf = sum(sum(clusters[users[u], x] for u in user_rated)
               for x in range(rank)) / clusters.sum()
    # Cluster confidences (C)
    sums = clusters.sum(axis=0).tolist()[0]
    cconf = [sum(clusters[users[u], x] for u in user_rated) / sums[x]
             for x in range(rank)]
    cconf_norm = max(cconf) or 1
    cconf = [i / cconf_norm for i in cconf]
    # Find the user with the highest affinity to the cluster
    candidates = {i for i in users if i not in user_rated}
    candidate = max(candidates,
                    key=lambda x: conf * recommendations[users[user], users[x]] +
                    (1 - conf) * (sum((1 - cconf[i]) * caff[i] * clusters[users[x], i]
                                      for i in range(rank)) / rank))
    return user, candidate
def run_icm(V):
    """
    Run iterated conditional modes.

    :param V: Target matrix to estimate.
    :type V: :class:`numpy.matrix`
    """
    rank = 10
    prng = np.random.RandomState()
    model = nimfa.mf(V, seed="nndsvd", rank=rank, method="icm", max_iter=12,
                     initialize_only=True, iiter=20,
                     alpha=prng.randn(V.shape[0], rank),
                     beta=prng.randn(rank, V.shape[1]),
                     theta=0., k=0., sigma=1.)
    fit = nimfa.mf_run(model)
    print_info(fit)
def apply(self, X, k=2):
    """
    Apply NMF to the specified document-term matrix X.
    """
    import nimfa
    self.W = None
    self.H = None
    initialize_only = self.max_iters < 1
    if self.update == "euclidean":
        objective = "fro"
    else:
        objective = "div"
    alg = nimfa.mf(X,
                   method=self.method,
                   max_iter=self.max_iters,
                   rank=k,
                   seed=self.init_strategy,
                   update=self.update,
                   objective=objective,
                   initialize_only=initialize_only,  # the original computed this flag but never passed it
                   test_conv=self.test_conv)
    res = nimfa.mf_run(alg)
    # TODO: fix -- the dense conversion only applies when the factors are sparse
    try:
        self.W = res.basis().todense()
        self.H = res.coef().todense()
    except:
        self.W = res.basis()
        self.H = res.coef()
    # last number of iterations
    self.n_iter = res.n_iter
def clustered_select(ratings, users, rank=9, user=None):
    matrix = sp.dok_matrix((len(users), len(users)))
    for k, v in ratings.items():
        matrix[users[k[0]], users[k[1]]] = v
        matrix[users[k[1]], users[k[0]]] = v

    # Run sparse matrix factorisation
    factor = nimfa.mf(matrix, seed="random_c", rank=rank, method="snmf",
                      max_iter=12, initialize_only=True, version='r', eta=1.,
                      beta=1e-4, i_conv=10, w_min_change=0)
    result = nimfa.mf_run(factor)

    if len(ratings) >= len(users) ** 2:
        return  # all items expanded
    if user is None:
        # Pick a user to expand
        user = min(users, key=lambda u: len([i for i in ratings if u in i]))

    # Pick a cluster
    clusters = result.basis()
    # Select all rated users
    user_rated = {i[0]: ratings[i] for i in ratings if user == i[1]}
    user_rated.update({i[1]: ratings[i] for i in ratings if user == i[0]})
    # Maximise A_u(c)
    cluster = max(range(rank),
                  key=lambda x: (sum(r * clusters[users[u], x]
                                     for u, r in user_rated.items()) + 1) /
                                (len(user_rated) + 1))
    # Find the user with the highest affinity to the cluster
    candidates = {i for i in users if i not in user_rated}
    candidate = max(candidates, key=lambda x: clusters[users[x], cluster])
    return user, candidate
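# Shared usage sketch for the *_select strategies in this collection
# (max_guess_select, probability_select, clustered_select): `ratings` maps
# user pairs to scores and `users` maps user ids to matrix row indices.
# The tiny example below is illustrative.
users = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
ratings = {('a', 'b'): 4.0, ('b', 'c'): 2.5, ('c', 'd'): 3.5}
pair = probability_select(ratings, users, rank=2)
print(pair)   # (user chosen for expansion, suggested candidate)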
V = np.matrix([[1, 2, 3], [4, 5, 6], [6, 7, 8]])
print(V)

# Initialization callback function
def init_info(model):
    print("Initialized basis matrix\n", model.basis())
    print("Initialized mixture matrix\n", model.coef())

# ICM rank 3 algorithm
# We specify the callback_init parameter by passing the init_info function.
# The callback is called after initialization and prior to factorization in each run.
fctr = nimfa.mf(V, seed="random_c", method="icm", max_iter=10, rank=3,
                callback_init=init_info)
fctr_res = nimfa.mf_run(fctr)

# Basis matrix.
W = fctr_res.basis()
print("Resulting basis matrix")
print(W)

# Mixture matrix.
H = fctr_res.coef()
print("Resulting mixture matrix")
print(H)

sm = fctr_res.summary()