def matdecomp(imregion, method):
    """Compute matrix decomposition

    Parameters
    ----------
    imregion : 2D array
        The image region data
    method : str
        Options for method ('eigen', 'NMF')
    """
    if method == 'eigen':
        # columns are eigenvectors
        e_vals, e_vecs = LA.eig(imregion)
        return e_vecs
    if method == 'NMF':
        model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
        model.fit(imregion)
        comp = model.components_
        err = model.reconstruction_err_
        return comp
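# Minimal usage sketch (added, hypothetical data, not from the original source):
# assumes numpy.linalg is bound to LA and ProjectedGradientNMF is imported at
# module level, as matdecomp itself requires.
if __name__ == '__main__':
    import numpy as np
    demo_region = np.abs(np.random.rand(8, 8))
    print(matdecomp(demo_region, 'eigen').shape)  # (8, 8) matrix of eigenvectors
    print(matdecomp(demo_region, 'NMF').shape)    # (2, 8) matrix of NMF components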
def fit(self, trainSamples, trainTargets):
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    #print 'train user:' + str(self.dataModel.getUsersNum())
    V = self.dataModel.getData()
    model = ProjectedGradientNMF(n_components=self.factors, max_iter=1000, nls_max_iter=1000)
    self.pu = model.fit_transform(V)
    # note: this refits the model a second time before reading components_
    self.qi = model.fit(V).components_.transpose()
class NMF(method.Method):
    def __init__(self, params):
        self.params = params
        self.dec = ProjectedGradientNMF(**params)

    def __str__(self):
        return "Non-Negative matrix factorization by Projected Gradient (NMF)"

    def train(self, data):
        """
        Train the NMF on the whitened data

        :param data: whitened data, ready to use
        """
        self.dec.fit(data)

    def encode(self, data):
        """
        Encodes the ready to use data

        :returns: encoded data with dimension n_components
        """
        return self.dec.transform(data)

    def decode(self, components):
        """
        Decode the data to return whitened reconstructed data

        :returns: reconstructed data
        """
        return self.dec.inverse_transform(components)
def init_rois(self, n_components=100, show=False):
    Ain, Cin, center = greedyROI2d(self.Y, nr=n_components, gSig=[2, 2],
                                   gSiz=[7, 7], use_median=False)
    Cn = np.mean(self.Y, axis=-1)
    if show:
        pl1 = pl.imshow(Cn, interpolation='none')
        pl.colorbar()
        pl.scatter(x=center[:, 1], y=center[:, 0], c='m', s=40)
        pl.axis((-0.5, self.Y.shape[1] - 0.5, -0.5, self.Y.shape[0] - 0.5))
        pl.gca().invert_yaxis()
    active_pixels = np.squeeze(np.nonzero(np.sum(Ain, axis=1)))
    Yr = np.reshape(self.Y, (self.Y.shape[0] * self.Y.shape[1], self.Y.shape[2]), order='F')
    P = arpfit(Yr, p=2, pixels=active_pixels)
    Y_res = Yr - np.dot(Ain, Cin)
    model = ProjectedGradientNMF(n_components=1, init='random', random_state=0)
    model.fit(np.maximum(Y_res, 0))
    fin = model.components_.squeeze()
    self.Yr, self.Cin, self.fin, self.Ain, self.P, self.Cn = Yr, Cin, fin, Ain, P, Cn
def get_cluster_membership(self):
    """ Determine the cluster number that each sample is associated with. """
    model = ProjectedGradientNMF(n_components=self._num_clusters, init='random',
                                 beta=.3, eta=.5, max_iter=5000)
    w = model.fit_transform(self._matrix)
    h = model.components_
    # convert the 'H' matrix, which represents weights for our data matrix W, into
    # an array representing cluster membership. Index of biggest value in each
    # col of matrix H is the cluster
    clusters = []
    model_width = len(h[0])
    for col_idx in range(model_width):
        max_val = dict()
        for row_idx in range(self._num_clusters):
            h_val = h[row_idx][col_idx]
            if not max_val or h_val > max_val['val']:
                max_val = {'row_idx': row_idx, 'val': h_val}
        clusters.append(max_val['row_idx'])
    # clusters array, w, h
    return (clusters, w, h)
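# Equivalent vectorized sketch (added, not from the original): the per-column
# argmax loop above can be collapsed with numpy, e.g.
#     clusters = list(np.argmax(h, axis=0))
# which likewise returns the row index of the largest weight in each column of H.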
def _nmf(X, K):
    nmf = ProjectedGradientNMF(n_components=K, max_iter=1000)
    nmf.fit(X)
    B = nmf.components_
    # project X onto the learned basis via the pseudo-inverse rather than nmf.transform
    A = np.dot(X, np.linalg.pinv(B))
    return (A, B)
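# Minimal usage sketch (added, hypothetical data, not from the original source):
# factor a random non-negative matrix with _nmf and report the reconstruction error.
if __name__ == '__main__':
    import numpy as np
    X_demo = np.abs(np.random.rand(20, 8))
    A_demo, B_demo = _nmf(X_demo, K=3)
    print(np.linalg.norm(X_demo - A_demo.dot(B_demo)))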
def nmfModel(matrix, nTopics):
    t = time()
    print "Starting Factorization"
    nmf = ProjectedGradientNMF(nTopics, max_iter=220, sparseness='data', init='nndsvd')
    W = nmf.fit_transform(matrix)
    H = nmf.components_
    print "Factorization took %s minutes" % (round((time() - t) / 60., 2))
    return W, H, nmf
def nmf(self, k):
    nmf = ProjectedGradientNMF(n_components=k, max_iter=200)
    P = nmf.fit_transform(self.tdm)
    Q = nmf.components_.T
    self.P = P
    self.Q = Q
    self.er = nmf.reconstruction_err_
    #print "\tError: ", self.er
    return P, Q
def calcNMF(delta_data, components):
    data = preprocess(delta_data)
    nmf = ProjectedGradientNMF(n_components=components)
    x_nmf = nmf.fit_transform(data['cleanMatrix'])
    nmf_fill = np.ones((delta_data.shape[0], components)) * np.nan
    nmf_fill[data['cleanind']] = x_nmf
    nmf_weights = nmf.components_.T
    delta_nmf = {'transform': nmf_fill,
                 'weights': nmf_weights,
                 }
    return delta_nmf
def __nmf_initialization(A, ncomms):
    try:
        from sklearn.decomposition import ProjectedGradientNMF
    except ImportError:
        print("sklearn module is missing.")
        return
    model = ProjectedGradientNMF(n_components=ncomms, init='nndsvd')
    Uin = np.asmatrix(model.fit_transform(A))
    Vin = np.asmatrix(model.components_)
    Vin = Vin.T
    init_dict = {'U': Uin, 'V': Vin}
    return init_dict
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
    """Non-negative sparse dictionary learning from 2D spectrogram patches

    initialization:
        patch_size=(12,12) - size of time-frequency 2D patches in spectrogram units (freq,time)
        max_samples=1000000 - if num audio patches exceeds this threshold, randomly sample spectrum
    """
    def __init__(self, patch_size=(12, 12), max_samples=1000000):
        self.patch_size = patch_size
        self.max_samples = max_samples
        self.D = None
        self.data = None
        self.components = None
        self.zscore = False
        self.log_amplitude = False

    def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
        """Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data

        inputs:
            X - spectrogram data (frequency x time)
            n_components - how many components to extract [16]
            log_amplitude - whether to apply log amplitude scaling log(1+X)
            **nmf_args - keyword arguments for ProjectedGradientNMF(...) [None]
        outputs:
            self.data - 2D patches of input spectrogram
            self.D.components_ - dictionary of 2D NMF components
        """
        zscore = False
        self._extract_data_patches(X, zscore, log_amplitude)
        self.n_components = n_components
        nmf_args.setdefault('sparseness', 'components')
        nmf_args.setdefault('init', 'nndsvd')
        nmf_args.setdefault('beta', 0.5)
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **nmf_args)
        self.model.fit(self.data)
        self.D = self.model

    def reconstruct_spectrum(self, w=None, randomize=False):
        "reconstruct by fitting current NMF 2D dictionary to self.data"
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
def test_projgrad_nmf_sparseness():
    # Test sparseness
    # Test that sparsity constraints actually increase sparseness in the
    # part where they are applied.
    tol = 1e-2
    A = np.abs(random_state.randn(10, 10))
    m = ProjectedGradientNMF(n_components=5, random_state=0, tol=tol).fit(A)
    data_sp = ProjectedGradientNMF(n_components=5, sparseness='data',
                                   random_state=0, tol=tol).fit(A).data_sparseness_
    comp_sp = ProjectedGradientNMF(n_components=5, sparseness='components',
                                   random_state=0, tol=tol).fit(A).comp_sparseness_
    assert_greater(data_sp, m.data_sparseness_)
    assert_greater(comp_sp, m.comp_sparseness_)
def reducedim_nmf(self, factors):
    print "Number of factors is " + str(factors)
    model = ProjectedGradientNMF(n_components=factors, init='random', random_state=0)
    self.reducedmatrix = model.fit_transform(self.fullmatrix)  # left factor w (n*k)
    h = model.components_  # right factor h (k*d)
    if self.testing:
        print self.fullmatrix
        print self.reducedmatrix
        print h
        v = numpy.dot(self.reducedmatrix, h)
        print v
    print "Completed NMF routine"
    for vector in self.vectordict.values():
        vector.array = sparse.csc_matrix(self.reducedmatrix[vector.rowindex])
    print "Stored individual vectors"
def train_model(self):
    print 'begin'
    RATE_MATRIX = np.zeros((9238, 7973))
    for line in self.train.values:
        print line
        uid = int(float(line[1]))
        iid = int(float(line[2]))
        RATE_MATRIX[uid][iid] = int(float(line[3]))
    V = spr.csr_matrix(RATE_MATRIX)
    model = ProjectedGradientNMF(n_components=self.n_features, max_iter=1000, nls_max_iter=10000)
    self.pu = model.fit_transform(V)
    self.qi = model.fit(V).components_.transpose()
    print model.reconstruction_err_
    self.ValidateF1()
    t = pd.DataFrame(np.array(self.pu))
    t.to_csv('50pu')
    t = pd.DataFrame(np.array(self.qi))
    t.to_csv('50qi')
    print("model generation over")
def recommend(matrix_3filled, matrix_raw, user, numOfNeighbors=5):
    # The following 3 lines use Scikit-learn. For more information, refer to the documentation link in README.
    model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
    model.fit(matrix_3filled)
    # transformed matrix is the result of non-negative matrix factorization, and we will use this for the recommendations
    transformed = np.dot(model.fit_transform(matrix_3filled), model.components_)
    neighbors = []
    # Calculate distances from the current user to every other user.
    distances = np.sum((transformed - transformed[user]) ** 2, axis=1)
    # Find nearest neighbors.
    for x in xrange(numOfNeighbors):
        distances[np.argmin(distances)] = sys.float_info.max
        neighbors.append(np.argmin(distances))
    # Get an average for nearest neighbors. average is a vector containing the average rating for each humor.
    average = [0.0] * transformed.shape[1]
    for x in xrange(numOfNeighbors):
        average += transformed[neighbors[x]]
    average = average / numOfNeighbors
    # Find the unrated items for the current user.
    unratedItems = []
    for x in xrange(np.shape(matrix_raw)[1]):
        if matrix_raw[user][x] == 0:
            unratedItems.append(x)
    # If there are no unrated items, just return an item with max average rating.
    if len(unratedItems) == 0:
        item = np.argmax(average)
        return item
    # Else, return an unrated item with max average rating.
    else:
        maxAverage = 0
        item = np.argmax(average)
        for x in xrange(len(unratedItems)):
            if average[unratedItems[x]] > maxAverage:
                maxAverage = average[unratedItems[x]]
                item = unratedItems[x]
        return item
def matrixFactorization(inmatrix, p_components=False):
    from sklearn.decomposition import PCA
    from sklearn.decomposition import ProjectedGradientNMF
    import pdb
    if p_components:
        p_comp = p_components
    else:
        pca = PCA(n_components=inmatrix.shape[1])
        pca.fit(inmatrix)
        explained_variance = pca.explained_variance_ratio_.cumsum()
        explained_variance = explained_variance[explained_variance <= .9]
        p_comp = len(explained_variance)
    model = ProjectedGradientNMF(n_components=p_comp, init='nndsvd', beta=1, sparseness=None)
    #pdb.set_trace()
    model.fit(inmatrix)
    return model
class NMFSpectrum(SparseApproxSpectrum):
    def __init__(self, **kwargs):
        SparseApproxSpectrum.__init__(self, **kwargs)

    def extract_codes(self, X, **kwargs):
        self.standardize = False
        self._extract_data_patches(X)
        kwargs.setdefault('sparseness', 'components')
        kwargs.setdefault('init', 'nndsvd')
        kwargs.setdefault('beta', 0.5)
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
        self.model.fit(self.data)
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
def decomposition(V, W, H, n_components, solver='mu', update_H=True):
    if solver != 'project':
        W, H, _ = non_negative_factorization(V, W=W, H=H, n_components=n_components,
                                             update_H=update_H, max_iter=1000, solver=solver)
        # regularization='transformation', l1_ratio=0.1)
    else:
        model = ProjectedGradientNMF(n_components=n_components, init='random', random_state=0,
                                     sparseness='data', beta=0, max_iter=100000)
        model.fit(V)
        H = model.components_
        W = model.fit_transform(V)
    return W, H
def _nmf_fixed_component(self, i, X):
    """
    Uses sklearn to make the non-negative factorization
    input: i, number of clusters for this NMF instance
    author: Arthur Desjardins
    """
    model = ProjectedGradientNMF(n_components=i, init='nndsvd')
    model.fit(X)
    # H-matrix (clusters x words)
    H = model.components_
    # W-matrix (documents x clusters)
    W = model.transform(X)
    # word matrix
    words = open(attributFile).read().split()
    # extremely basic cluster processing
    most_relevant_words = np.argmax(H, axis=1)
    docs_per_cluster = [0] * i
    for tweet in W:
        most_relevant_cluster = np.argmax(tweet)
        docs_per_cluster[most_relevant_cluster] += 1
    # note: the generator's loop variable shadows the parameter i inside the expression
    clusters = dict(((words[most_relevant_words[i]], docs_per_cluster[i]) for i in range(0, i)))
    return clusters
def _nonNegativeFactorization(self):
    """
    Uses sklearn to make the non-negative factorization
    """
    print 'Loading data..'
    X = np.asmatrix(np.loadtxt(dataFile))
    print 'Data loaded. Making model..'
    model = ProjectedGradientNMF(init='nndsvd')
    print 'Fitting model..'
    model.fit(X)
    print 'Model fit'
    print 'Error rate is', model.reconstruction_err_
    # H-matrix
    outFile1 = open(factoredHMatrix, 'w')
    np.savetxt(outFile1, model.components_, fmt='%i')
    outFile1.close()
    # W-matrix
    outFile2 = open(factoredWMatrix, 'w')
    np.savetxt(outFile2, model.transform(X), fmt='%i')
    outFile2.close()
def perform_nmf(X, w_dir):
    # factorize composition into components
    print "Performing NMF..."
    n_com = 48
    model = ProjectedGradientNMF(n_components=n_com, sparseness='data', beta=1, eta=0.9,
                                 tol=0.000001, max_iter=2000, nls_max_iter=5000, random_state=None)
    model.fit(X)
    print model.reconstruction_err_
    nmf_components = model.components_
    print "done."
    # visualize Base Rules
    # nmf_components = project_data(nmf_components)
    f_name = w_dir + "base_rules_48.png"
    visualize_base_rules(nmf_components, n_com, f_name)
    return model
class NMFpredictor(Predictor):
    def __init__(self, model, beta=1, eta=0.1, init='nndsvd', max_iter=500,
                 n_components=100, nls_max_iter=2000, random_state=0,
                 sparseness=None, tol=0.0001):
        self.check_non_negtive(model)
        self.model = model
        super(NMFpredictor, self).__init__()
        self.nmf = ProjectedGradientNMF(beta=beta, eta=eta, init=init, max_iter=max_iter,
                                        n_components=n_components, nls_max_iter=nls_max_iter,
                                        random_state=random_state, sparseness=sparseness, tol=tol)
        self.user_latent_M, self.item_latent_M = self.construct_latent_matrics()

    def construct_latent_matrics(self):
        start = time.time()
        data_matrix = self.model.get_data_matrix()
        user_latent_M = self.nmf.fit_transform(data_matrix)
        item_latent_M = self.nmf.components_
        print "use time: ", time.time() - start
        return user_latent_M, item_latent_M

    def predict(self, user_id, item_id):
        user_no = self.model.user_id_to_no[user_id]
        item_no = self.model.item_id_to_no[item_id]
        pref = np.dot(self.user_latent_M[user_no, :], self.item_latent_M[:, item_no])
        if pref > self.model.max_pref:
            pref = self.model.max_pref
        if pref < self.model.min_pref:
            pref = self.model.min_pref
        return pref

    def check_non_negtive(self, model):
        if model.min_pref < 0:
            raise NotImplementedError("non_negtive!")
import numpy
from pymongo import MongoClient
from sklearn.decomposition import ProjectedGradientNMF

client = MongoClient('mongodb://localhost:27017/')
mydb = client['movie_database']

movies = mydb.movies.find()
i = 1
for movie in movies:
    print str(i) + " >> " + movie.get("title") + "--" + movie.get("_id")
    i = i + 1

users = mydb.users.find()
i = 1
for user in users:
    print str(i) + " >>" + user.get("_id") + "--" + user.get("password")

activities = mydb.activity.find()
i = 1
for activity in activities:
    print str(i) + " >>" + str(activity)

A = numpy.random.uniform(size=[40, 30])
nmf_model = ProjectedGradientNMF(n_components=5, init='random', random_state=0)
W = nmf_model.fit_transform(A)
H = nmf_model.components_
print W
print H
fr = frame.drop('Email', 1)  # NMF will not use email or total score
fr = fr.drop('Total Score', 1)
feature_names = fr.columns
X = np.array(fr.astype(float))

'''for i in range(60):  # Test error as a function of number of topics
    model = ProjectedGradientNMF(n_components=i, init='nndsvda', random_state=0, max_iter=500)
    model.fit(X)
    print (i, model.reconstruction_err_)'''

# Perform the NMF
model = ProjectedGradientNMF(n_components=11, init='nndsvda', random_state=0, max_iter=500)
Xtrans = model.fit_transform(X)

# Print the rubric items with strongest contribution in topics
for topic_idx, topic in enumerate(model.components_):
    sorte = np.sort(topic)[::-1]
    sorteargs = np.argsort(topic)[::-1]
    i = 0
    print("Topic #%d:" % topic_idx)
    while sorte[i] > 1.5:  # Only show things where contribution is large (1.5 is arbitrary)
        print feature_names[sorteargs[i]], np.mean(np.transpose(X)[sorteargs[i]]) / ptvals[feature_names[sorteargs[i]]]
def select_features_nmf(train_X, train_y, test_X, k):
    selector = ProjectedGradientNMF(n_components=k, init='nndsvd', random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
def main():
    es_client = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])
    index_name = "slclusters"
    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index=index_name, ignore=[400, 404]))
    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index=index_name))

    import re
    rr = re.compile(r"[\w']+")
    tok = lambda a: rr.findall(a)

    ff1 = open('../docker/syslog.csv').readlines()
    aa = []
    for d in ff1:
        #print(d)
        try:
            aa.append(json.loads(d))
        except:
            continue
    print(len(aa))
    # ff='\n'.join(ff1)

    docs = []
    other = []
    # aa=json.loads(ff)
    #print(aa)
    for iii, row in enumerate(aa):
        if len(tok(row['syslog_message'])) > 3:
            doc = {}
            doc['created_at'] = datetime.strptime(row["@timestamp"], "%Y-%m-%dT%H:%M:%S.000Z")
            doc['text'] = row['syslog_message']
            docs.append(doc['text'])
            other.append(doc['created_at'])
            print(doc['text'])
            print(tok(doc['text']))
            print()
        if len(docs) >= 100000:
            break

    cv = CountVectorizer(tokenizer=tok, max_df=0.5, min_df=5)
    # for iii,t in enumerate(tc):
    #     print(iii,t)
    #     if iii>100:
    #         break
    M = cv.fit_transform(docs).astype(np.float)
    M2 = Normalizer(copy=False).fit_transform(M)

    km = KMeans(n_clusters=30, init='k-means++', max_iter=200, n_init=5, verbose=True)
    km.fit_transform(M2)
    clusters = km.labels_
    sortInds = [i[0] for i in sorted(enumerate(clusters), key=lambda x: x[1])]

    nmf = ProjectedGradientNMF(n_components=30)
    M3 = nmf.fit_transform(M2)
    print(M3.shape)

    tDict = {}
    maxInd = 0
    esDocs = []
    for iii in sortInds:
        dd = {}
        dd['message'] = docs[iii]
        dd['cluster'] = int(clusters[iii])
        c2 = tuple(np.argsort(M3[iii, :])[-1:])
        if c2 in tDict:
            cc = tDict[c2]
        else:
            cc = maxInd
            tDict[c2] = maxInd
            maxInd = maxInd + 1
        dd['cluster2'] = cc
        dd['created_at'] = other[iii]
        esDocs.append(dd)
        #print(clusters[iii],other[iii],other[iii])

    res = helpers.bulk(es_client, esDocs, index=index_name, doc_type='syslogmsg', refresh=True)
def main():
    es_client = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])
    index_name = "twclusters"
    if es_client.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        print(es_client.indices.delete(index=index_name, ignore=[400, 404]))
    print("creating '%s' index..." % (index_name))
    print(es_client.indices.create(index=index_name))

    from tokenizers import tokenize_nor, get_nor_stopwords
    tok = lambda a: tokenize_nor(a, get_nor_stopwords())

    docs = []
    other = []
    conn = sqlite3.connect('../data/tweets.sqlite')
    cur = conn.execute('select * from T')
    for iii, row in enumerate(cur):
        doc = {}
        doc['_id'] = row[3]
        doc['created_at'] = datetime.strptime(row[0], "%Y-%m-%d %H:%M:%S")
        doc['author_id'] = row[1]
        doc['text'] = row[4]
        doc['language'] = row[5]
        if len(tok(doc['text'])) > 2:
            docs.append(doc['text'])
            other.append((doc['created_at'], doc['author_id']))
        if len(docs) >= 100000:
            break
    cur.close()

    cv = CountVectorizer(tokenizer=tok, max_df=0.5, min_df=5)
    # for iii,t in enumerate(tc):
    #     print(iii,t)
    #     if iii>100:
    #         break
    M = cv.fit_transform(docs).astype(np.float)
    M2 = Normalizer(copy=False).fit_transform(M)

    km = KMeans(n_clusters=20, init='k-means++', max_iter=200, n_init=5, verbose=True)
    km.fit_transform(M2)
    clusters = km.labels_
    sortInds = [i[0] for i in sorted(enumerate(clusters), key=lambda x: x[1])]

    nmf = ProjectedGradientNMF(n_components=10)
    M3 = nmf.fit_transform(M2)
    print(M3.shape)

    tDict = {}
    maxInd = 0
    esDocs = []
    for iii in sortInds:
        dd = {}
        dd['tweet'] = docs[iii]
        dd['cluster'] = int(clusters[iii])
        c2 = tuple(np.argsort(M3[iii, :])[-2:])
        if c2 in tDict:
            cc = tDict[c2]
        else:
            cc = maxInd
            tDict[c2] = maxInd
            maxInd = maxInd + 2
        dd['cluster2'] = cc
        # other holds (created_at, author_id) tuples
        dd['created_at'] = other[iii][0]
        dd['author_id'] = other[iii][1]
        esDocs.append(dd)
        #print(clusters[iii],other[iii],other[iii])

    res = helpers.bulk(es_client, esDocs, index=index_name, doc_type='tweet', refresh=True)
et = ExtraTreesClassifier()
ab = AdaBoostClassifier()
clf2 = svm.LinearSVC(penalty='l1', loss='l2', C=100, dual=False)
clf = svm.SVC(kernel='rbf')
logreg = linear_model.LogisticRegression(C=100, penalty='l2')
knn = KNeighborsClassifier(n_neighbors=5)
sgdc = SGDClassifier()
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
prcp = Perceptron()

rbm = BernoulliRBM(random_state=0, verbose=True)
rbm.learning_rate = 0.02
rbm.n_iter = 20
rbm.n_components = 1000

NMF = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
PCA = PCA()
LDA = LDA()
#ICA = ICA()

classifier = Pipeline(steps=[('rbm', rbm), ('logreg', logreg)])

file_handler_features = open('feature_vectors_heroes.csv', 'r')


def unique(training_data, test_data):
    for item in training_data:
        if item in test_data:
            print 'Item in test'


def hold_out(training_data, results):
    model = ProjectedGradientNMF(n_components=latentFactorNum, init='nndsvd', tol=tol, max_iter=max_iter)
    print "nnmf start:", datetime.now()
    W = model.fit_transform(mat)  # fit_transform returns W only
    H = model.components_
    print "nnmf end:", datetime.now()
    return W, H


if __name__ == "__main__":
    if len(argv) != 3:
        print "usage:", argv[0], "datafile_prefix threshold"
    else:
        t, users = load_index_map(argv[1] + ".user")
        t, brands = load_index_map(argv[1] + ".brand")
        clickMat = convert(argv[1] + ".clk.lbm", len(users), len(brands))
        buyMat = convert(argv[1] + ".buy.lbm", len(users), len(brands))
        testUCMat = convert("data/8.clk.lbm", len(users), len(brands)).todense()
        testUBMat = convert("data/8.clk.lbm", len(users), len(brands)).todense()
        model = ProjectedGradientNMF(n_components=50, init='nndsvd', tol=1e-8, max_iter=1000)
        print "nnmf start:", datetime.now()
        #W,H=model.fit_transform(clickMat);
        W = model.fit_transform(buyMat)
        H = model.components_
        print "nnmf end:", datetime.now()
        Y = np.dot(W, H)
        # prediction
        #cuMat=np.transpose(clickMat).todense();
        #cbMat=cuMat.dot(buyMat.todense());
        #buyPredict=np.dot(Y,cbMat);
        buyPredict = Y
        #print "error=",norm(clickMat-Y);
        fout = open("/tmp/score", "w")
        for i in range(len(users)):
            content = users[i]
            for j in range(len(brands)):
def nmf(mat, latentFactorNum=50, tol=1e-8, max_iter=1000):
    model = ProjectedGradientNMF(n_components=latentFactorNum, init='nndsvd', tol=tol, max_iter=max_iter)
    print "nnmf start:", datetime.now()
    W = model.fit_transform(mat)  # fit_transform returns W only; H comes from components_
    H = model.components_
    print "nnmf end:", datetime.now()
    return W, H
# Split into training and test
#answers_train, answers_test, cats_train, cats_test = train_test_split(answers, cats, test_size=0.3)#, random_state=42)

# Word counts
count_vect = CountVectorizer(stop_words='english')
answers_train = count_vect.fit_transform(answers_train)
answers_test = count_vect.transform(answers_test)

# Tf-idf
tfidf_transformer = TfidfTransformer()
answers_train = tfidf_transformer.fit_transform(answers_train)
answers_test = tfidf_transformer.transform(answers_test)

# NMF fit on training set
print("Fitting NMF on training word count matrix with shape" + str(answers_train.shape))
nmf = ProjectedGradientNMF(n_components=100, max_iter=200)
answers_train = nmf.fit_transform(answers_train)
answers_test = nmf.transform(answers_test)

# Fit SVM classifier
print("Fitting SVM classifier on matrix with shape" + str(answers_train.shape))
svc = svm.LinearSVC()
svc.fit(answers_train, cats_train)
print("SVM train classification %: " + str(svc.score(answers_train, cats_train) * 100))
print("SVM test classification %: " + str(svc.score(answers_test, cats_test) * 100))
mc_label = Counter(cats_train).most_common(1)[0][0]
print("Best guess % = " + str(float(Counter(cats_test)[mc_label]) / len(cats_test) * 100))

# Metrics
np.set_printoptions(linewidth=200, precision=3)
Cn = local_correlations(Y)
plt1 = plt.imshow(Cn, interpolation='none')
plt.colorbar()
plt.scatter(x=center[:, 1], y=center[:, 0], c='m', s=40)
plt.axis((-0.5, d2 - 0.5, -0.5, d1 - 0.5))
plt.gca().invert_yaxis()
#%%
crd = plot_contours(coo_matrix(Ain[:, ::-1]), Cn, thr=0.9)
#%%
active_pixels = np.squeeze(np.nonzero(np.sum(Ain, axis=1)))
Yr = np.reshape(Y, (d1 * d2, T), order='F')
p = 2
P = arpfit(Yr, p=1, pixels=active_pixels)
Y_res = Yr - np.dot(Ain, Cin)
model = ProjectedGradientNMF(n_components=1, init='random', random_state=0)
model.fit(np.maximum(Y_res, 0))
fin = model.components_.squeeze()
#%%
t1 = time()
A, b, Cin = update_spatial_components(Yr, Cin, fin, Ain, d1=d1, d2=d2, sn=P['sn'],
                                      dist=2, max_size=8, min_size=3)
t_elSPATIAL = time() - t1
#%%
crd = plot_contours(A, Cn2, thr=0.9, cmap=pl.cm.gray)
#%%
t1 = time()
C, f, Y_res, Pnew = update_temporal_components(Yr, A, b, Cin, fin, ITER=2, deconv_method='spgl1')
t_elTEMPORAL2 = time() - t1
#%%
t1 = time()
import numpy as np
X = np.array([[1, 1, 2, 3], [2, 1, 4, 5], [3, 2, 4, 5],
              [4, 1, 2, 1], [5, 4, 3, 1], [6, 1, 4, 3]])

from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#                     n_components=2, nls_max_iter=2000, random_state=0, sparseness=None,
#                     tol=0.0001)
print model.components_
#array([[ 0.77032744,  0.11118662],
#       [ 0.38526873,  0.38228063]])
print model.reconstruction_err_
#0.00746...

W = model.fit_transform(X)
H = model.components_
print 'w: ' + str(W)
print 'h: ' + str(H)

model = ProjectedGradientNMF(n_components=2, sparseness='components', init='random', random_state=0)
print model.fit(X)
#ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
#                     n_components=2, nls_max_iter=2000, random_state=0,
#                     sparseness='components', tol=0.0001)
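# Added sketch (not from the original snippets): ProjectedGradientNMF was
# deprecated and later removed from scikit-learn, so on recent versions the
# example above can be reproduced with the plain NMF estimator instead. The
# parameters shown are the current NMF ones; exact defaults and the resulting
# factors depend on the installed scikit-learn version.
import numpy as np
from sklearn.decomposition import NMF

X = np.array([[1, 1, 2, 3], [2, 1, 4, 5], [3, 2, 4, 5],
              [4, 1, 2, 1], [5, 4, 3, 1], [6, 1, 4, 3]])
model = NMF(n_components=2, init='random', random_state=0, max_iter=500)
W = model.fit_transform(X)   # left factor (6 x 2)
H = model.components_        # right factor (2 x 4)
print(model.reconstruction_err_)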
global indexes
parser = argparse.ArgumentParser(description='Compute Non-negative Matrix Factorization')
parser.add_argument('data_matrix', help='path to data file, should be readable by numpy')
parser.add_argument('k', type=int, help='number of components to keep')
parser.add_argument('feature_list', help='path to file containing list of feature names')
parser.add_argument('index_file', help='path to array_index for this dataset')
args = vars(parser.parse_args())

data = np.loadtxt(args['data_matrix'])
k = args['k']
with open(args['feature_list']) as f:
    feature_list = map(str.rstrip, f.readlines())
indexes = np.loadtxt(args['index_file'])

model = ProjectedGradientNMF(n_components=k, init='random', random_state=0)
H = model.fit_transform(data)  # H is submissions (rows) by factors (cols)
W = model.components_          # W is factors (rows) by features (cols)
magnitude = np.prod([np.sum(H, axis=0), np.sum(W, axis=1)], axis=0)

savetxt_3d(np.array(sort_by_row(W))[:, 0:20, :], 'nmf/factors_and_sorted_features.np', "factor")
show_feature_name('nmf/factors_and_sorted_features.np', feature_list)

subs_and_sorted_factors = sort_by_row(H)
for sub in subs_and_sorted_factors:
    for factor in sub:
        factor[0] += 1
savetxt_3d(subs_and_sorted_factors, 'nmf/subs_and_sorted_factors.np', "submission")

print "\n-------------- pattern of dominating factors ----------------\n"
import numpy as np
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])

from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=10, init='random', random_state=0)
model.fit(X)
print model.components_
U = X.dot(model.components_.T)
print U
print U.dot(model.components_)
model.reconstruction_err_

model = ProjectedGradientNMF(n_components=2, sparseness='components', init='random', random_state=0)
model.fit(X)
ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200,
                     n_components=2, nls_max_iter=2000, random_state=0,
                     sparseness='components', tol=0.0001)
model.components_
model.reconstruction_err_
#### THEIRS - not needed
# Example data matrix X

### MINE
X = DataFrame(matrix)
X_imputed = X.copy()
X = pa.DataFrame(matrix)  # DataFrame(toy_vals, index=range(nrows), columns=range(ncols))
### use some way to mask only a few vals.... that too either 0 or 1
msk = (X.values + np.random.randn(*X.shape) - X.values) < 0.8
X_imputed.values[~msk] = 0

## THEIRS
# Hiding values to test imputation
# Initializing model
nmf_model = ProjectedGradientNMF(n_components=600, init='nndsvda', random_state=0,
                                 max_iter=300, eta=0.01, alpha=0.01)
nmf_model.fit(X_imputed.values)

# iterate model
#while nmf_model.reconstruction_err_**2 > 10:
#nmf_model = NMF(n_components=600, init='nndsvda', random_state=0, max_iter=300, eta=0.01, alpha=0.01)
W = nmf_model.fit_transform(X_imputed.values)
X_imputed.values[~msk] = W.dot(nmf_model.components_)[~msk]
print nmf_model.reconstruction_err_

H = nmf_model.components_
rHat = np.dot(W, H)
np.savetxt("rHat.txt", rHat)
def driver_movie_data_test_sklearn(train_filename, test_filename, k):
    (A, movie_ids, user_ids, m_count, u_count) = read_data(train_filename)
    # Do nnmf
    #(U1,V1) = hack_nmf_iter(A,k,.07,16*A.nnz)
    model = ProjectedGradientNMF(n_components=k)
    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print A.shape
    print U1.shape
    print V1.shape
    # Read test data
    (A, movie_ids, user_ids, m_count, u_count) = read_data(test_filename, movie_ids, user_ids,
                                                           m_count, u_count, discard=True)
    (error, del_U, del_V, random_pairs) = evaluate_gradients(A, U1, V1, .07, 16 * A.nnz, hard=True)
    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)
    # Test on Ratings!
    outfile = open("test.sklearn.predictions", "w")
    print ("Doing %d test ratings" % A.nnz)
    (n, m) = A.shape
    for row in xrange(n):
        for row_col_index in xrange(A.indptr[row], A.indptr[row + 1]):
            col = A.indices[row_col_index]
            elt = A.data[row_col_index]
            print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row], reverse_user[col],
                                               nd.dot(U1[row, :], V1[:, col]))
    # Test on completely random pairs
    outfile = open("test.sklearn.rndpairs.predictions", "w")
    for n_pairs in xrange(1000):
        row = r.randint(0, n - 1)
        col = r.randint(0, m)
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row], reverse_user[col],
                                           nd.dot(U1[row, :], V1[:, col]))
    # Test on a difficult distribution that emphasizes non-rated pairs where movies and users
    # are chosen based on rating count.
    outfile = open("test.sklearn.hard.rndpairs.predictions", "w")
    for n_pairs in xrange(1000):
        i = r.randint(0, A.nnz - 1)
        row = find_index(A.indptr, i)
        j = r.randint(0, A.nnz - 1)
        col = A.indices[j]
        if (row > A.shape[0] - 1):
            print row, A.shape, "what is going on"
            continue
        if (col > A.shape[1] - 1):
            print col, A.shape, "what is going on"
            continue
        #print "shape,row,col", A.shape,row,col
        # if (A[row][col] > 0):
        #     continue
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row], reverse_user[col],
                                           nd.dot(U1[row, :], V1[:, col]))
    print ("test rmse", math.sqrt(error))
    for i in xrange(k):
        print ("Factor:", i)
        print_movie_factor(U1, reverse_movie, i)
    return (U1, V1, reverse_movie, reverse_user)
def filter_1sigma_nmf_new(dma, iter_date, df, header_df):
    print 'Get the 1-sigma filtered data'
    print df.shape[1]
    idx_vt = df.shape[1] - 1
    mean_viewtime = df[idx_vt].mean()
    std_viewtime = df[idx_vt].std()
    print mean_viewtime / 3600.0, std_viewtime / 3600.0

    reduced_df = df[(df[idx_vt] >= LOW_LIMIT) & (df[idx_vt] <= HIGH_LIMIT)].reset_index()
    print reduced_df.shape
    reduced_df[range(1, idx_vt)] = reduced_df[range(1, idx_vt)].div(1.0 * reduced_df[idx_vt], 'index')
    dev_id_list = reduced_df[0]

    reduced_df_vsum = reduced_df[range(1, idx_vt)].sum()
    reduced_df_vsum = reduced_df_vsum[reduced_df_vsum > 0.00]
    idx_list = reduced_df_vsum.index.tolist()
    reduced_df_1 = reduced_df[range(1, idx_vt)][reduced_df_vsum.index.tolist()]
    # Select the header accordingly
    reduced_header_df = header_df[idx_list]

    #program_viewtime_array = np.array(reduced_df[range(1,idx_vt)].astype(np.float))
    program_viewtime_array = np.array(reduced_df_1.astype(np.float))
    program_name_array = np.array(reduced_header_df)
    t_program_viewtime_array = program_viewtime_array.transpose()

    cluster_num = 14
    # Non-negative Matrix Factorization
    model = ProjectedGradientNMF(n_components=cluster_num, sparseness='data',
                                 init='nndsvd', max_iter=400, random_state=0)
    WW = model.fit_transform(t_program_viewtime_array)
    t_WW = WW.transpose()
    HH = model.components_
    t_HH = HH.transpose()
    #print t_HH.shape
    #print pd.DataFrame(t_HH).head()

    membership = [-1 for item in range(0, t_HH.shape[0])]
    # Assign the membership
    for i in range(0, t_HH.shape[0]):
        membership[i] = np.argmax(t_HH[i])

    dd = reduced_header_df
    print dd.shape
    print program_name_array.shape
    print program_viewtime_array.shape

    file = open('decompose_results_clusters_%s_%s_%s.csv' % (iter_date.month, iter_date.day, dma), 'w')
    file.write('Cluster_id,Dev_num,Household_num,Feature_val,Feature_fraction,Program_name\n')
    file.write('-1,%s,%s,,,\n' % (len(dev_id_list), get_household_num(dma, dev_id_list.tolist())))

    cluster_num = t_WW.shape[0]
    for i in range(0, cluster_num):
        dev_indices = [index for index, v in enumerate(membership) if v == i]
        dev_in_cluster = dev_id_list[dev_indices]
        dev_num = len(dev_in_cluster)
        household_num = get_household_num(dma, dev_in_cluster.tolist())
        #print heapq.nlargest(10,t_WW[i])
        feature_val = np.sort(t_WW[i])
        feature_val = feature_val[::-1]
        #print 't_WW:',t_WW[i]
        #print 'sorted t_WW:',feature_val
        val_sum = np.sum(feature_val)
        feature_frac = feature_val * 1.0 / val_sum
        accumulated_frac = 0
        cut_ind = 0
        for frac in feature_frac:
            accumulated_frac += frac
            cut_ind += 1
            if accumulated_frac > 0.6:
                break
        idx_list = np.argsort(t_WW[i])[::-1][:cut_ind]
        program_list = program_name_array[0][idx_list]
        for j in range(0, cut_ind):
            file.write('%s,%s,%s,%s,%s,%s\n' % (i, dev_num, household_num, feature_val[j],
                                                feature_frac[j], program_list[j]))
        #file.write(' '.join(program_name_array[0][idx_list]))
        #file.write('\n')
    file.close()

    #income_analysis(dma, dev_id_list, cluster_num, membership)
    #child_present_analysis(dma, dev_id_list, cluster_num, membership)
    #age_analysis(dma, dev_id_list, cluster_num, membership)
    clusters_obj = all_clusters(dma, cluster_num, dev_id_list, membership)
    return clusters_obj
row_info = train_data.iloc[row]
curr_q, curr_u, label = row_info[0], row_info[1], row_info[2]
#print curr_q, curr_u
question_index = question_list.index(curr_q)
user_index = expert_list.index(curr_u)
matrix[question_index][user_index] = label
#print matrix


# In[57]:

print 'running model...'
model = ProjectedGradientNMF(n_components=50, init='nndsvda', random_state=0,
                             max_iter=300, eta=0.01, alpha=0.01)
W = model.fit_transform(matrix)
H = model.components_
rHat = np.dot(W, H)
print 'recon error: ', model.reconstruction_err_
#np.savetxt("rHat.txt",rHat)
#pickle.dump(question_list, 'qList.txt')
# np.savetxt("qList.txt",question_list)
#np.savetxt( user_list,"uList.txt")


# In[61]:
LC = out.tolist()
X = []
Y = []
for i in LC:
    X.append(i[0])
    Y.append(i[1])
cpmC = pca.components_
for i in range(len(cpmC[1])):
    if cpmC[1][i] * cpmC[1][i] > 0.04:
        print app[i]
        print i

from sklearn.decomposition import ProjectedGradientNMF
pca = ProjectedGradientNMF(n_components=2)
out = pca.fit_transform(catBarr)
LC = out.tolist()
X = []
Y = []
Z = []
for i in LC:
    X.append(i[0])
    Y.append(i[1])
cpmC = pca.components_
lis1 = cpmC[0].tolist()
for i in range(len(lis1)):
genreMat4 = np.vstack(genreMat4)
print genreMat4

index = filmsbygenre['Action']
E = y[index, :]

### K-Means ###################
ans = raw_input("Start K-Means with Scikit ? ")
if ans != "y":
    exit()

from sklearn.cluster import MiniBatchKMeans, KMeans
km = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=1, init_size=1000,
                     batch_size=1000, verbose=1)
km2 = KMeans(n_clusters=k, init='k-means++', verbose=1)
y2 = km2.fit_transform(X)
topics5 = [[(km.cluster_centers_[l][i], feature_names[i])
            for i in np.argsort(-np.abs(km.cluster_centers_[l]))[:10]] for l in range(k)]
print topics5

### NMF #######################
ans = raw_input("Start NMF with Scikit ? ")
if ans != "y":
    exit()

from sklearn.decomposition import ProjectedGradientNMF
# BEWARE: THIS IS COMPUTATIONALLY INTENSIVE
nmf = ProjectedGradientNMF(n_components=k, max_iter=10, nls_max_iter=100)
nmf.fit(X)
topics6 = [[(nmf.components_[l][i], feature_names[i])
            for i in np.argsort(-np.abs(nmf.components_[l]))[:10]] for l in range(k)]
__author__ = 'juliewe'

import numpy as np
#import sklearn.decomposition.NMF as NMF
# implements C.J. Lin's projected gradient methods for NMF

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])  # n*d

from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=2, init='random', random_state=0)

w = model.fit_transform(X)  # left factor w (n*k)
h = model.components_       # right factor h (k*d)

print w
print h

v = np.dot(w, h)
print v
t1 = time()
Ain, Cin, center = greedyROI2d(Y, nr=nr, gSig=[4, 4], gSiz=[9, 9])
t_elGREEDY = time() - t1

#%% arpfit
active_pixels = np.squeeze(np.nonzero(np.sum(Ain, axis=1)))
Yr = np.reshape(Y, (d1 * d2, T), order='F')
p = 2
P = arpfit(Yr, p=2, pixels=active_pixels)

#%% nmf
Y_res = Yr - np.dot(Ain, Cin)
model = ProjectedGradientNMF(n_components=1, init='random', random_state=0)
model.fit(np.maximum(Y_res, 0))
fin = model.components_.squeeze()

#%% update spatial components
t1 = time()
A, b = update_spatial_components(Yr, Cin, fin, Ain, d1=d1, d2=d2, sn=P['sn'])
t_elSPATIAL = time() - t1

#%%
t1 = time()
C, f, Y_res, Pnew = update_temporal_components(Yr, A, b, Cin, fin, ITER=2)
t_elTEMPORAL1 = time() - t1