def test_projgrad_nmf_sparseness():
    # Test sparseness
    # Test that sparsity constraints actually increase sparseness in the
    # part where they are applied.
    tol = 1e-2
    A = np.abs(random_state.randn(10, 10))
    m = ProjectedGradientNMF(n_components=5, random_state=0, tol=tol).fit(A)
    data_sp = ProjectedGradientNMF(n_components=5, sparseness='data',
                                   random_state=0, tol=tol).fit(A).data_sparseness_
    comp_sp = ProjectedGradientNMF(n_components=5, sparseness='components',
                                   random_state=0, tol=tol).fit(A).comp_sparseness_
    assert_greater(data_sp, m.data_sparseness_)
    assert_greater(comp_sp, m.comp_sparseness_)
def get_cluster_membership(self):
    """Determine the cluster number that each sample is associated with."""
    model = ProjectedGradientNMF(n_components=self._num_clusters, init='random',
                                 beta=.3, eta=.5, max_iter=5000)
    w = model.fit_transform(self._matrix)
    h = model.components_
    # Convert the 'H' matrix, which holds the weights for our data matrix W, into
    # an array of cluster memberships: the index of the largest value in each
    # column of H is that sample's cluster.
    clusters = []
    model_width = len(h[0])
    for col_idx in range(model_width):
        max_val = dict()
        for row_idx in range(self._num_clusters):
            h_val = h[row_idx][col_idx]
            if not max_val or h_val > max_val['val']:
                max_val = {'row_idx': row_idx, 'val': h_val}
        clusters.append(max_val['row_idx'])
    # clusters array, w, h
    return (clusters, w, h)
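# The column-wise max search above is equivalent to a single NumPy argmax over axis 0.
# A minimal self-contained sketch with an arbitrary example matrix h (n_clusters x n_samples):
import numpy as np

h = np.array([[0.1, 0.9, 0.3],
              [0.7, 0.2, 0.5]])
clusters = np.argmax(h, axis=0).tolist()  # index of the largest entry in each column
print(clusters)  # [1, 0, 1]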
def init_rois(self, n_components=100, show=False):
    Ain, Cin, center = greedyROI2d(self.Y, nr=n_components, gSig=[2, 2],
                                   gSiz=[7, 7], use_median=False)
    Cn = np.mean(self.Y, axis=-1)
    if show:
        pl1 = pl.imshow(Cn, interpolation='none')
        pl.colorbar()
        pl.scatter(x=center[:, 1], y=center[:, 0], c='m', s=40)
        pl.axis((-0.5, self.Y.shape[1] - 0.5, -0.5, self.Y.shape[0] - 0.5))
        pl.gca().invert_yaxis()
    active_pixels = np.squeeze(np.nonzero(np.sum(Ain, axis=1)))
    Yr = np.reshape(self.Y, (self.Y.shape[0] * self.Y.shape[1], self.Y.shape[2]),
                    order='F')
    P = arpfit(Yr, p=2, pixels=active_pixels)
    Y_res = Yr - np.dot(Ain, Cin)
    model = ProjectedGradientNMF(n_components=1, init='random', random_state=0)
    model.fit(np.maximum(Y_res, 0))
    fin = model.components_.squeeze()
    self.Yr, self.Cin, self.fin, self.Ain, self.P, self.Cn = Yr, Cin, fin, Ain, P, Cn
def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
    """Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data

    inputs:
        X - spectrogram data (frequency x time)
        n_components - how many components to extract [16]
        log_amplitude - whether to apply log amplitude scaling log(1+X)
        **nmf_args - keyword arguments for ProjectedGradientNMF(...) [None]
    outputs:
        self.data - 2D patches of input spectrogram
        self.D.components_ - dictionary of 2D NMF components
    """
    zscore = False
    self._extract_data_patches(X, zscore, log_amplitude)
    self.n_components = n_components
    nmf_args.setdefault('sparseness', 'components')
    nmf_args.setdefault('init', 'nndsvd')
    nmf_args.setdefault('beta', 0.5)
    print "NMF..."
    self.model = ProjectedGradientNMF(n_components=self.n_components, **nmf_args)
    self.model.fit(self.data)
    self.D = self.model
def _nmf(X, K):
    nmf = ProjectedGradientNMF(n_components=K, max_iter=1000)
    nmf.fit(X)
    B = nmf.components_
    A = np.dot(X, np.linalg.pinv(B))
    return (A, B)
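# Minimal usage sketch for the _nmf helper above (assumes _nmf is in scope; the matrix
# shape and K are arbitrary). NMF requires a non-negative input.
import numpy as np

rng = np.random.RandomState(0)
X = np.abs(rng.randn(20, 8))
A, B = _nmf(X, K=3)                  # A: (20, 3) activations, B: (3, 8) components
print(np.linalg.norm(X - A.dot(B)))  # Frobenius reconstruction error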
def fit(self, trainSamples, trainTargets):
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    #print 'train user:' + str(self.dataModel.getUsersNum())
    V = self.dataModel.getData()
    model = ProjectedGradientNMF(n_components=self.factors, max_iter=1000,
                                 nls_max_iter=1000)
    # fit_transform already fits the model, so reuse it rather than refitting V a
    # second time just to read components_.
    self.pu = model.fit_transform(V)
    self.qi = model.components_.transpose()
def extract_codes(self, X, **kwargs):
    self.standardize = False
    self._extract_data_patches(X)
    kwargs.setdefault('sparseness', 'components')
    kwargs.setdefault('init', 'nndsvd')
    kwargs.setdefault('beta', 0.5)
    print("NMF...")
    self.model = ProjectedGradientNMF(n_components=self.n_components, **kwargs)
    self.model.fit(self.data)
    self.D = self.model
    return self
def __init__(self, model, beta=1, eta=0.1, init='nndsvd', max_iter=500,
             n_components=100, nls_max_iter=2000, random_state=0, sparseness=None,
             tol=0.0001):
    self.check_non_negtive(model)
    self.model = model
    super(NMFpredictor, self).__init__()
    self.nmf = ProjectedGradientNMF(beta=beta, eta=eta, init=init, max_iter=max_iter,
                                    n_components=n_components,
                                    nls_max_iter=nls_max_iter,
                                    random_state=random_state,
                                    sparseness=sparseness, tol=tol)
    self.user_latent_M, self.item_latent_M = self.construct_latent_matrics()
def __nmf_initialization(A, ncomms):
    try:
        from sklearn.decomposition import ProjectedGradientNMF
    except ImportError:
        print("sklearn module is missing.")
        return
    model = ProjectedGradientNMF(n_components=ncomms, init='nndsvd')
    Uin = np.asmatrix(model.fit_transform(A))
    Vin = np.asmatrix(model.components_)
    Vin = Vin.T
    init_dict = {'U': Uin, 'V': Vin}
    return init_dict
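# Minimal usage sketch for the initializer above (assumes __nmf_initialization is a
# module-level helper, so its name is not mangled, and that numpy is imported as np;
# the matrix below is an arbitrary non-negative stand-in for an adjacency matrix).
import numpy as np

rng = np.random.RandomState(0)
A = np.abs(rng.rand(15, 15))
init = __nmf_initialization(A, ncomms=4)
if init is not None:
    print(init['U'].shape)  # (15, 4)
    print(init['V'].shape)  # (15, 4)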
def matrixFactorization(inmatrix, p_components=False):
    from sklearn.decomposition import PCA
    from sklearn.decomposition import ProjectedGradientNMF
    import pdb
    if p_components:
        p_comp = p_components
    else:
        pca = PCA(n_components=inmatrix.shape[1])
        pca.fit(inmatrix)
        explained_variance = pca.explained_variance_ratio_.cumsum()
        explained_variance = explained_variance[explained_variance <= .9]
        p_comp = len(explained_variance)
    model = ProjectedGradientNMF(n_components=p_comp, init='nndsvd', beta=1,
                                 sparseness=None)
    #pdb.set_trace()
    model.fit(inmatrix)
    return model
def decomposition(V, W, H, n_components, solver='mu', update_H=True):
    if solver != 'project':
        W, H, _ = non_negative_factorization(V, W=W, H=H, n_components=n_components,
                                             update_H=update_H, max_iter=1000,
                                             solver=solver)
        #regularization='transformation', l1_ratio=0.1)
    else:
        model = ProjectedGradientNMF(n_components=n_components, init='random',
                                     random_state=0, sparseness='data', beta=0,
                                     max_iter=100000)
        model.fit(V)
        H = model.components_
        W = model.fit_transform(V)
    return W, H
def reducedim_nmf(self, factors):
    print "Number of factors is " + str(factors)
    model = ProjectedGradientNMF(n_components=factors, init='random', random_state=0)
    self.reducedmatrix = model.fit_transform(self.fullmatrix)  # left factor W (n*k)
    h = model.components_  # right factor H (k*d)
    if self.testing:
        print self.fullmatrix
        print self.reducedmatrix
        print h
        v = numpy.dot(self.reducedmatrix, h)
        print v
    print "Completed NMF routine"
    for vector in self.vectordict.values():
        vector.array = sparse.csc_matrix(self.reducedmatrix[vector.rowindex])
    print "Stored individual vectors"
def perform_nmf(X, w_dir):
    # factorize composition into components
    print "Performing NMF..."
    n_com = 48
    model = ProjectedGradientNMF(n_components=n_com, sparseness='data', beta=1,
                                 eta=0.9, tol=0.000001, max_iter=2000,
                                 nls_max_iter=5000, random_state=None)
    model.fit(X)
    print model.reconstruction_err_
    nmf_components = model.components_
    print "done."
    # visualize Base Rules
    # nmf_components = project_data(nmf_components)
    f_name = w_dir + "base_rules_48.png"
    visualize_base_rules(nmf_components, n_com, f_name)
    return model
et = ExtraTreesClassifier()
ab = AdaBoostClassifier()
clf2 = svm.LinearSVC(penalty='l1', loss='l2', C=100, dual=False)
clf = svm.SVC(kernel='rbf')
logreg = linear_model.LogisticRegression(C=100, penalty='l2')
knn = KNeighborsClassifier(n_neighbors=5)
sgdc = SGDClassifier()
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
prcp = Perceptron()
rbm = BernoulliRBM(random_state=0, verbose=True)
rbm.learning_rate = 0.02
rbm.n_iter = 20
rbm.n_components = 1000
NMF = ProjectedGradientNMF(n_components=2, init='random', random_state=0)
PCA = PCA()
LDA = LDA()
#ICA = ICA()
classifier = Pipeline(steps=[('rbm', rbm), ('logreg', logreg)])

file_handler_features = open('feature_vectors_heroes.csv', 'r')


def unique(training_data, test_data):
    for item in training_data:
        if item in test_data:
            print 'Item in test'


def hold_out(training_data, results):
####THEIRS - not needed
# Example data matrix X

###MINE
X = DataFrame(matrix)
X_imputed = X.copy()
X = pa.DataFrame(matrix)  # DataFrame(toy_vals, index=range(nrows), columns=range(ncols))
### use some way to mask only a few vals.... that too either 0 or 1
msk = (X.values + np.random.randn(*X.shape) - X.values) < 0.8
X_imputed.values[~msk] = 0

##THEIRS
# Hiding values to test imputation
# Initializing model
nmf_model = ProjectedGradientNMF(n_components=600, init='nndsvda', random_state=0,
                                 max_iter=300, eta=0.01, alpha=0.01)
nmf_model.fit(X_imputed.values)

# iterate model
#while nmf_model.reconstruction_err_**2 > 10:
#nmf_model = NMF(n_components=600, init='nndsvda', random_state=0, max_iter=300, eta=0.01, alpha=0.01)
W = nmf_model.fit_transform(X_imputed.values)
X_imputed.values[~msk] = W.dot(nmf_model.components_)[~msk]
print nmf_model.reconstruction_err_

H = nmf_model.components_
rHat = np.dot(W, H)
np.savetxt("rHat.txt", rHat)
import numpy
from pymongo import MongoClient
from sklearn.decomposition import ProjectedGradientNMF

client = MongoClient('mongodb://localhost:27017/')
mydb = client['movie_database']

movies = mydb.movies.find()
i = 1
for movie in movies:
    print str(i) + " >> " + movie.get("title") + "--" + movie.get("_id")
    i = i + 1

users = mydb.users.find()
i = 1
for user in users:
    print str(i) + " >>" + user.get("_id") + "--" + user.get("password")

activities = mydb.activity.find()
i = 1
for activity in activities:
    print str(i) + " >>" + str(activity)

A = numpy.random.uniform(size=[40, 30])
nmf_model = ProjectedGradientNMF(n_components=5, init='random', random_state=0)
W = nmf_model.fit_transform(A)
H = nmf_model.components_
print W
print H
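# Quick check (assuming A, W and H from the factorization above are in scope): how well
# the 5-component model reconstructs the random 40 x 30 matrix.
reconstruction = W.dot(H)                     # 40 x 30 approximation of A
print(numpy.linalg.norm(A - reconstruction))  # Frobenius reconstruction error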
LC = out.tolist()
X = []
Y = []
for i in LC:
    X.append(i[0])
    Y.append(i[1])
cpmC = pca.components_
for i in range(len(cpmC[1])):
    if cpmC[1][i] * cpmC[1][i] > 0.04:
        print app[i]
        print i

from sklearn.decomposition import ProjectedGradientNMF
pca = ProjectedGradientNMF(n_components=2)
out = pca.fit_transform(catBarr)
LC = out.tolist()
X = []
Y = []
Z = []
for i in LC:
    X.append(i[0])
    Y.append(i[1])
cpmC = pca.components_
lis1 = cpmC[0].tolist()
for i in range(len(lis1)):
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) print( "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(tfidf_matrix_train, km.labels_, sample_size=1000)) print() # Build a Latent Dirichlet Allocation Model lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online') lda_Z = lda_model.fit_transform(data_vectorized) print(lda_Z.shape) # (NO_DOCUMENTS, NO_TOPICS) pgnmf_model = ProjectedGradientNMF(n_components=NUM_TOPICS) pgnmf_z = pgnmf_model.fit_transform(data_vectorized) print(pgnmf_z.shape) # (NO_DOCUMENTS, NO_TOPICS) # Build a Non-Negative Matrix Factorization Model nmf_model = NMF(n_components=NUM_TOPICS) nmf_Z = nmf_model.fit_transform(data_vectorized) print(nmf_Z.shape) # (NO_DOCUMENTS, NO_TOPICS) # Build a Latent Semantic Indexing Model lsi_model = TruncatedSVD(n_components=NUM_TOPICS) lsi_Z = lsi_model.fit_transform(data_vectorized) print(lsi_Z.shape) # (NO_DOCUMENTS, NO_TOPICS) # Let's see how the first document in the corpus looks like in different topic spaces print(lda_Z[0])
# Split into training and test
#answers_train, answers_test, cats_train, cats_test = train_test_split(answers, cats, test_size=0.3)#, random_state=42)

# Word counts
count_vect = CountVectorizer(stop_words='english')
answers_train = count_vect.fit_transform(answers_train)
answers_test = count_vect.transform(answers_test)

# Tf-idf
tfidf_transformer = TfidfTransformer()
answers_train = tfidf_transformer.fit_transform(answers_train)
answers_test = tfidf_transformer.transform(answers_test)

# NMF fit on training set
print("Fitting NMF on training word count matrix with shape" + str(answers_train.shape))
nmf = ProjectedGradientNMF(n_components=100, max_iter=200)
answers_train = nmf.fit_transform(answers_train)
answers_test = nmf.transform(answers_test)

# Fit SVM classifier
print("Fitting SVM classifier on matrix with shape" + str(answers_train.shape))
svc = svm.LinearSVC()
svc.fit(answers_train, cats_train)
print("SVM train classification %: " + str(svc.score(answers_train, cats_train) * 100))
print("SVM test classification %: " + str(svc.score(answers_test, cats_test) * 100))
mc_label = Counter(cats_train).most_common(1)[0][0]
print("Best guess % = " + str(float(Counter(cats_test)[mc_label]) / len(cats_test) * 100))

# Metrics
np.set_printoptions(linewidth=200, precision=3)
if ans != "y": exit() from sklearn.cluster import MiniBatchKMeans, KMeans km = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=1) km2 = KMeans(n_clusters=k, init='k-means++', verbose=1) y2 = km2.fit_transform(X) topics5 = [[(km.cluster_centers_[l][i], feature_names[i]) for i in np.argsort(-np.abs(km.cluster_centers_[l]))[:10]] for l in range(k)] print topics5 ### NMF ####################### ans = raw_input("Start NMF with Scikit ? ") if ans != "y": exit() from sklearn.decomposition import ProjectedGradientNMF # BEWARE : THIS IS COMPUTATIONNALY INTENSIVE nmf = ProjectedGradientNMF(n_components=k, max_iter=10, nls_max_iter=100) nmf.fit(X) topics6 = [[(nmf.components_[l][i], feature_names[i]) for i in np.argsort(-np.abs(nmf.components_[l]))[:10]] for l in range(k)]
fr = frame.drop('Email', 1)  # NMF will not use email or total score
fr = fr.drop('Total Score', 1)
feature_names = fr.columns
X = np.array(fr.astype(float))

'''for i in range(60):  # Test error as a function of number of topics
    model = ProjectedGradientNMF(n_components=i, init='nndsvda', random_state=0, max_iter=500)
    model.fit(X)
    print (i, model.reconstruction_err_);'''

model = ProjectedGradientNMF(n_components=11, init='nndsvda', random_state=0,
                             max_iter=500)  # Perform the NMF
Xtrans = model.fit_transform(X)

for topic_idx, topic in enumerate(model.components_):
    # Print the rubric items with the strongest contribution to each topic
    sorte = np.sort(topic)[::-1]
    sorteargs = np.argsort(topic)[::-1]
    i = 0
    print("Topic #%d:" % topic_idx)
    while sorte[i] > 1.5:  # Only show items whose contribution is large (1.5 is arbitrary)
        print feature_names[sorteargs[i]], np.mean(
            np.transpose(X)[sorteargs[i]]) / ptvals[feature_names[sorteargs[i]]]
        i += 1  # move on to the next strongest item
def driver_movie_data_test_sklearn(train_filename, test_filename, k):
    (A, movie_ids, user_ids, m_count, u_count) = read_data(train_filename)
    # Do nnmf
    #(U1,V1) = hack_nmf_iter(A,k,.07,16*A.nnz)
    model = ProjectedGradientNMF(n_components=k)
    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print A.shape
    print U1.shape
    print V1.shape

    # Read test data
    (A, movie_ids, user_ids, m_count, u_count) = read_data(test_filename, movie_ids,
                                                           user_ids, m_count, u_count,
                                                           discard=True)
    (error, del_U, del_V, random_pairs) = evaluate_gradients(A, U1, V1, .07,
                                                             16 * A.nnz, hard=True)
    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)

    # Test on ratings!
    outfile = open("test.sklearn.predictions", "w")
    print("Doing %d test ratings" % A.nnz)
    (n, m) = A.shape
    for row in xrange(n):
        for row_col_index in xrange(A.indptr[row], A.indptr[row + 1]):
            col = A.indices[row_col_index]
            elt = A.data[row_col_index]
            print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row], reverse_user[col],
                                               nd.dot(U1[row, :], V1[:, col]))

    # Test on completely random pairs
    outfile = open("test.sklearn.rndpairs.predictions", "w")
    for n_pairs in xrange(1000):
        row = r.randint(0, n - 1)
        col = r.randint(0, m)
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row], reverse_user[col],
                                           nd.dot(U1[row, :], V1[:, col]))

    # Test on a difficult distribution that emphasizes non-rated pairs where movies
    # and users are chosen based on rating count.
    outfile = open("test.sklearn.hard.rndpairs.predictions", "w")
    for n_pairs in xrange(1000):
        i = r.randint(0, A.nnz - 1)
        row = find_index(A.indptr, i)
        j = r.randint(0, A.nnz - 1)
        col = A.indices[j]
        if (row > A.shape[0] - 1):
            print row, A.shape, "what is going on"
            continue
        if (col > A.shape[1] - 1):
            print col, A.shape, "what is going on"
            continue
        #print "shape,row,col", A.shape, row, col
        # if (A[row][col] > 0):
        #     continue
        print >> outfile, "%s,%s,%0.5f" % (reverse_movie[row], reverse_user[col],
                                           nd.dot(U1[row, :], V1[:, col]))

    print("test rmse", math.sqrt(error))
    for i in xrange(k):
        print("Factor:", i)
        print_movie_factor(U1, reverse_movie, i)
    return (U1, V1, reverse_movie, reverse_user)
import numpy as np
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])

from sklearn.decomposition import ProjectedGradientNMF
model = ProjectedGradientNMF(n_components=10, init='random', random_state=0)
model.fit(X)
print model.components_
U = X.dot(model.components_.T)
print U
print U.dot(model.components_)
model.reconstruction_err_

model = ProjectedGradientNMF(n_components=2, sparseness='components', init='random',
                             random_state=0)
model.fit(X)
ProjectedGradientNMF(beta=1, eta=0.1, init='random', max_iter=200, n_components=2,
                     nls_max_iter=2000, random_state=0, sparseness='components',
                     tol=0.0001)
model.components_
model.reconstruction_err_
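# ProjectedGradientNMF was deprecated and later removed from scikit-learn; on current
# releases the example above can be reproduced with sklearn.decomposition.NMF. A minimal
# sketch; the sparseness/eta/beta/nls_max_iter options have no direct NMF equivalent and
# are omitted here.
import numpy as np
from sklearn.decomposition import NMF

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
model = NMF(n_components=2, init='random', random_state=0, max_iter=200)
W = model.fit_transform(X)  # 6 x 2 activations
H = model.components_       # 2 x 2 components
print(W.dot(H))             # approximate reconstruction of X
print(model.reconstruction_err_)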
def filter_1sigma_nmf_new(dma, iter_date, df, header_df):
    print 'Get the 1-sigma filtered data'
    print df.shape[1]
    idx_vt = df.shape[1] - 1
    mean_viewtime = df[idx_vt].mean()
    std_viewtime = df[idx_vt].std()
    print mean_viewtime / 3600.0, std_viewtime / 3600.0
    reduced_df = df[(df[idx_vt] >= LOW_LIMIT) & (df[idx_vt] <= HIGH_LIMIT)].reset_index()
    print reduced_df.shape
    reduced_df[range(1, idx_vt)] = reduced_df[range(1, idx_vt)].div(
        1.0 * reduced_df[idx_vt], 'index')
    dev_id_list = reduced_df[0]
    reduced_df_vsum = reduced_df[range(1, idx_vt)].sum()
    reduced_df_vsum = reduced_df_vsum[reduced_df_vsum > 0.00]
    idx_list = reduced_df_vsum.index.tolist()
    reduced_df_1 = reduced_df[range(1, idx_vt)][reduced_df_vsum.index.tolist()]
    # Select the header accordingly
    reduced_header_df = header_df[idx_list]
    #program_viewtime_array = np.array(reduced_df[range(1,idx_vt)].astype(np.float))
    program_viewtime_array = np.array(reduced_df_1.astype(np.float))
    program_name_array = np.array(reduced_header_df)
    t_program_viewtime_array = program_viewtime_array.transpose()

    cluster_num = 14
    # Non-negative Matrix Factorization
    model = ProjectedGradientNMF(n_components=cluster_num, sparseness='data',
                                 init='nndsvd', max_iter=400, random_state=0)
    WW = model.fit_transform(t_program_viewtime_array)
    t_WW = WW.transpose()
    HH = model.components_
    t_HH = HH.transpose()
    #print t_HH.shape
    #print pd.DataFrame(t_HH).head()

    membership = [-1 for item in range(0, t_HH.shape[0])]
    # Assign the membership
    for i in range(0, t_HH.shape[0]):
        membership[i] = np.argmax(t_HH[i])

    dd = reduced_header_df
    print dd.shape
    print program_name_array.shape
    print program_viewtime_array.shape

    file = open('decompose_results_clusters_%s_%s_%s.csv'
                % (iter_date.month, iter_date.day, dma), 'w')
    file.write('Cluster_id,Dev_num,Household_num,Feature_val,Feature_fraction,Program_name\n')
    file.write('-1,%s,%s,,,\n'
               % (len(dev_id_list), get_household_num(dma, dev_id_list.tolist())))

    cluster_num = t_WW.shape[0]
    for i in range(0, cluster_num):
        dev_indices = [index for index, v in enumerate(membership) if v == i]
        dev_in_cluster = dev_id_list[dev_indices]
        dev_num = len(dev_in_cluster)
        household_num = get_household_num(dma, dev_in_cluster.tolist())
        #print heapq.nlargest(10, t_WW[i])
        feature_val = np.sort(t_WW[i])
        feature_val = feature_val[::-1]
        #print 't_WW:', t_WW[i]
        #print 'sorted t_WW:', feature_val
        val_sum = np.sum(feature_val)
        feature_frac = feature_val * 1.0 / val_sum
        accumulated_frac = 0
        cut_ind = 0
        for frac in feature_frac:
            accumulated_frac += frac
            cut_ind += 1
            if accumulated_frac > 0.6:
                break
        idx_list = np.argsort(t_WW[i])[::-1][:cut_ind]
        program_list = program_name_array[0][idx_list]
        for j in range(0, cut_ind):
            file.write('%s,%s,%s,%s,%s,%s\n'
                       % (i, dev_num, household_num, feature_val[j], feature_frac[j],
                          program_list[j]))
        #file.write(' '.join(program_name_array[0][idx_list]))
        #file.write('\n')
    file.close()

    #income_analysis(dma, dev_id_list, cluster_num, membership)
    #child_present_analysis(dma, dev_id_list, cluster_num, membership)
    #age_analysis(dma, dev_id_list, cluster_num, membership)
    clusters_obj = all_clusters(dma, cluster_num, dev_id_list, membership)
    return clusters_obj