class NMF(method.Method):
    """Dimensionality-reduction method backed by ``ProjectedGradientNMF``.

    The estimator is built once in the constructor from ``params`` and is
    then used for fitting, encoding and decoding.
    """

    def __init__(self, params):
        # Keep the raw keyword dict next to the estimator built from it.
        self.params = params
        self.dec = ProjectedGradientNMF(**params)

    def __str__(self):
        return "Non-Negative matrix factorization by Projected Gradient (NMF)"

    def train(self, data):
        """Fit the underlying NMF estimator.

        :param data: training data; per the original author's note it is
            expected to be already whitened and ready to use
        """
        estimator = self.dec
        estimator.fit(data)

    def encode(self, data):
        """Project ready-to-use data onto the learned components.

        :returns: result of the estimator's ``transform`` (the original
            note describes it as having dimension ``n_components``)
        """
        estimator = self.dec
        return estimator.transform(data)

    def decode(self, components):
        """Map encoded data back through the estimator.

        :returns: result of the estimator's ``inverse_transform`` (described
            by the original author as whitened reconstructed data)
        """
        estimator = self.dec
        return estimator.inverse_transform(components)
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
    """Non-negative dictionary learning on 2D spectrogram patches.

    Constructor arguments:
        patch_size=(12,12)  - size of the time-frequency 2D patches, in
                              spectrogram units (freq,time)
        max_samples=1000000 - if the number of audio patches exceeds this
                              threshold, the spectrum is randomly sampled
    """

    def __init__(self, patch_size=(12, 12), max_samples=1000000):
        self.patch_size = patch_size
        self.max_samples = max_samples
        # Nothing has been learned yet.
        self.D = None
        self.data = None
        self.components = None
        # Pre-processing flags start switched off.
        self.zscore = False
        self.log_amplitude = False

    def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
        """Learn a dictionary of 2D patch atoms from a spectrogram.

        inputs:
            X - spectrogram data (frequency x time)
            n_components - how many components to extract [16]
            log_amplitude - whether to apply log amplitude scaling log(1+X)
            **nmf_args - keyword arguments for ProjectedGradientNMF(...);
                'sparseness', 'init' and 'beta' get defaults when absent
        outputs:
            self.data - 2D patches of input spectrogram
            self.D.components_ - dictionary of 2D NMF components
        """
        # Z-scoring is always disabled for the non-negative variant.
        self._extract_data_patches(X, False, log_amplitude)
        self.n_components = n_components

        # Fill in only the solver options the caller did not specify.
        fallback_options = (
            ('sparseness', 'components'),
            ('init', 'nndsvd'),
            ('beta', 0.5),
        )
        for option, value in fallback_options:
            nmf_args.setdefault(option, value)

        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components,
                                          **nmf_args)
        self.model.fit(self.data)
        # self.D is an alias of the fitted model.
        self.D = self.model

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct by fitting the current NMF 2D dictionary to self.data.

        When ``w`` is None the codes are computed with the fitted model and
        cached on ``self.w`` before delegating to the parent class.
        """
        codes = w
        if codes is None:
            self.w = self.model.transform(self.data)
            codes = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=codes, randomize=randomize)
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
    """Non-negative sparse dictionary learning from 2D spectrogram patches

    initialization:
        patch_size=(12,12) - size of time-frequency 2D patches in spectrogram
                             units (freq,time)
        max_samples=1000000 - if num audio patches exceeds this threshold,
                              randomly sample spectrum
    """

    def __init__(self, patch_size=(12, 12), max_samples=1000000):
        self.patch_size = patch_size
        self.max_samples = max_samples
        self.D = None
        self.data = None
        self.components = None
        self.zscore = False
        self.log_amplitude = False

    def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
        """Given a spectrogram, learn a dictionary of 2D patch atoms

        inputs:
            X - spectrogram data (frequency x time)
            n_components - how many components to extract [16]
            log_amplitude - whether to apply log amplitude scaling log(1+X)
            **nmf_args - keyword arguments for ProjectedGradientNMF(...) [None]
        outputs:
            self.data - 2D patches of input spectrogram
            self.D.components_ - dictionary of 2D NMF components
        """
        # z-scoring is never applied in the non-negative variant
        zscore = False
        self._extract_data_patches(X, zscore, log_amplitude)
        self.n_components = n_components
        # caller-supplied solver options win; these are only fallbacks
        nmf_args.setdefault('sparseness', 'components')
        nmf_args.setdefault('init', 'nndsvd')
        nmf_args.setdefault('beta', 0.5)
        # Function-call form: same output on Python 2, valid on Python 3.
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components,
                                          **nmf_args)
        self.model.fit(self.data)
        # self.D aliases the fitted model
        self.D = self.model

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct by fitting current NMF 2D dictionary to self.data

        If w is None, the codes are computed with self.model.transform and
        stored in self.w before calling the parent implementation.
        """
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=w, randomize=randomize)
class NMFSpectrum(SparseApproxSpectrum):
    """Spectrum approximation that learns its dictionary with NMF."""

    def __init__(self, **kwargs):
        # All construction options are handled by the parent class.
        SparseApproxSpectrum.__init__(self, **kwargs)

    def extract_codes(self, X, **kwargs):
        """Fit a ProjectedGradientNMF model to the patches extracted from X.

        ``kwargs`` are forwarded to ProjectedGradientNMF; 'sparseness',
        'init' and 'beta' receive defaults when the caller omits them.
        ``self.n_components`` must already be set (presumably by
        SparseApproxSpectrum.__init__ -- confirm). Returns ``self``.
        """
        # Standardization is switched off before the patches are extracted.
        self.standardize = False
        self._extract_data_patches(X)

        fallback_options = (
            ('sparseness', 'components'),
            ('init', 'nndsvd'),
            ('beta', 0.5),
        )
        for option, value in fallback_options:
            kwargs.setdefault(option, value)

        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components,
                                          **kwargs)
        self.model.fit(self.data)
        # self.D is an alias of the fitted model.
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct via the parent class, computing codes when w is None.

        Freshly computed codes are cached on ``self.w``.
        """
        codes = w
        if codes is None:
            self.w = self.model.transform(self.data)
            codes = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=codes, randomize=randomize)
class NMFSpectrum(SparseApproxSpectrum):
    """SparseApproxSpectrum variant whose dictionary is learned with NMF."""

    def __init__(self, **kwargs):
        SparseApproxSpectrum.__init__(self, **kwargs)

    def extract_codes(self, X, **kwargs):
        """Extract patches from X and fit a ProjectedGradientNMF model.

        kwargs are passed to ProjectedGradientNMF; 'sparseness', 'init' and
        'beta' get default values when not supplied. Reads
        self.n_components, which this class does not set itself (presumably
        the parent constructor does -- confirm). Returns self.
        """
        self.standardize = False
        self._extract_data_patches(X)
        # caller-supplied solver options win; these are only fallbacks
        kwargs.setdefault('sparseness', 'components')
        kwargs.setdefault('init', 'nndsvd')
        kwargs.setdefault('beta', 0.5)
        # Function-call form: same output on Python 2, valid on Python 3.
        print("NMF...")
        self.model = ProjectedGradientNMF(n_components=self.n_components,
                                          **kwargs)
        self.model.fit(self.data)
        # self.D aliases the fitted model
        self.D = self.model
        return self

    def reconstruct_spectrum(self, w=None, randomize=False):
        """Reconstruct the spectrum through the parent implementation.

        If w is None, the codes are computed with self.model.transform and
        stored in self.w first.
        """
        if w is None:
            self.w = self.model.transform(self.data)
            w = self.w
        return SparseApproxSpectrum.reconstruct_spectrum(
            self, w=w, randomize=randomize)
def _nmf_fixed_component(self, i, X):
    """
    Uses sklearn to make the non negative factorization

    input: i, number of clusters for this NMF instance
           X, matrix to factorize (per the comments below, documents x words)
    output: dict mapping each cluster's most relevant word to the number of
            documents whose strongest cluster it is. Clusters that share the
            same most relevant word collapse into one key (the last one wins).

    author: Arthur Desjardins
    """
    model = ProjectedGradientNMF(n_components=i, init='nndsvd')
    model.fit(X)

    # H-matrix (clusters x words)
    H = model.components_
    # W-matrix (documents x clusters)
    W = model.transform(X)

    # word matrix: whitespace-separated vocabulary; the handle is closed
    # as soon as the content has been read
    with open(attributFile) as word_file:
        words = word_file.read().split()

    # processing extremely basic cluster bush
    most_relevant_words = np.argmax(H, axis=1)
    docs_per_cluster = [0] * i
    for tweet in W:
        most_relevant_cluster = np.argmax(tweet)
        docs_per_cluster[most_relevant_cluster] += 1

    # A separate loop name avoids reusing `i` as both bound and index.
    clusters = {
        words[most_relevant_words[cluster]]: docs_per_cluster[cluster]
        for cluster in range(i)
    }
    return clusters
def _nonNegativeFactorization(self):
    """
    Uses sklearn to make the non negative factorization

    Loads the matrix stored in `dataFile`, fits a ProjectedGradientNMF model
    with 'nndsvd' initialization, prints the reconstruction error and writes
    the two factors to `factoredHMatrix` (model.components_) and
    `factoredWMatrix` (model.transform(X)). Returns nothing.
    """
    print('Loading data..')
    X = np.asmatrix(np.loadtxt(dataFile))
    print('Data loaded. Making model..')
    model = ProjectedGradientNMF(init='nndsvd')
    print('Fitting model..')
    model.fit(X)
    print('Model fit')
    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('Error rate is %s' % (model.reconstruction_err_,))

    # NOTE(review): fmt='%i' writes every entry as an integer, which looks
    # like it truncates fractional factor values -- confirm this is intended.

    # H-matrix; `with` guarantees the close that the original's bare
    # `outFile1.close` (no call parentheses) never performed
    with open(factoredHMatrix, 'w') as h_file:
        np.savetxt(h_file, model.components_, fmt='%i')

    # W-matrix
    with open(factoredWMatrix, 'w') as w_file:
        np.savetxt(w_file, model.transform(X), fmt='%i')
def select_features_nmf(train_X, train_y, test_X, k):
    """Reduce train and test features to ``k`` NMF components.

    The factorization is fitted on ``train_X`` only and then applied to both
    sets. ``train_y`` is accepted but not used.

    :returns: tuple ``(transformed_train_X, transformed_test_X)``
    """
    nmf = ProjectedGradientNMF(n_components=k, init='nndsvd', random_state=42)
    nmf.fit(train_X)
    # Training data is transformed first, then the test data.
    return nmf.transform(train_X), nmf.transform(test_X)
def driver_movie_data_test_sklearn(train_filename, test_filename, k):
    """Factorize the training ratings with sklearn NMF and dump predictions.

    Fits ProjectedGradientNMF with `k` components on the matrix read from
    `train_filename`, evaluates it on the matrix read from `test_filename`
    and writes three prediction files into the current directory:

      test.sklearn.predictions               - every stored test entry
      test.sklearn.rndpairs.predictions      - 1000 uniformly random pairs
      test.sklearn.hard.rndpairs.predictions - 1000 pairs whose row and
          column are each drawn via a random stored entry

    Each output line is "<movie>,<user>,<prediction>" where the prediction
    is the dot product of the matching row of U1 and column of V1. The
    matrix is accessed through indptr / indices / nnz, i.e. it is used as
    a CSR-style sparse matrix.

    Returns (U1, V1, reverse_movie, reverse_user).
    """
    (A, movie_ids, user_ids, m_count, u_count) = read_data(train_filename)

    # Do nnmf
    #(U1,V1) = hack_nmf_iter(A,k,.07,16*A.nnz)
    model = ProjectedGradientNMF(n_components=k)
    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print(A.shape)
    print(U1.shape)
    print(V1.shape)

    # Read test data
    (A, movie_ids, user_ids, m_count, u_count) = read_data(
        test_filename, movie_ids, user_ids, m_count, u_count, discard=True)
    (error, del_U, del_V, random_pairs) = evaluate_gradients(
        A, U1, V1, .07, 16 * A.nnz, hard=True)
    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)

    def prediction_line(row, col):
        # One output record, newline-terminated.
        return "%s,%s,%0.5f\n" % (reverse_movie[row], reverse_user[col],
                                  nd.dot(U1[row, :], V1[:, col]))

    # Test on Ratings!
    print("Doing %d test ratings" % A.nnz)
    (n, m) = A.shape
    with open("test.sklearn.predictions", "w") as outfile:
        for row in range(n):
            for row_col_index in range(A.indptr[row], A.indptr[row + 1]):
                col = A.indices[row_col_index]
                outfile.write(prediction_line(row, col))

    # Test on completely random pairs
    with open("test.sklearn.rndpairs.predictions", "w") as outfile:
        for _ in range(1000):
            row = r.randint(0, n - 1)
            # Upper bound is m - 1, matching every other randint call here;
            # with an inclusive randint, `m` would be one past the last column.
            col = r.randint(0, m - 1)
            outfile.write(prediction_line(row, col))

    # Test on difficult distribution that emphasizes non-rated pairs where
    # movies and users are chosen based on rating count.
    with open("test.sklearn.hard.rndpairs.predictions", "w") as outfile:
        for _ in range(1000):
            i = r.randint(0, A.nnz - 1)
            row = find_index(A.indptr, i)
            j = r.randint(0, A.nnz - 1)
            col = A.indices[j]
            # Defensive bounds checks kept from the original.
            if row > A.shape[0] - 1:
                print("%s %s what is going on" % (row, A.shape))
                continue
            if col > A.shape[1] - 1:
                print("%s %s what is going on" % (col, A.shape))
                continue
            outfile.write(prediction_line(row, col))

    print("test rsme %s" % math.sqrt(error))
    for i in range(k):
        print("Factor: %s" % i)
        print_movie_factor(U1, reverse_movie, i)
    return (U1, V1, reverse_movie, reverse_user)
# Bag-of-words counts: vocabulary is learned on the training answers only
count_vect = CountVectorizer(stop_words='english')
answers_train = count_vect.fit_transform(answers_train)
answers_test = count_vect.transform(answers_test)

# Tf-idf re-weighting, again fitted on the training split
tfidf_transformer = TfidfTransformer()
answers_train = tfidf_transformer.fit_transform(answers_train)
answers_test = tfidf_transformer.transform(answers_test)

# NMF fit on training set
nmf_input_shape = str(answers_train.shape)
print("Fitting NMF on training word count matrix with shape" + nmf_input_shape)
nmf = ProjectedGradientNMF(n_components=100, max_iter=200)
answers_train = nmf.fit_transform(answers_train)
answers_test = nmf.transform(answers_test)

# Linear SVM on the NMF features
svm_input_shape = str(answers_train.shape)
print("Fitting SVM classifier on matrix with shape" + svm_input_shape)
svc = svm.LinearSVC()
svc.fit(answers_train, cats_train)

train_accuracy_pct = svc.score(answers_train, cats_train) * 100
print("SVM train classification %: " + str(train_accuracy_pct))
test_accuracy_pct = svc.score(answers_test, cats_test) * 100
print("SVM test classification %: " + str(test_accuracy_pct))

# Baseline: always predict the most common training category
mc_label = Counter(cats_train).most_common(1)[0][0]
baseline_hits = Counter(cats_test)[mc_label]
baseline_pct = float(baseline_hits) / len(cats_test) * 100
print("Best guess % = " + str(baseline_pct))

# Metrics
np.set_printoptions(linewidth=200, precision=3)
cats_pred = svc.predict(answers_test)
# The confusion-matrix call is left disabled, as in the original:
#c = metrics.confusion_matrix(labels_test, csvm.predict(data_test))
def driver_movie_data_test_sklearn(train_filename, test_filename, k):
    """Factorize the training ratings with sklearn NMF and dump predictions.

    Fits ProjectedGradientNMF with `k` components on the matrix read from
    `train_filename`, evaluates it on the matrix read from `test_filename`
    and writes three prediction files into the current directory:

      test.sklearn.predictions               - every stored test entry
      test.sklearn.rndpairs.predictions      - 1000 uniformly random pairs
      test.sklearn.hard.rndpairs.predictions - 1000 pairs whose row and
          column are each drawn via a random stored entry

    Each output line is "<movie>,<user>,<prediction>" where the prediction
    is the dot product of the matching row of U1 and column of V1. The
    matrix is accessed through indptr / indices / nnz, i.e. it is used as
    a CSR-style sparse matrix.

    Returns (U1, V1, reverse_movie, reverse_user).
    """
    (A, movie_ids, user_ids, m_count, u_count) = read_data(train_filename)

    # Do nnmf
    #(U1,V1) = hack_nmf_iter(A,k,.07,16*A.nnz)
    model = ProjectedGradientNMF(n_components=k)
    model.fit(A)
    V1 = model.components_
    U1 = model.transform(A)
    print(A.shape)
    print(U1.shape)
    print(V1.shape)

    # Read test data
    (A, movie_ids, user_ids, m_count, u_count) = read_data(
        test_filename, movie_ids, user_ids, m_count, u_count, discard=True)
    (error, del_U, del_V, random_pairs) = evaluate_gradients(
        A, U1, V1, .07, 16 * A.nnz, hard=True)
    reverse_user = inverse_map(user_ids)
    reverse_movie = inverse_map(movie_ids)

    def format_prediction(row, col):
        # One output record, newline-terminated.
        score = nd.dot(U1[row, :], V1[:, col])
        return "%s,%s,%0.5f\n" % (reverse_movie[row], reverse_user[col], score)

    # Test on Ratings!
    print("Doing %d test ratings" % A.nnz)
    (n, m) = A.shape
    with open("test.sklearn.predictions", "w") as outfile:
        for row in range(n):
            for row_col_index in range(A.indptr[row], A.indptr[row + 1]):
                col = A.indices[row_col_index]
                outfile.write(format_prediction(row, col))

    # Test on completely random pairs
    with open("test.sklearn.rndpairs.predictions", "w") as outfile:
        for _ in range(1000):
            row = r.randint(0, n - 1)
            # Upper bound is m - 1, matching every other randint call here;
            # with an inclusive randint, `m` would be one past the last column.
            col = r.randint(0, m - 1)
            outfile.write(format_prediction(row, col))

    # Test on difficult distribution that emphasizes non-rated pairs where
    # movies and users are chosen based on rating count.
    with open("test.sklearn.hard.rndpairs.predictions", "w") as outfile:
        for _ in range(1000):
            i = r.randint(0, A.nnz - 1)
            row = find_index(A.indptr, i)
            j = r.randint(0, A.nnz - 1)
            col = A.indices[j]
            # Defensive bounds checks kept from the original.
            if row > A.shape[0] - 1:
                print("%s %s what is going on" % (row, A.shape))
                continue
            if col > A.shape[1] - 1:
                print("%s %s what is going on" % (col, A.shape))
                continue
            outfile.write(format_prediction(row, col))

    print("test rsme %s" % math.sqrt(error))
    for i in range(k):
        print("Factor: %s" % i)
        print_movie_factor(U1, reverse_movie, i)
    return (U1, V1, reverse_movie, reverse_user)