def nmf_filter(field, nmodes, return_filter=False, **kwargs_nmf):
    """
    Apply a Non-Negative Matrix Factorisation (NMF) filter to a field. This
    finds two non-negative matrices whose product approximates the (strictly
    non-negative) input signal. Uses `sklearn.decomposition.NMF`. For more
    details, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

    Parameters:
        field (array_like):
            3D array containing the field that the filter will be applied to.
            NOTE: This assumes that the 3rd axis of the array is frequency.
        nmodes (int):
            Number of NMF modes (components) to fit and subtract.
        return_filter (bool, optional):
            Whether to also return the NMF filter object.
        **kwargs_nmf (dict, optional):
            Keyword arguments for `sklearn.decomposition.NMF`.

    Returns:
        cleaned_field (array_like), transformer (sklearn.decomposition.NMF instance, optional):
            Foreground-filtered field and NMF filter object.

            - ``cleaned_field (array_like)``:
                Foreground-cleaned field.
            - ``transformer (sklearn.decomposition.NMF instance, optional)``:
                Contains the NMF filter. Only returned if `return_filter=True`.
                To get the foreground model, you can do the following::

                    x = field.reshape((-1, field.shape[-1])).T  # (Nfreq, Npix)
                    x_trans = transformer.transform(x.T)  # mode amplitudes per pixel
                    x_fg = transformer.inverse_transform(x_trans).T  # foreground model
    """
    # Reshape to a (Nfreqs, Nxpix * Nypix) data matrix. Note that no mean is
    # subtracted: NMF requires a strictly non-negative input.
    x = field.reshape((-1, field.shape[-1])).T

    # Build NMF model and get amplitudes for each mode per pixel
    transformer = NMF(n_components=nmodes, **kwargs_nmf)
    x_trans = transformer.fit_transform(x.T)

    # Construct foreground model
    x_fg = transformer.inverse_transform(x_trans).T

    # Subtract foreground model
    x_clean = (x - x_fg).T.reshape(field.shape)

    # Return FG-subtracted data (and, optionally, the NMF filter instance)
    if return_filter:
        return x_clean, transformer
    else:
        return x_clean
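# A minimal usage sketch for `nmf_filter` above. The synthetic cube, shapes
# and NMF keyword values are illustrative assumptions, not part of the
# original code.
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
# Non-negative cube: 16 x 16 pixels, 32 frequency channels on the 3rd axis.
field = rng.uniform(0.1, 1.0, size=(16, 16, 32))

# Remove 3 NMF modes and also grab the fitted filter object; extra keywords
# are forwarded to sklearn.decomposition.NMF.
cleaned, transformer = nmf_filter(field, nmodes=3, return_filter=True,
                                  init='nndsvda', max_iter=500)
print(cleaned.shape)  # (16, 16, 32), same shape as the input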
def test_perfect_separation_of_latents():
    latents = data()
    W, H, mapping = generate_w_h(latents, n_users=100)
    nmf = NMF(solver='mu', init='custom', n_components=3)
    nmf.components_ = H
    nmf.n_components_ = H.shape[0]
    X = nmf.inverse_transform(W)
def nmf(data, n_components, norm=True, plot=False):
    """
    Computes Non-Negative Matrix Factorization.

    :param data: 2D array of non-negative input data (samples x features).
    :param n_components: Number of components.
    :param norm: If True, normalize the input and the returned components
        with MinMaxScaler. Default True.
    :param plot: If True, plot each component. Default False.
    :return: Fitted NMF model, component matrix, and explained variance score.
    """
    if norm:
        data = MinMaxScaler().fit_transform(data)
    nmf = NMF(n_components=n_components)
    c = nmf.fit_transform(data)
    predictions = nmf.inverse_transform(c)
    explained_variance = explained_variance_score(data, predictions)
    if norm:
        c = MinMaxScaler().fit_transform(c)
    if plot:
        for i in range(c.shape[1]):
            plt.plot(c[:, i], label='%s_%s' % ("NMF", i))
    return nmf, c, explained_variance
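# A hedged usage sketch for the helper above; the random data and the
# imports it relies on are illustrative assumptions.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import explained_variance_score

rng = np.random.default_rng(42)
X = rng.random((200, 10))  # non-negative data: 200 samples, 10 features

model, components, ev = nmf(X, n_components=3, norm=True, plot=False)
print(components.shape)  # (200, 3): one row of mode amplitudes per sample
print('explained variance: %.3f' % ev)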
def main():
    train_data, train_length = get_train_data()
    test_data, test_length, width, height = get_test_data()
    model = NMF(n_components=5, init='random', random_state=0)
    W = model.fit_transform(train_data)
    H = model.components_
    compressed_images = model.transform(test_data)
    output_images = model.inverse_transform(compressed_images)
    output_length = len(output_images)
    rgb_length = int(output_length / 3)
    reconstruct_subimages = np.zeros([height * width, 25, 3], dtype=np.float32)
    for channels in range(3):
        reconstruct_subimages[:, :, channels] = output_images[
            (rgb_length * channels):(rgb_length * (channels + 1)), :]
    all_image_rec = np.zeros([25, height, width, 3], dtype=np.float32)
    for x in range(width):
        for y in range(height):
            all_image_rec[:, y, x, :] = reconstruct_subimages[y * width + x, :] * 255
    for numbers in range(25):
        all_image_rec[numbers, :, :, :] = cv2.cvtColor(
            all_image_rec[numbers, :, :, :], cv2.COLOR_BGR2RGB)
        cv2.imwrite("./Reconstruct/" + str(numbers + 1) + "_" + "NMF" + ".png",
                    all_image_rec[numbers, :, :, :])
def nmf_ratings_predicate(observed_ratings_df, truth_ratings_df,
                          fold='0', setting='eval'):
    """
    nmf_ratings Predicates
    """
    nmf_model = NMF(n_components=50)
    observed_user_item_matrix = observed_ratings_df.loc[:, 'rating'].unstack(
        fill_value=0.5)
    truth_user_item_matrix = truth_ratings_df.loc[:, 'rating'].unstack()

    transformed_matrix = nmf_model.fit_transform(observed_user_item_matrix)
    predictions = pd.DataFrame(nmf_model.inverse_transform(transformed_matrix),
                               index=observed_user_item_matrix.index,
                               columns=observed_user_item_matrix.columns)

    # make predictions for the user item pairs in the truth frame
    predictions = predictions.reindex(truth_user_item_matrix.index,
                                      columns=truth_user_item_matrix.columns,
                                      fill_value=0.5).stack()
    predictions = predictions.clip(0, 1)

    write(predictions, 'nmf_rating_obs', fold, setting)
def test(cls, csv, K=3, dr='PCA'):
    '''
    csv - A csv file without header.
    '''
    from sklearn.decomposition import PCA, NMF
    from sklearn.random_projection import GaussianRandomProjection
    from sklearn.manifold import MDS, TSNE
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import OneHotEncoder

    X = pd.read_csv(csv, header=None).values
    Z = None
    Xr = None

    if (dr == 'PCA'):
        pca = PCA(n_components=K)  # keep the first K components
        pca.fit(X)
        Z = pca.transform(X)
        Xr = pca.inverse_transform(Z)
    elif (dr == 'NMF'):
        # make sure X is non-negative
        Xmin = np.min(X)
        if (Xmin < 0):
            X = X - Xmin
        nmf = NMF(n_components=K)  # keep the first K components
        nmf.fit(X)
        Z = nmf.transform(X)
        Xr = nmf.inverse_transform(Z)
        if (Xmin < 0):
            Xr = Xr + Xmin
    elif (dr == 'RP'):
        grp = GaussianRandomProjection(n_components=K)  # keep the first K components
        Z = grp.fit_transform(X)
    elif (dr == 'VQ'):
        kmeans = KMeans(n_clusters=K).fit(X)
        Xvq = kmeans.predict(X)
        H = kmeans.cluster_centers_
        ohe = OneHotEncoder()
        Z = ohe.fit_transform(Xvq.reshape(-1, 1)).toarray()
        Xr = Z @ H
    elif (dr == 'MDS'):
        mds = MDS(n_components=K)  # keep the first K components
        Z = mds.fit_transform(X)
    elif (dr == 'TSNE'):
        tsne = TSNE(n_components=K)  # keep the first K components (was MDS by mistake)
        Z = tsne.fit_transform(X)
    elif (dr == 'IDENTITY'):  # for this case, K is not used.
        Z = X
        Xr = X
    else:
        raise Exception("Invalid DR name")
    return cls(X, Z, Xr)
def test_nmf_inverse_transform(solver):
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(solver=solver, n_components=4, init='random', random_state=0,
            max_iter=1000)
    ft = m.fit_transform(A)
    A_new = m.inverse_transform(ft)
    assert_array_almost_equal(A, A_new, decimal=2)
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(n_components=4, init="random", random_state=0)
    m.fit_transform(A)
    t = m.transform(A)
    A_new = m.inverse_transform(t)
    assert_array_almost_equal(A, A_new, decimal=2)
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    # NOTE: the 'pg' (projected gradient) solver only exists in old
    # scikit-learn releases; it was removed in 0.19.
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='random', random_state=0)
        m.fit_transform(A)
        t = m.transform(A)
        A_new = m.inverse_transform(t)
        assert_array_almost_equal(A, A_new, decimal=2)
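# The tests above all rely on the same identity: for scikit-learn's NMF,
# inverse_transform(W) is simply the matrix product of the mode amplitudes
# with the fitted components. A minimal sketch verifying this (the data
# below is an illustrative assumption):
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
A = np.abs(rng.randn(6, 4))

m = NMF(n_components=2, init='random', random_state=0, max_iter=500)
W = m.fit_transform(A)

# inverse_transform(W) reconstructs the data as W @ components_
np.testing.assert_allclose(m.inverse_transform(W), W @ m.components_)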
def test_nmf_on_masked_ratings():
    _matrix_1 = nan_masked_from_ratings(
        [Rating(0, 3, 1.0), Rating(5, 3, 0.5), Rating(5, 8, 1.0)],
        rows=10, columns=10)
    _nmf = NMF(solver='mu', init='random', n_components=2)
    W = _nmf.fit_transform(_matrix_1)
    X = _nmf.inverse_transform(W)
    assert np.shape(X) == np.shape(_matrix_1)
    recommended = np.argsort(X[0])
    assert recommended[-1] == 8
    assert recommended[-2] == 3
def generate_array_data_file():
    latents = data()
    # W = np.zeros(shape=(n_users + 1, n_components))
    # H = np.zeros(shape=(n_components, n_items))
    n_users = 100
    W, H, mapping = generate_w_h(latents, n_users=n_users, use_random=True,
                                 sigma=.5, mean=-1.5)
    nmf = NMF(solver='mu', init='custom', n_components=3)
    nmf.components_ = H
    nmf.n_components_ = H.shape[0]
    X = nmf.inverse_transform(W)
    # returns a shape of n_users+1. Wipe these ratings since they are not yet determined.
    X[n_users, :] = np.nan
    file_contents = dedent('''
    namespace MonsterMatch.CollaborativeFiltering
    {
        public static class MonsterMatchArrayData
        {
            // @formatter:off
            public const int UserCount = %d;
            public const int ItemCount = %d;
            public const int PlayerUserId = %d;
            public const int FactorCount = %d;
            public static readonly int[] ForProfiles = %s;
            public static readonly double[,] Data = %s;
            public static readonly double[,] Pu = %s;
            public static readonly double[,] Qi = %s;
            // @formatter:on
        }
    }
    ''')
    file_contents = file_contents % (
        n_users + 1, len(latents), n_users, nmf.n_components_,
        'new [] {' + ','.join([str(profile.index) for profile in latents]) + '}',
        csharp_repr_ndarray(X), csharp_repr_ndarray(W), csharp_repr_ndarray(H.T))
    print(file_contents)
def compute_scores(X):
    pca = PCA(svd_solver='auto')
    kpca = KernelPCA(fit_inverse_transform=True)
    ica = FastICA()
    nmf = NMF(init='nndsvda')

    pca_scores, ica_scores, nmf_scores, kpca_scores = [], [], [], []
    for n in n_components:
        pca.n_components = n
        ica.n_components = n
        nmf.n_components = n
        kpca.n_components = n
        print(n)
        # NOTE: Xs (a scaled/centred copy defined outside this function) is
        # used for PCA/ICA/KPCA, while NMF gets the raw non-negative X.
        Xpca = pca.inverse_transform(pca.fit_transform(Xs))
        pca_scores.append(explained_variance_score(Xs, Xpca))
        Xica = ica.inverse_transform(ica.fit_transform(Xs))
        ica_scores.append(explained_variance_score(Xs, Xica))
        Xkpca = kpca.inverse_transform(kpca.fit_transform(Xs))
        kpca_scores.append(explained_variance_score(Xs, Xkpca))
        Xnmf = nmf.inverse_transform(nmf.fit_transform(X))
        nmf_scores.append(explained_variance_score(X, Xnmf))
    return pca_scores, ica_scores, nmf_scores, kpca_scores
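# A sketch of how the scorer above might be driven. `Xs` and `n_components`
# are module-level names in the original, so everything defined here is an
# illustrative assumption and must live in the same module as compute_scores.
import numpy as np
from sklearn.decomposition import PCA, KernelPCA, FastICA, NMF
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score

rng = np.random.default_rng(0)
X = np.abs(rng.normal(size=(100, 20)))   # non-negative input for NMF
Xs = StandardScaler().fit_transform(X)   # centred/scaled copy for PCA/ICA/KPCA
n_components = [2, 4, 8]

scores = compute_scores(X)
for name, vals in zip(['PCA', 'ICA', 'NMF', 'KPCA'], scores):
    print(name, np.round(vals, 3))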
new_cpsi = np.linspace(np.max((exp_cpsi2.min(), sim_cpsi2.min())) + 0.05,
                       np.min((exp_cpsi2.max(), sim_cpsi2.max())) - 0.05,
                       interp_num_phi, endpoint=False)
interp_cpsi[qidx] = new_cpsi
interp_X = interp_shots(norm_X2, interp_num_phi, sim_cpsi2, new_cpsi)
interp_pro = interp_shots(norm_GDPpro2, interp_num_phi, exp_cpsi2, new_cpsi)
interp_buf = interp_shots(norm_buf2, interp_num_phi, exp_cpsi2, new_cpsi)

# transform and inverse transform
model = NMF(n_components=10, solver='cd')
W = model.fit_transform(interp_X)
H = model.components_
new_buf = model.transform(interp_buf)
new_pro = model.transform(interp_pro)
inverse_diff = model.inverse_transform(new_pro - new_buf)

# average and error estimate
pro[qidx] = inverse_diff.mean(0)
err[qidx] = inverse_diff.std(0) / np.sqrt(inverse_diff.shape[0])

grp.create_dataset('ave_cor', data=pro)
grp.create_dataset('err', data=err)
grp.create_dataset('num_shots', data=inverse_diff.shape[0])
grp.create_dataset('interp_cpsi', data=interp_cpsi)
grp.create_dataset('nnmf_n_components', data=model.n_components)
# W = samples x n_components (component mapping for each sample)
fig = plt.figure(figsize=(8, 8))
rows = min(int(np.sqrt(train_samples)), 10)
columns = min(int(train_samples / rows), 10)
for i in range(columns * rows):
    img = W[i].reshape(
        (int(np.sqrt(nmf_components)), int(np.sqrt(nmf_components))))
    fig.add_subplot(rows, columns, i + 1)
    plt.axis('off')
    plt.imshow(img)
plt.show()

# ===============================================
# Reconstruction
# ===============================================
output = nmf.inverse_transform(W)

fig = plt.figure(figsize=(8, 8))
rows = min(int(np.sqrt(train_samples)), 10)
columns = min(int(train_samples / rows), 10)
for i in range(columns * rows):
    img = output[i].reshape((28, 28))
    fig.add_subplot(rows, columns, i + 1)
    plt.axis('off')
    plt.imshow(img)
plt.show()

# ===============================================
import numpy as np
import onnx
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
from skl2onnx.algebra.onnx_ops import (OnnxArrayFeatureExtractor, OnnxMul,
                                       OnnxReduceSum)
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession
from sklearn.decomposition import NMF

mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.float64)
mat[:mat.shape[1], :] += np.identity(mat.shape[1])

mod = NMF(n_components=2)
W = mod.fit_transform(mat)
H = mod.components_
pred = mod.inverse_transform(W)

print("original predictions")
exp = []
for i in range(mat.shape[0]):
    for j in range(mat.shape[1]):
        exp.append((i, j, pred[i, j]))
print(exp)

#######################
# Let's rewrite the prediction in a way it is closer
# to the function we need to convert into ONNX.


def predict(W, H, row_index, col_index):
    # single-entry prediction: dot product of one row of W with one column of H
    return np.dot(W[row_index, :], H[:, col_index])
class Operation:
    contentStrengthFramework = ''  # Pandas DataFrame of overall content popularity
    contentStrengthRegionAndCountryWise = ''  # Pandas DataFrame of content popularity per region/country
    personCountryAndRegionDict = {}
    ratingsDf = ''
    contentBasedSimilarity = {}
    itemList = []
    nmfmodel = ''

    def __init__(self):
        pass

    def setItemList(self, itemList):
        self.itemList = itemList

    def setContentBasedSimilarity(self, contentBasedSimilarity):
        self.contentBasedSimilarity = contentBasedSimilarity

    def performanceMatrixFormation(self, userContentInteractionFramework):
        self.contentStrengthFramework = userContentInteractionFramework.groupby(
            'contentId')['eventStrength'].sum().sort_values(
                ascending=False).reset_index()
        self.contentStrengthRegionAndCountryWise = userContentInteractionFramework.groupby(
            ['userCountry', 'userRegion',
             'contentId'])['eventStrength'].sum().sort_values(
                 ascending=False).reset_index()

    def personDictFormation(self, userContentInteractionFramework):
        for i in range(len(userContentInteractionFramework)):
            try:
                personId = userContentInteractionFramework.loc[i]['personId']
                if personId not in self.personCountryAndRegionDict:
                    self.personCountryAndRegionDict[personId] = {}
                    self.personCountryAndRegionDict[personId]['Country'] = ''
                    self.personCountryAndRegionDict[personId]['Region'] = ''
                if type(userContentInteractionFramework.loc[i]['userCountry']) != float:
                    self.personCountryAndRegionDict[personId]['Country'] = \
                        userContentInteractionFramework.loc[i]['userCountry']
                if type(userContentInteractionFramework.loc[i]['userRegion']) != float:
                    self.personCountryAndRegionDict[personId]['Region'] = \
                        userContentInteractionFramework.loc[i]['userRegion']
            except:
                print("error occurred at the i-th iteration ", i)

    def matrixFactorization(self, userContentInteractionFrameworkTrain, numOfFactor):
        matrixDF = userContentInteractionFrameworkTrain.pivot(
            index='personId', columns='contentId',
            values='eventStrength').fillna(0)
        matrix = matrixDF.values  # .as_matrix() was removed from pandas
        u, sigma, vT = svds(matrix, k=numOfFactor)
        sigma = numpy.diag(sigma)
        predictedMatrix = numpy.dot(numpy.dot(u, sigma), vT)
        self.ratingsDf = pd.DataFrame(predictedMatrix,
                                      columns=matrixDF.columns,
                                      index=list(matrixDF.index)).transpose()

    def matrixFactorizationCluster(self, userContentInteractionFrameworkTrain,
                                   numOfFactor, k=2):
        matrixDF = userContentInteractionFrameworkTrain.pivot(
            index='personId', columns='contentId',
            values='eventStrength').fillna(0)
        matrix = matrixDF.values
        a = []
        b = []
        kmeans = KMeans(n_clusters=k, random_state=0).fit(matrix)
        # collect row indices per cluster (the original appended the label
        # values themselves, which indexed rows 0/1 repeatedly)
        for idx, label in enumerate(kmeans.labels_):
            if label == 1:
                b.append(idx)
            else:
                a.append(idx)
        u, sigma, vT = svds(matrix[a], k=numOfFactor)
        sigma = numpy.diag(sigma)
        predictedMatrixA = numpy.dot(numpy.dot(u, sigma), vT)
        u, sigma, vT = svds(matrix[b], k=numOfFactor)
        sigma = numpy.diag(sigma)
        predictedMatrixB = numpy.dot(numpy.dot(u, sigma), vT)
        predictedMatrix = numpy.zeros(shape=(matrix.shape))
        predictedMatrix[a] = predictedMatrixA
        predictedMatrix[b] = predictedMatrixB
        self.ratingsDfCluster = pd.DataFrame(predictedMatrix,
                                             columns=matrixDF.columns,
                                             index=list(matrixDF.index)).transpose()

    def matrixFactorizationNMF(self, userContentInteractionFrameworkTrain,
                               numOfFactor):
        matrixDF = userContentInteractionFrameworkTrain.pivot(
            index='personId', columns='contentId',
            values='eventStrength').fillna(0)
        matrix = matrixDF.values
        self.nmfmodel = NMF(n_components=numOfFactor)
        W = self.nmfmodel.fit_transform(matrix)
        self.matrixnmf = self.nmfmodel.inverse_transform(W)
        self.matrixnmf = pd.DataFrame(self.matrixnmf,
                                      columns=matrixDF.columns,
                                      index=list(matrixDF.index)).transpose()

    def recommendation(self, personId, userContentInteractionFrameworkTrain,
                       topk, isRegionMatter=True):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        if personId in self.personCountryAndRegionDict and isRegionMatter:
            if self.personCountryAndRegionDict[personId]['Region'] != '':
                contentStrengthRegionAndCountryWise = self.contentStrengthRegionAndCountryWise[
                    ~self.contentStrengthRegionAndCountryWise.contentId.isin(contentList)]
                recommendedRegionWise = contentStrengthRegionAndCountryWise[
                    contentStrengthRegionAndCountryWise['userRegion'] ==
                    self.personCountryAndRegionDict[personId]['Region']][
                        'contentId'].head(curk).tolist()
                recommended.extend(list(set(recommendedRegionWise)))
                curk = topk - len(recommended)
                if curk <= 0:
                    return recommended
                contentList.extend(recommended)
            if self.personCountryAndRegionDict[personId]['Country'] != '':
                contentStrengthRegionAndCountryWise = self.contentStrengthRegionAndCountryWise[
                    ~self.contentStrengthRegionAndCountryWise.contentId.isin(contentList)]
                recommendedCountryWise = contentStrengthRegionAndCountryWise[
                    contentStrengthRegionAndCountryWise['userCountry'] ==
                    self.personCountryAndRegionDict[personId]['Country']][
                        'contentId'].head(curk).tolist()
                recommendedCountryWise = list(set(recommendedCountryWise))
                recommended.extend(recommendedCountryWise)
                contentList.extend(recommendedCountryWise)
                curk = topk - len(recommended)
                if curk <= 0:
                    return recommended
        while (curk > 0):
            recommendedBasedOnPopularity = self.contentStrengthFramework[
                ~self.contentStrengthFramework.contentId.isin(contentList)][
                    'contentId'].head(curk).tolist()
            recommendedBasedOnPopularity = list(set(recommendedBasedOnPopularity))
            contentList.extend(recommendedBasedOnPopularity)
            recommended.extend(recommendedBasedOnPopularity)
            curk -= len(recommendedBasedOnPopularity)
        return recommended

    def cfRecommendation(self, personId, userContentInteractionFrameworkTrain,
                         topk):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        while (curk > 0):
            ratings = self.ratingsDf[personId].sort_values(
                ascending=False).reset_index()
            recommendedBasedOnMatrix = ratings[~ratings.contentId.isin(
                contentList)]['contentId'].head(curk).tolist()
            recommendedBasedOnMatrix = list(set(recommendedBasedOnMatrix))
            contentList.extend(recommendedBasedOnMatrix)
            recommended.extend(recommendedBasedOnMatrix)
            curk -= len(recommendedBasedOnMatrix)
        return recommended

    def cfRecommendationCluster(self, personId,
                                userContentInteractionFrameworkTrain, topk):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        while (curk > 0):
            ratings = self.ratingsDfCluster[personId].sort_values(
                ascending=False).reset_index()
            recommendedBasedOnMatrix = ratings[~ratings.contentId.isin(
                contentList)]['contentId'].head(curk).tolist()
            recommendedBasedOnMatrix = list(set(recommendedBasedOnMatrix))
            contentList.extend(recommendedBasedOnMatrix)
            recommended.extend(recommendedBasedOnMatrix)
            curk -= len(recommendedBasedOnMatrix)
        return recommended

    def cfRecommendationNMF(self, personId,
                            userContentInteractionFrameworkTrain, topk):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        while (curk > 0):
            ratings = self.matrixnmf[personId].sort_values(
                ascending=False).reset_index()
            recommendedBasedOnMatrix = ratings[~ratings.contentId.isin(
                contentList)]['contentId'].head(curk).tolist()
            recommendedBasedOnMatrix = list(set(recommendedBasedOnMatrix))
            contentList.extend(recommendedBasedOnMatrix)
            recommended.extend(recommendedBasedOnMatrix)
            curk -= len(recommendedBasedOnMatrix)
        return recommended

    def evaluation(self, userContentInteractionFrameworkTrain,
                   userContentInteractionFrameworkTest, topk,
                   isRegionMatter=True, recommendartionType=1):
        toreturn = {}
        count = 0
        for idx, i in enumerate(
                list(userContentInteractionFrameworkTest.index.unique().values)):
            personId = userContentInteractionFrameworkTest.loc[i]['personId']
            if recommendartionType == 1:
                predicted = self.recommendation(
                    personId, userContentInteractionFrameworkTrain, topk,
                    isRegionMatter)
            elif recommendartionType == 2:
                predicted = self.cfRecommendationNMF(
                    personId, userContentInteractionFrameworkTrain, topk)
            elif recommendartionType == 3:
                predicted = self.cfRecommendationCluster(
                    personId, userContentInteractionFrameworkTrain, topk)
            else:
                predicted = self.cfRecommendation(
                    personId, userContentInteractionFrameworkTrain, topk)
            actual = userContentInteractionFrameworkTest[
                userContentInteractionFrameworkTest['personId'] ==
                personId]['contentId'].tolist()
            actual = list(set(actual))
            toreturn[personId] = {}
            toreturn[personId]['predicted'] = predicted
            toreturn[personId]['actual'] = actual
            denom = topk
            if len(actual) < topk:
                denom = len(actual)
            numer = len(list(set(actual).intersection(predicted)))
            recall = numer / float(denom)
            toreturn[personId]['recall'] = recall
            toreturn[personId]['numerator'] = numer
            toreturn[personId]['denominator'] = denom
            if personId not in self.personCountryAndRegionDict:
                count += 1
        print(count,
              len(list(userContentInteractionFrameworkTest.index.unique().values)))
        return toreturn

    def contentBasedRecommendation(self, personId,
                                   userContentInteractionFrameworkTrain, topk):
        ratings = self.contentBasedSimilarity[personId]
        recommended = [
            x for _, x in sorted(zip(ratings, self.itemList), reverse=True)
        ]
        return recommended[:topk]

    def intersectionAmongList(self, listOfLists):
        toreturn = listOfLists[0]
        for lol in listOfLists:
            toreturn = list(set(toreturn).intersection(lol))
        return toreturn  # BUG FIX: the original returned `lol` (the last list)

    # weights = [CFbased, contentBased, Country And Region wise popularityBased, popularityBased]
    def hybridModelEvaluation(self, userContentInteractionFrameworkTrain,
                              userContentInteractionFrameworkTest, topk,
                              weights=[5, 2, 3, 0]):
        toreturn = {}
        count = 0
        for idx, i in enumerate(
                list(userContentInteractionFrameworkTest.index.unique().values)):
            personId = userContentInteractionFrameworkTest.loc[i]['personId']
            predicted3 = self.recommendation(
                personId, userContentInteractionFrameworkTrain, topk, True)
            predicted4 = self.recommendation(
                personId, userContentInteractionFrameworkTrain, topk, False)
            predicted1 = self.cfRecommendation(
                personId, userContentInteractionFrameworkTrain, topk)
            try:
                predicted2 = self.contentBasedRecommendation(
                    personId, userContentInteractionFrameworkTrain, topk)
            except:
                predicted2 = []
            if len(predicted2) == 0:
                predicted = self.intersectionAmongList(
                    [predicted1, predicted3, predicted4])
            else:
                predicted = self.intersectionAmongList(
                    [predicted1, predicted2, predicted3, predicted4])
            predicted1 = list(set(predicted1) - set(predicted))
            predicted2 = list(set(predicted2) - set(predicted))
            predicted3 = list(set(predicted3) - set(predicted))
            predicted4 = list(set(predicted4) - set(predicted))
            if len(predicted) < topk:
                # NOTE: both branches below are identical in the original code
                if len(predicted2) == 0:
                    predicted.extend(
                        self.intersectionAmongList([predicted1, predicted3]))
                else:
                    predicted.extend(
                        self.intersectionAmongList([predicted1, predicted3]))
                predicted1 = list(set(predicted1) - set(predicted))
                predicted2 = list(set(predicted2) - set(predicted))
                predicted3 = list(set(predicted3) - set(predicted))
                predicted4 = list(set(predicted4) - set(predicted))
                if len(predicted) < topk:
                    remain = topk - len(predicted)
                    for w, lol in zip(weights, [predicted1, predicted2,
                                                predicted3, predicted4]):
                        if remain <= 0:
                            predicted = predicted[:topk]
                        else:
                            res = math.ceil(0.1 * w * len(lol))
                            predicted.extend(lol[:res])
                            remain = topk - len(predicted)
                else:
                    predicted = predicted[:topk]
            else:
                predicted = predicted[:topk]
            actual = userContentInteractionFrameworkTest[
                userContentInteractionFrameworkTest['personId'] ==
                personId]['contentId'].tolist()
            actual = list(set(actual))
            toreturn[personId] = {}
            toreturn[personId]['predicted'] = predicted
            toreturn[personId]['actual'] = actual
            denom = topk
            if len(actual) < topk:
                denom = len(actual)
            numer = len(list(set(actual).intersection(predicted)))
            recall = numer / float(denom)
            toreturn[personId]['recall'] = recall
            toreturn[personId]['numerator'] = numer
            toreturn[personId]['denominator'] = denom
            if personId not in self.personCountryAndRegionDict:
                count += 1
        print(count,
              len(list(userContentInteractionFrameworkTest.index.unique().values)))
        return toreturn

    # weights = [CFbased, contentBased, Country And Region wise popularityBased, popularityBased]
    def latestHybridModelEvaluation(self, userContentInteractionFrameworkTrain,
                                    userContentInteractionFrameworkTest, topk,
                                    weights=[5, 5]):
        toreturn = {}
        count = 0
        for idx, i in enumerate(
                list(userContentInteractionFrameworkTest.index.unique().values)):
            personId = userContentInteractionFrameworkTest.loc[i]['personId']
            predicted1 = self.cfRecommendation(
                personId, userContentInteractionFrameworkTrain, topk * 2)
            predicted2 = self.cfRecommendationNMF(
                personId, userContentInteractionFrameworkTrain, topk * 2)
            predicted = self.intersectionAmongList([predicted1, predicted2])
            predicted1 = list(set(predicted1) - set(predicted))
            predicted2 = list(set(predicted2) - set(predicted))
            if len(predicted) < topk:
                for w, lol in zip(weights, [predicted1, predicted2]):
                    res = math.ceil(0.1 * w * len(lol))
                    predicted.extend(lol[:res])
                    remain = topk - len(predicted)
                    if remain <= 0:
                        predicted = predicted[:topk]
            else:
                predicted = predicted[:topk]
            actual = userContentInteractionFrameworkTest[
                userContentInteractionFrameworkTest['personId'] ==
                personId]['contentId'].tolist()
            actual = list(set(actual))
            toreturn[personId] = {}
            toreturn[personId]['predicted'] = predicted
            toreturn[personId]['actual'] = actual
            denom = topk
            if len(actual) < topk:
                denom = len(actual)
            numer = len(list(set(actual).intersection(predicted)))
            recall = numer / float(denom)
            toreturn[personId]['recall'] = recall
            toreturn[personId]['numerator'] = numer
            toreturn[personId]['denominator'] = denom
        return toreturn

    def globalRecallCalc(self, recallDict, comp=0.1):
        numer = 0.0
        denom = 0.0
        recallList = []
        count = 0
        total = 0
        for key in recallDict:
            recallList.append(recallDict[key]['recall'])
            numer += recallDict[key]['numerator']
            denom += recallDict[key]['denominator']
            total += 1
            if recallDict[key]['recall'] >= comp:
                count += 1
        try:
            val = numer / denom
        except:
            val = 0
        return val, numpy.mean(recallList), numpy.median(recallList), \
            count / float(total)

    def plotBar(self, xlabel, ylabel, label, scores, title):
        plt.bar(numpy.arange(len(label)), scores)
        plt.xlabel(xlabel, fontsize=5)
        plt.ylabel(ylabel, fontsize=5)
        # tick positions must match the bar positions (the original passed
        # `scores` as the positions)
        plt.xticks(numpy.arange(len(label)), label, fontsize=5, rotation=45)
        plt.title(title)
        plt.savefig('project/' + title + '.jpg')
        plt.show()

    def conversionIntoGlobalFormatDictionary(self, actualDict, predictedDict,
                                             topk):
        toreturn = {}
        for personId in actualDict:
            try:
                predicted = predictedDict[personId]
                actual = actualDict[personId]
                toreturn[personId] = {}
                toreturn[personId]['predicted'] = predictedDict[personId]
                toreturn[personId]['actual'] = actualDict[personId]
                denom = topk
                if len(actual) < topk:
                    denom = len(actual)
                numer = len(list(set(actual).intersection(predicted)))
                recall = numer / float(denom)
                toreturn[personId]['recall'] = recall
                toreturn[personId]['numerator'] = numer
                toreturn[personId]['denominator'] = denom
            except:
                pass
        return toreturn
print(tficfVectorsTrain.shape)
print("test dataset")
print(tficfVectorsTest.shape)
print("")

print("Reduce Dimension: LSI")
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
lsiVectorsTrain = svd.fit_transform(tficfVectorsTrain)
print(lsiVectorsTrain)
print(lsiVectorsTrain.shape)
b = svd.inverse_transform(lsiVectorsTrain)
print("error:")
print(np.linalg.norm(tficfVectorsTrain - b, ord='fro'))
print("Apply on test dataset")
print(tficfVectorsTest.shape)
lsiVectorsTest = svd.transform(tficfVectorsTest)
print(lsiVectorsTest)
print(lsiVectorsTest.shape)
print("")

print("Reduce Dimension: NMF")
nmf = NMF(n_components=50, init='random', random_state=0)
nmfVectorsTrain = nmf.fit_transform(tficfVectorsTrain)
print(nmfVectorsTrain)
print(nmfVectorsTrain.shape)
print("error:")
b = nmf.inverse_transform(nmfVectorsTrain)
print(b.shape)
print(np.linalg.norm(tficfVectorsTrain - b, ord='fro'))
print("Apply on test dataset")
nmfVectorsTest = nmf.transform(tficfVectorsTest)
print(nmfVectorsTest)
print(nmfVectorsTest.shape)
# (opening of the np.linspace call reconstructed from the parallel snippet above)
new_cpsi = np.linspace(np.max((exp_cpsi2.min(), sim_cpsi2.min())) + 0.05,
                       np.min((exp_cpsi2.max(), sim_cpsi2.max())) - 0.05,
                       interp_num_phi, endpoint=False)
interp_X = interp_shots(X, interp_num_phi, sim_cpsi2, new_cpsi)
interp_pro = interp_shots(shots, interp_num_phi, exp_cpsi2, new_cpsi)
print('constructing filter...')

# transform and inverse transform
model = NMF(n_components=n_comp, solver='cd')
W = model.fit_transform(interp_X)
H = model.components_
new_pro = model.transform(interp_pro)
inverse_pro = model.inverse_transform(new_pro)

# average and error estimate
pro[qidx] = inverse_pro.mean(0)
err[qidx] = inverse_pro.std(0) / np.sqrt(inverse_pro.shape[0])
interp_cpsi[qidx] = new_cpsi

grp.create_dataset('all_filtered_cor', data=inverse_pro)
grp.create_dataset('W_matrix', data=new_pro)
grp.create_dataset('H_matrix', data=H)

f_out.create_dataset('ave_cor', data=pro)
f_out.create_dataset('err', data=err)
f_out.create_dataset('unfiltered_ave_cor', data=original_pro)
f_out.create_dataset('unfiltered_err', data=original_err)
ix = np.argsort(X)
X = X[ix]
emis = emis[ix, :]
OD = -np.log(1 - emis)

pcaOD = PCA(whiten=True, n_components=48)
ica = FastICA(n_components=36, max_iter=5000)
ODIR = ica.fit_transform(OD)
# Reconstruct signals
OD2 = ica.inverse_transform(ODIR)
emis2 = 1 - np.exp(-OD2)
A_ = ica.mixing_  # Get estimated mixing matrix

nmf = NMF(n_components=48)
ODNR = nmf.fit_transform(OD)
OD2 = nmf.inverse_transform(ODNR)
emis2 = 1 - np.exp(-OD2)

N = 48
knots = np.linspace(X.min(), X.max(), N)[1:-1]
tck = splrep(X, -np.log(emis[:, 350]), t=knots)
t = tck[0]
c = np.zeros((emis.shape[-1], tck[1].size))
k = tck[2]
for ii in range(emis.shape[-1]):
    tck = splrep(X, -np.log(emis[:, ii]), t=knots)
    c[ii, :] = tck[1]


def emisFcn(X, tck):
print()

# Print information
print("Clustering sparse data with k-means with k = 2...")
print()

# K-Means clustering with k = kvalue = 2
t0 = time()
km = KMeans(n_clusters=kvalue, init='k-means++', max_iter=100, n_init=1,
            verbose=False)
# km = MiniBatchKMeans(n_clusters=2, init='k-means++', n_init=1,
#                      init_size=1000, batch_size=1000, verbose=False)
km.fit(X_nmf)

original_space_centroids = svd1.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
cm = metrics.confusion_matrix(labels, km.labels_)

# Print information
print("-------------------------Processing Finished 3---------------------------")
print("Cluster sparse data done with k-means with k = 2 in %fs" % (time() - t0))
print("This k-means cluster with dimensionality reduction using NMF without "
      "non-linear transformation")
print("Top 10 terms per cluster:")
for i in range(kvalue):
    print("Cluster %d:" % i, end='')
from sklearn.datasets import load_iris

# Load the data
X, _ = load_iris(return_X_y=True)

# The most important parameters are n_components, alpha, l1_ratio and solver
nmf = NMF(
    n_components=2,         # k value; by default all features are kept
    init=None,              # W/H initialisation: 'random' | 'nndsvd' (default) | 'nndsvda' | 'nndsvdar' | 'custom'
    solver='cd',            # 'cd' | 'mu'
    beta_loss='frobenius',  # {'frobenius', 'kullback-leibler', 'itakura-saito'}; the default is usually fine
    tol=1e-4,               # convergence tolerance for stopping the iterations
    max_iter=200,           # maximum number of iterations
    random_state=None,
    alpha=0.,               # regularisation strength
    l1_ratio=0.,            # L1/L2 regularisation mix
    verbose=0,              # verbosity mode
    shuffle=False           # only used by the 'cd' solver
)

# ----------------- methods ------------------------
print('params:', nmf.get_params())  # constructor parameter values (also
# available as attributes via nmf.<attr>, so they are omitted below)
# The four calls below are simple and form the core of the API
nmf.fit(X)
W = nmf.fit_transform(X)
W = nmf.transform(X)
nmf.inverse_transform(W)

# ----------------- attributes ------------------------
H = nmf.components_  # the H matrix
print('reconstruction_err_', nmf.reconstruction_err_)  # value of the loss function
print('n_iter_', nmf.n_iter_)  # actual number of iterations
def test_custom_nmf(self):
    mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                    [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.float64)
    mat[:mat.shape[1], :] += np.identity(mat.shape[1])

    mod = NMF(n_components=2)
    W = mod.fit_transform(mat)
    H = mod.components_

    def predict(W, H, row_index, col_index):
        return np.dot(W[row_index, :], H[:, col_index])

    pred = mod.inverse_transform(W)
    exp = []
    got = []
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            exp.append((i, j, pred[i, j]))
            got.append((i, j, predict(W, H, i, j)))
    max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got))
    assert max_diff <= 1e-5

    def nmf_to_onnx(W, H):
        """
        Converts an NMF described by matrices *W*, *H* (*WH* approximates
        the training data *M*) into a function which takes two indices
        *(i, j)* and returns the prediction for them. It assumes these
        indices apply to the training data.
        """
        col = OnnxArrayFeatureExtractor(H, 'col')
        row = OnnxArrayFeatureExtractor(W.T, 'row')
        dot = OnnxMul(col, row)
        res = OnnxReduceSum(dot, output_names="rec")
        indices_type = np.array([0], dtype=np.int64)
        onx = res.to_onnx(inputs={'col': indices_type, 'row': indices_type},
                          outputs=[('rec', FloatTensorType((None, 1)))])
        return onx

    model_onnx = nmf_to_onnx(W, H)
    sess = InferenceSession(model_onnx.SerializeToString())

    def predict_onnx(sess, row_indices, col_indices):
        res = sess.run(None, {'col': col_indices, 'row': row_indices})
        return res

    onnx_preds = []
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            row_indices = np.array([i], dtype=np.int64)
            col_indices = np.array([j], dtype=np.int64)
            pred = predict_onnx(sess, row_indices, col_indices)[0]
            onnx_preds.append((i, j, pred[0, 0]))
    max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds))
    assert max_diff <= 1e-5
def optimize_NMF_rank_fuv(data, n_samples, plot_output_dir, train_size=0.8,
                          k_min_max=[2, 30]):
    k_range = range(k_min_max[0], k_min_max[1])

    k_fuv_dict = {}
    k_fuv_dict['rep'] = []
    k_fuv_dict['k'] = []
    k_fuv_dict['fuv_vals'] = []
    k_fuv_dict['error_variance'] = []
    k_fuv_dict['non_zero_ratio'] = []
    k_fuv_dict['SS_err'] = []

    group_dict = {}
    group_dict['rep_err_var'] = []
    group_dict['reconstruct_X_test'] = []
    group_dict['X_test_flat'] = []
    group_dict['k'] = []

    n_repeats = 15
    for k in k_range:
        for rep in range(n_repeats):
            # Generate test and train data
            model_indexes = list(range(n_samples))
            train_indexes = np.random.choice(model_indexes,
                                             size=int(n_samples * train_size),
                                             replace=False)
            test_indexs = [i for i in model_indexes if i not in train_indexes]
            X_test = np.copy(data[test_indexs])
            X_train = np.copy(data[train_indexes])
            X_train = abs(X_train)

            # Apply speckled mask (np.bool was removed from NumPy; use bool)
            mask = np.random.choice([0, 1], size=X_train.shape,
                                    p=[0.2, 0.8]).astype(bool)
            r = np.zeros(X_train.shape)
            X_train[mask] = r[mask]

            model = NMF(n_components=k, init='nndsvda', verbose=0,
                        max_iter=100, tol=4e-18, l1_ratio=1).fit(X_train)

            # Transform test set and reconstruct
            W_test = model.transform(X_test)
            reconstruct_X_test = model.inverse_transform(W_test).reshape(1, -1)

            # Flatten elements
            X_test_flat = np.copy(X_test).reshape(1, -1)
            X_test_mean = np.mean(X_test_flat)
            SS_err = np.sum((X_test_flat - reconstruct_X_test)**2)
            SS_tot = np.sum((X_test_flat - X_test_mean)**2)
            fuv = SS_err / SS_tot
            error_variance = np.mean(SS_err)
            sparsity = measure_sparseness(model.components_)

            k_fuv_dict['rep'].append(rep)
            k_fuv_dict['k'].append(k)
            k_fuv_dict['fuv_vals'].append(fuv)
            k_fuv_dict['error_variance'].append(error_variance)
            k_fuv_dict['SS_err'].append(SS_err)
            k_fuv_dict['non_zero_ratio'].append(sparsity)
        print(k)

    df = pd.DataFrame(k_fuv_dict)
    group_dict.pop('X_test_flat', None)
    group_dict.pop('reconstruct_X_test', None)
    group_df = pd.DataFrame(group_dict)
    df.to_csv('fuv_vals.csv')

    width_inches = 200 / 25.4
    height_inches = 150 / 25.4

    fig, ax = plt.subplots(figsize=(width_inches, height_inches))
    sns.lineplot(x='k', y='SS_err', data=df)
    plt.savefig(plot_output_dir + 'NMF_optim_SS_err.pdf', dpi=500,
                bbox_inches='tight')
    plt.close()

    fig, ax = plt.subplots(figsize=(width_inches, height_inches))
    sns.lineplot(x='k', y='error_variance', data=df)
    plt.savefig(plot_output_dir + 'NMF_optim_err_var.pdf', dpi=500,
                bbox_inches='tight')
    plt.close()

    fig, ax = plt.subplots(figsize=(width_inches, height_inches))
    sns.lineplot(x='k', y='non_zero_ratio', data=df)
    plt.savefig(plot_output_dir + 'NMF_sparsity.pdf', dpi=500,
                bbox_inches='tight')
    plt.close()
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import NMF

# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load iris dataset
    iris = load_iris()

    print('Iris dataset shape')
    print(iris.data.shape)

    # Perform a non-negative matrix factorization
    nmf = NMF(n_components=3, init='random', l1_ratio=0.1)
    Xt = nmf.fit_transform(iris.data)

    print('Reconstruction error')
    print(nmf.reconstruction_err_)

    print('Original Iris sample')
    print(iris.data[0])

    print('Compressed Iris sample (via Non-Negative Matrix Factorization)')
    print(Xt[0])

    print('Rebuilt sample')
    # inverse_transform expects a 2-D array, so pass a single-row slice
    print(nmf.inverse_transform(Xt[:1]))
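# For the default beta_loss='frobenius', reconstruction_err_ is the Frobenius
# norm of the residual X - W @ H. A small sketch checking that relationship,
# reusing `iris`, `Xt` and `nmf` from the snippet above:
residual = iris.data - Xt @ nmf.components_
frob = np.linalg.norm(residual, 'fro')

# Should match the fitted model's reported error closely
print(frob, nmf.reconstruction_err_)
assert np.isclose(frob, nmf.reconstruction_err_)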
print(f'NearestNeighbor -- time: {end}')

# implements nearest neighbor using SVD
start = time.time()
SVD = TruncatedSVD(n_components=12, random_state=42)
matrix = SVD.fit_transform(df)
corr = np.corrcoef(matrix)
for index, _ in df[:-1].iterrows():
    res = corr[index].argsort()[-20:][::-1]
    recons_matrix = SVD.inverse_transform(matrix)
    err = mean_squared_error(df, recons_matrix)
end = time.time() - start
print(f'SVD -- err: {err}, time: {end}')

# non-negative matrix factorization; just looking at time and error
start = time.time()
nmf = NMF(n_components=12, init='random', random_state=42)
matrix = nmf.fit_transform(df)
recons_matrix = nmf.inverse_transform(matrix)
err = mean_squared_error(df, recons_matrix)
end = time.time() - start
print(f'NMF -- err: {err}, time: {end}')
class TensorDecomp:
    """
    A class to represent a tensor object with decomposition and
    reconstruction methods.

    Attributes
    ----------
    tensor : numpy.ndarray
        The tensor that is given to the class.
    memSize : int
        The size of the tensor in memory before decomposition.
    decMemSize : int
        The size of the tensor in memory after decomposition.
    decomp_time : float
        The time elapsed to decompose the tensor.
    decomp_type : str
        The __name__ of the provided func argument.
    memChange : float
        The relative change of the memory requirement of the tensor after
        decomposition.

    Methods
    -------
    decompose(func, *args, **kwargs):
        Decomposes the given tensor with the 'func' decomposition and
        computes the size in memory after decomposition.
    reconstruct(self):
        Reconstructs the decomposed tensor.
    error(func, x, y):
        Calculates the error between x and y with the given 'func' error
        handle.
    """

    def __init__(self, tensor):
        self.tensor = tensor
        self.decMemSize = 0
        if isinstance(self.tensor, sparse._coo.core.COO):
            print("Sparse!!!!")
            # self.memSize = tensor.data.nbytes + tensor.row.nbytes + tensor.col.nbytes
            self.memSize = tensor.nbytes
        else:
            self.memSize = tensor.nbytes

    def decompose(self, func, *args, **kwargs):
        """
        Decomposes the tensor with the func argument decomposition type.

        Assigns the objects after decomposition to self.decomposed.
        Computes the decomposition time and assigns it to self.decomp_time.
        Assigns the func argument as the decomposition type to
        self.decomp_type.

        Parameters
        ----------
        self : object of class TensorDecomp type.

        Returns
        -------
        None
        """
        if func.__name__ not in decomp_list:
            print(f'Error! Given decomposition --> {func.__name__}')
            return
        elif func.__name__ == 'svd':
            ts = timer()
            self.decomposed = func(self.tensor)
            te = timer()
            self.decomp_time = te - ts
            self.decomp_type = func.__name__
        elif func.__name__ == 'NMF':
            self.nmf_obj = NMF()
            ts = timer()
            self.decomposed = []
            self.decomposed.append(self.nmf_obj.fit_transform(self.tensor))
            self.decomposed.append(self.nmf_obj.components_)
            te = timer()
            self.decomp_time = te - ts
            self.decomp_type = func.__name__
        elif args:
            ts = timer()
            self.decomposed = func(self.tensor, args[0])
            te = timer()
            self.decomp_type = func.__name__
            self.decomp_time = te - ts
        else:
            ts = timer()
            self.decomposed = func(self.tensor, **kwargs)
            te = timer()
            self.decomp_type = func.__name__
            self.decomp_time = te - ts

        for array in self.decomposed:
            if isinstance(array, np.ndarray):
                self.decMemSize += array.nbytes
        for array in self.decomposed[1]:
            if isinstance(array, np.ndarray):
                self.decMemSize += array.nbytes

        # the tensor size change in memory
        self.memChange = (self.decMemSize - self.memSize) / self.memSize

    def reconstruct(self):
        """
        Reconstructs the decomposed TensorDecomp object.
        Assigns the reconstructed tensor to the self.recons attribute.

        Parameters
        ----------
        self : object of class TensorDecomp type.

        Returns
        -------
        None
        """
        if self.decomp_type == 'svd':
            self.recons = self.decomposed[0] @ (
                np.diag(self.decomposed[1]) @ self.decomposed[2])
        elif self.decomp_type == 'NMF':
            self.recons = self.nmf_obj.inverse_transform(self.decomposed[0])
        elif self.decomp_type == 'tucker':
            from tensorly import tucker_tensor as tt
            self.recons = tt.tucker_to_tensor(self.decomposed)
        elif self.decomp_type == 'parafac':
            from tensorly import cp_tensor as ct
            self.recons = ct.cp_to_tensor(self.decomposed)
        elif self.decomp_type == 'matrix_product_state':
            from tensorly import tt_tensor as tt
            self.recons = tt.tt_to_tensor(self.decomposed)
        elif self.decomp_type == 'clarkson_woodruff_transform':
            self.recons = self.decomposed

    def error(self, func, x, y):
        """
        Computes the error between the original and reconstructed tensor
        with a given error function.

        Parameters
        ----------
        func : function object for error calculation. Example: np.linalg.norm
        x : the original tensor
        y : the reconstructed tensor

        Returns
        -------
        float
            the error between the original and the reconstructed tensor.
        """
        if isinstance(x, sparse._coo.core.COO):
            # convert sparse matrix into dense matrix for error calc
            x = x.todense()
        return (func(x) - func(y)) / func(x)
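# A hedged usage sketch for the NMF path of TensorDecomp. It assumes it runs
# in the class's own module, where np, NMF, timer, sparse and a decomp_list
# containing 'NMF' are already defined; the test matrix is illustrative.
import numpy as np

M = np.abs(np.random.default_rng(0).normal(size=(50, 30)))  # non-negative matrix

td = TensorDecomp(M)
td.decompose(NMF)   # dispatches on func.__name__ == 'NMF'
td.reconstruct()    # recons = nmf_obj.inverse_transform(W)
print(td.decomp_time, td.memChange)
print(td.error(np.linalg.norm, M, td.recons))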
# (the opening `if` of this branch falls outside the excerpt)
    image = imzmlio.normalize(image)
else:
    image = np.uint8(image)

image_shape = image.shape[:-1]
image_norm = fusion.flatten(image, is_spectral=True)
M = image_norm.T
print(M.shape)

if is_nmf:
    nmf = NMF(n_components=n, init='nndsvda', solver='cd', random_state=0)
    fit_nmf = nmf.fit(M)
    eigenvectors = fit_nmf.components_  # H
    eigenvalues = nmf.fit_transform(M)  # W
    inverse_transform = nmf.inverse_transform(eigenvalues)
    eigenvectors_transposed = eigenvalues.T
else:
    # p, n = M.shape
    pca = PCA(n)
    fit_pca = pca.fit(M)
    eigenvectors = fit_pca.components_
    eigenvalues = fit_pca.transform(M)
    inverse_transform = pca.inverse_transform(eigenvalues)
    eigenvectors_transposed = eigenvalues.T

mse = mean_squared_error(M, inverse_transform, multioutput='raw_values')
outlier_indices = [i for i in range(len(mse))]
outlier_indices.sort(key=lambda x: mse[x], reverse=True)
number_outliers = 10
def fit_nmf(train_max, heldout_max=None, vocab=None, k=10,
            alpha_regularization=0.0):
    nmf = NMF(k, alpha=alpha_regularization, verbose=False)
    train_nmf = nmf.fit_transform(train_max)
    if heldout_max is not None:
        heldout_nmf = nmf.transform(heldout_max)
    else:
        heldout_nmf = None

    batch_size = 100
    prop_train_reconst_errs = []
    prop_heldout_reconst_errs = []
    for iteration in range(20):
        train_idxes = np.random.choice(train_nmf.shape[0], batch_size,
                                       replace=True)
        reconst_train_max = nmf.inverse_transform(train_nmf[train_idxes])
        prop_train_reconst_err = (
            np.linalg.norm(reconst_train_max - train_max[train_idxes]) /
            scipy.sparse.linalg.norm(train_max[train_idxes]))
        prop_train_reconst_errs.append(prop_train_reconst_err)
        if heldout_nmf is not None:
            heldout_idxes = np.random.choice(heldout_nmf.shape[0], batch_size,
                                             replace=True)
            reconst_heldout_max = nmf.inverse_transform(
                heldout_nmf[heldout_idxes])
            prop_heldout_reconst_err = (
                np.linalg.norm(reconst_heldout_max - heldout_max[heldout_idxes]) /
                scipy.sparse.linalg.norm(heldout_max[heldout_idxes]))
            prop_heldout_reconst_errs.append(prop_heldout_reconst_err)
        else:
            prop_heldout_reconst_errs.append(-1.0)
    print('Train reconstruction error: {}'.format(
        np.mean(prop_train_reconst_errs)))
    print('Heldout reconstruction error: {}'.format(
        np.mean(prop_heldout_reconst_errs)))

    top_words_per_topic = get_top_words(nmf, vocab, n=20,
                                        subtract_off_mean=False, verbose=False)
    top_words_per_topic_womean = get_top_words(nmf, vocab, n=20,
                                               subtract_off_mean=True,
                                               verbose=False)

    topic_path = os.path.join(
        TOPIC_DIR, 'nmf-k{}-alpha{}.topics.txt'.format(k, alpha_regularization))
    topic_womean_path = os.path.join(
        TOPIC_DIR, 'nmf-k{}-alpha{}.topics_without_mean.txt'.format(
            k, alpha_regularization))
    topic_dist_path = os.path.join(
        TOPIC_DIR, 'nmf-k{}-alpha{}.topic_distribution_per_tweet.txt'.format(
            k, alpha_regularization))
    model_path = os.path.join(
        TOPIC_DIR, 'nmf-k{}-alpha{}.model.pickle'.format(k, alpha_regularization))

    # save top words per topic
    with open(topic_path, 'wt', encoding='utf8') as topic_file:
        for topic_idx, words in enumerate(top_words_per_topic):
            topic_file.write('Topic #{}:'.format(topic_idx))
            for w in words:
                topic_file.write(' ')
                topic_file.write(w)
            topic_file.write('\n')
    with open(topic_womean_path, 'wt', encoding='utf8') as topic_file:
        for topic_idx, words in enumerate(top_words_per_topic_womean):
            topic_file.write('Topic #{}:'.format(topic_idx))
            for w in words:
                topic_file.write(' ')
                topic_file.write(w)
            topic_file.write('\n')

    # save NMF model
    with open(model_path, 'wb') as model_file:
        pickle.dump(nmf, model_file)

    # save topic activation for each tweet, compressed numpy format
    if heldout_nmf is not None:
        all_nmf = np.concatenate((train_nmf, heldout_nmf), axis=0)
    else:
        all_nmf = train_nmf
    np.savez_compressed(topic_dist_path, topics_per_tweet=all_nmf)