def test_dict_learning_online_positivity(transform_algorithm,
                                         positive_code,
                                         positive_dict):
    rng = np.random.RandomState(0)
    n_components = 8

    dico = MiniBatchDictionaryLearning(
        n_components, transform_algorithm=transform_algorithm, random_state=0,
        positive_code=positive_code, positive_dict=positive_dict).fit(X)
    code = dico.transform(X)
    if positive_dict:
        assert_true((dico.components_ >= 0).all())
    else:
        assert_true((dico.components_ < 0).any())
    if positive_code:
        assert_true((code >= 0).all())
    else:
        assert_true((code < 0).any())

    code, dictionary = dict_learning_online(X, n_components=n_components,
                                            alpha=1, random_state=rng,
                                            positive_dict=positive_dict,
                                            positive_code=positive_code)
    if positive_dict:
        assert_true((dictionary >= 0).all())
    else:
        assert_true((dictionary < 0).any())
    if positive_code:
        assert_true((code >= 0).all())
    else:
        assert_true((code < 0).any())
def dictionary_learning_MHOF_online(training_samples_num=400):
    from MHOF_Extraction import MHOF_Extraction
    from MHOF_histogram_block import MHOF_histogram_block
    from sklearn.decomposition import MiniBatchDictionaryLearning
    import numpy as np
    import cv2
    import video
    cam = video.create_capture('Crowd-Activity-All.avi')
    height_block_num = 4
    width_block_num = 5
    bin_num = 16
    ret, prev = cam.read()
    ret, img = cam.read()
    flow_H = MHOF_Extraction(prev, img)
    flow_hist_H = MHOF_histogram_block(flow_H, height_block_num, width_block_num, bin_num)
    flow_hist_H = np.reshape(flow_hist_H, [1, flow_hist_H.size])  # error!!!!
    dico = MiniBatchDictionaryLearning(1, alpha=1, n_iter=500)
    dic = dico.fit(flow_hist_H).components_
    for i in range(training_samples_num):
        ret, img = cam.read()
        flow_H = MHOF_Extraction(prev, img)
        flow_hist_H = MHOF_histogram_block(flow_H, height_block_num, width_block_num, bin_num)
        dico = MiniBatchDictionaryLearning(i + 1, alpha=1, n_iter=500, dict_init=dic)
        dic = dico.fit(flow_hist_H).components_
    return dic
def main(games_path=None):
    if games_path is None:
        games_path = 'specmine/data/go_games/2010-01.pickle.gz'

    with specmine.util.openz(games_path) as games_file:
        games = pickle.load(games_file)

    boards = None  # numpy array of shape n x 9 x 9
    for game in games:
        if boards is None:
            boards = games[game].grids
        else:
            boards = numpy.vstack((boards, games[game].grids))

    print 'boards shape: ', boards.shape
    boards = boards.reshape((boards.shape[0], -1))
    print 'boards reshaped: ', boards.shape

    print 'Learning the dictionary... '
    t0 = time()
    dico = MiniBatchDictionaryLearning(n_atoms=100, alpha=1, n_iter=500)
    V = dico.fit(boards).components_
    dt = time() - t0
    print 'done in %.2fs.' % dt

    #pl.figure(figsize=(4.2, 4))
    for i, comp in enumerate(V[:100]):
        pl.subplot(10, 10, i + 1)
        pl.imshow(comp, cmap=pl.cm.gray_r)  # interpolation='nearest')
        pl.xticks(())
        pl.yticks(())
def scskl_dico_learning(list_pickled_array, n_atoms, maxepoch=5, maxiter=100):
    D = None
    for e in range(maxepoch):
        for a in list_pickled_array:
            data = joblib.load(a)
            dico = MiniBatchDictionaryLearning(n_components=n_atoms, n_iter=maxiter, dict_init=D)
            D = dico.fit(data).components_.astype(np.float32)
    return D
def sklearn_check(img, patch_size, dic_size, T=1000):
    patch_shape = (patch_size, patch_size)
    patches = extract_patches_2d(img, patch_shape)
    patches = patches.reshape(patches.shape[0], -1)
    patches = center(patches)
    dl = MiniBatchDictionaryLearning(dic_size, n_iter=T)
    dl.fit(patches)
    D = dl.components_.T
    return D
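# Illustrative sketch (not from the original sources): one way the dictionary
# returned by sklearn_check above could be used to sparse-code and reconstruct
# new patches. D and patch_size are assumed to come from sklearn_check; the
# centering step used there is omitted here for brevity. sklearn's sparse_encode
# expects atoms as rows, hence the transpose D.T.
import numpy as np
from sklearn.decomposition import sparse_encode
from sklearn.feature_extraction.image import extract_patches_2d


def encode_with_dictionary(img, D, patch_size, n_nonzero=2):
    patches = extract_patches_2d(img, (patch_size, patch_size))
    patches = patches.reshape(patches.shape[0], -1)
    # D has shape (n_features, dic_size); sparse_encode wants (n_atoms, n_features)
    codes = sparse_encode(patches, D.T, algorithm='omp', n_nonzero_coefs=n_nonzero)
    reconstructed = np.dot(codes, D.T)  # back to flattened patch space
    return codes, reconstructed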
def to_sparse(X, dim):
    sparse_dict = MiniBatchDictionaryLearning(dim)
    sparse_dict.fit(X)
    sparse_vectors = sparse_encode(X, sparse_dict.components_)
    for i in sparse_vectors:
        print i
    return sparse_vectors
def create_dictionaries(n_codewords=20): dataset_features = np.load('MSR_Features_hog-hof-skel1360423760.27.dat') hogs = [] hofs = [] skels = [] for n in dataset_features.keys(): hogs += dataset_features[n]['hog'] hofs += dataset_features[n]['hof'] skels += [normalize_skeleton(dataset_features[n]['skel_world'])] ''' Input should be features[n_samples, n_features] ''' hogs = np.vstack(hogs) hofs = np.vstack(hofs) skels = np.vstack(skels) hog_dict = MiniBatchDictionaryLearning(n_codewords, n_jobs=-1, verbose=True, transform_algorithm='lasso_lars') hog_dict.fit(hogs) hof_dict = MiniBatchDictionaryLearning(n_codewords, n_jobs=-1, verbose=True, transform_algorithm='lasso_lars') hof_dict.fit(hofs) skels_dict = MiniBatchDictionaryLearning(n_codewords, n_jobs=-1, verbose=True, transform_algorithm='lasso_lars') skels_dict.fit(skels) feature_dictionaries = {'hog':hog_dict, 'hof':hof_dict, 'skel':skels_dict} with open('MSR_Dictionaries_hog-hof-skel_%f.dat'%time.time(), 'wb') as outfile: pickle.dump(feature_dictionaries, outfile, protocol=pickle.HIGHEST_PROTOCOL)
class BOW_sparsecoding(BOW):

    def codebook(self):
        self.mbdl = MiniBatchDictionaryLearning(self.N_codebook)
        self.mbdl.fit(self.raw_features)

    def bow_feature_extract(self, path):
        des = self.raw_feature_extract(path)
        out = sum(sparse_encode(des, self.mbdl.components_))
        out = np.array([out])
        return out
def buildmodel2():
    """Build a paired glasses / no-glasses model."""
    modelrec = np.load('cut_rec.npy')
    modelglass = np.load('glassline.npy')[:modelrec.shape[0]]
    linkedmodel = np.empty((modelrec.shape[0], modelrec.shape[1] + modelglass.shape[1]), 'f')
    linkedmodel[:, :modelrec.shape[1]] = modelrec
    linkedmodel[:, modelrec.shape[1]:] = modelglass
    # Train
    from sklearn.decomposition import MiniBatchDictionaryLearning
    learning = MiniBatchDictionaryLearning(500, verbose=True)
    learning.fit(linkedmodel)
    import cPickle
    cPickle.dump(learning, file('sparselinked', 'wb'), -1)
def extract_codes(self, X, standardize=False):
    self.standardize = standardize
    self._extract_data_patches(X)
    self.dico = MiniBatchDictionaryLearning(n_components=self.n_components, alpha=1, n_iter=500)
    print "Dictionary learning from data..."
    self.D = self.dico.fit(self.data)
    return self
def fit(self, X, y=None): # compute the codes print 'Extracting patchs...' patchs = [] num = self.patch_num // X.size for x in X: img = imread(str(x[0])) tmp = extract_patches_2d(img, (self.patch_size,self.patch_size), \ max_patches=num, random_state=np.random.RandomState()) patchs.append(tmp) data = np.vstack(patchs) data = data.reshape(data.shape[0], -1) data -= np.mean(data, axis=0) data = data/np.std(data, axis=0) print 'Learning codebook...' if self.method == 'sc': self.dico = MiniBatchDictionaryLearning(n_components=self.codebook_size, \ alpha=1, n_iter=100, batch_size =100, verbose=True) self.dico.fit(data) elif self.method=='km': # self.dico = MiniBatchKMeans(n_clusters=self.codebook_size) pass return self
def learning_sparse_coding(X, components=None):
    """
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.DictionaryLearning.html
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.sparse_encode.html
    """
    if components is None:
        print('Learning the dictionary...')
        t0 = time()
        diclearner = MiniBatchDictionaryLearning(n_components=100, verbose=True)
        components = diclearner.fit(X).components_
        np.savetxt('components_of_convfeat.txt', components)
        dt = time() - t0
        print('done in %.2fs.' % dt)
    codes = sparse_encode(X, components)
    np.savetxt('sparse_codes_of_convfeat.txt', codes)
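# Illustrative sketch (not part of the original snippet): reloading the text
# files written by learning_sparse_coding above and checking the reconstruction
# error of the sparse approximation. The file names simply mirror the ones used
# above; the helper itself is hypothetical.
import numpy as np


def reconstruction_error(feature_matrix):
    components = np.loadtxt('components_of_convfeat.txt')  # (n_atoms, n_features)
    codes = np.loadtxt('sparse_codes_of_convfeat.txt')     # (n_samples, n_atoms)
    X_hat = codes.dot(components)                          # sparse reconstruction
    return np.linalg.norm(feature_matrix - X_hat) / np.linalg.norm(feature_matrix)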
def test_dict_learning_online_partial_fit():
    # this test was not actually passing before!
    raise SkipTest

    n_components = 12
    rng = np.random.RandomState(0)
    V = rng.randn(n_components, n_features)  # random init
    V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
    dico1 = MiniBatchDictionaryLearning(n_components, n_iter=10, batch_size=1,
                                        shuffle=False, dict_init=V,
                                        random_state=0).fit(X)
    dico2 = MiniBatchDictionaryLearning(n_components, n_iter=1, dict_init=V,
                                        random_state=0)
    for ii, sample in enumerate(X):
        dico2.partial_fit(sample, iter_offset=ii * dico2.n_iter)
        # if ii == 1: break

    assert_true(not np.all(sparse_encode(X, dico1.components_, alpha=100) == 0))
    assert_array_equal(dico1.components_, dico2.components_)
def train_sparse_coding(feature_list, patch_list, dict_size=256, transform_alpha=0.5, n_iter=50):
    """
    Train sparse coding with mini batches.
    feature_list: list of features to train on
    patch_list: list of the corresponding result patches
    :return sc_list
    """
    sc_list = []
    i = 0
    for feature, patch in zip(feature_list, patch_list):
        i = i + 1
        '''
        Because of the relative scale of the concatenated values, sparse coding may
        ignore the smaller features; the "x10" scaling below should be replaced by a
        proper feature normalisation method. The stronger the correlation, the more
        useful every vector is, so training needs more time.
        '''
        dico = None
        X = np.concatenate((feature, patch), axis=1)
        if len(X) > 100000:
            np.random.shuffle(X)
            X = X[:90000]
        if len(X) < 5000:
            print "Entering DictionaryLearning mode"
            dico = MiniBatchDictionaryLearning(batch_size=1000, transform_algorithm='lasso_lars',
                                               fit_algorithm='lars', transform_n_nonzero_coefs=5,
                                               n_components=len(X)/50, dict_init=X[:len(X)/50],
                                               n_iter=n_iter, transform_alpha=transform_alpha,
                                               verbose=10, n_jobs=-1)
        else:
            print "Entering MiniBatchDictionaryLearning mode"
            dico = MiniBatchDictionaryLearning(batch_size=1000, transform_algorithm='lasso_lars',
                                               fit_algorithm='lars', transform_n_nonzero_coefs=5,
                                               n_components=len(X)/50, dict_init=X[:len(X)/50],
                                               n_iter=n_iter, transform_alpha=transform_alpha,
                                               verbose=10, n_jobs=-1)
        V = dico.fit(X).components_
        sc_list.append(V)
        file_name = "./tmp_file/_tmp_sc_list_new_clsd_raw_%d.pickle" % (i)
        sc_file = open(file_name, 'wb')
        cPickle.dump(sc_list, sc_file, 1)
        sc_file.close()
    return sc_list
def test_dict_learning_online_partial_fit():
    n_components = 12
    rng = np.random.RandomState(0)
    V = rng.randn(n_components, n_features)  # random init
    V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
    dict1 = MiniBatchDictionaryLearning(n_components, n_iter=10 * len(X),
                                        batch_size=1, alpha=1, shuffle=False,
                                        dict_init=V, random_state=0).fit(X)
    dict2 = MiniBatchDictionaryLearning(n_components, alpha=1, n_iter=1,
                                        dict_init=V, random_state=0)
    for i in range(10):
        for sample in X:
            dict2.partial_fit(sample[np.newaxis, :])

    assert not np.all(sparse_encode(X, dict1.components_, alpha=1) == 0)
    assert_array_almost_equal(dict1.components_, dict2.components_, decimal=2)
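# Illustrative sketch (not from the test suite above): the same partial_fit API
# used for genuinely streaming data, feeding mini-batches as they arrive instead
# of fitting one full matrix. The batch generator and shapes are assumptions for
# the example only.
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning


def stream_dictionary(batch_iterator, n_components=12, random_state=0):
    """batch_iterator yields 2D arrays of shape (batch_size, n_features)."""
    learner = MiniBatchDictionaryLearning(n_components=n_components,
                                          random_state=random_state)
    for batch in batch_iterator:
        learner.partial_fit(batch)  # updates the dictionary in place
    return learner.components_


# Example with synthetic batches:
rng = np.random.RandomState(0)
batches = (rng.randn(32, 64) for _ in range(10))
D = stream_dictionary(batches, n_components=15)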
def extract_codes(self, X, n_components=16, zscore=True, log_amplitude=True, **mbl_args): """Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data inputs: X - spectrogram data (frequency x time) n_components - how many components to extract [16] zscore - whether to zscore the ensemble of 2D patches [True] log_amplitude - whether to apply log(1+X) scaling of spectrogram data [True] **mbl_args - keyword arguments for MiniBatchDictionaryLearning.fit(...) [None] outputs: self.data - 2D patches of input spectrogram self.D.components_ - dictionary of learned 2D atoms for sparse coding """ self._extract_data_patches(X, zscore, log_amplitude) self.n_components = n_components self.dico = MiniBatchDictionaryLearning(n_components=self.n_components, **mbl_args) print "Dictionary learning from data..." self.D = self.dico.fit(self.data)
def __init__(self, hierarchy, depth, patch_size, num_features, num_patches, multiplier): """ * depth - hierarchy level (1, 2, 3, etc.) * patch_size - number of pixels representing side of the square patch. like, 8 (8x8 patches) * num_features - how many components to learn * multiplier - num of subpatches we break patch into (0 for the first level). if 3, patch will contant 3x3 subpatches. """ self.hierarchy = hierarchy self.depth = depth self.basement_size = patch_size self.num_features = num_features self.num_patches = num_patches self.multiplier = multiplier self.learning = MiniBatchDictionaryLearning( n_components=num_features, n_iter=3000, transform_algorithm='lasso_lars', transform_alpha=0.5, n_jobs=2) self.ready = False
def test_dict_learning_online_verbosity(): n_components = 5 # test verbosity from cStringIO import StringIO import sys old_stdout = sys.stdout sys.stdout = StringIO() dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=1) dico.fit(X) dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=2) dico.fit(X) dict_learning_online(X, n_components=n_components, alpha=1, verbose=1) dict_learning_online(X, n_components=n_components, alpha=1, verbose=2) sys.stdout = old_stdout assert_true(dico.components_.shape == (n_components, n_features))
def test_dict_learning_online_verbosity(): n_components = 5 # test verbosity from sklearn.externals.six.moves import cStringIO as StringIO import sys old_stdout = sys.stdout sys.stdout = StringIO() dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=1, random_state=0) dico.fit(X) dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=2, random_state=0) dico.fit(X) dict_learning_online(X, n_components=n_components, alpha=1, verbose=1, random_state=0) dict_learning_online(X, n_components=n_components, alpha=1, verbose=2, random_state=0) sys.stdout = old_stdout assert_true(dico.components_.shape == (n_components, n_features))
def test_dict_learning_online_verbosity(): n_components = 5 # test verbosity from io import StringIO import sys old_stdout = sys.stdout try: sys.stdout = StringIO() dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=1, random_state=0) dico.fit(X) dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=2, random_state=0) dico.fit(X) dict_learning_online(X, n_components=n_components, alpha=1, verbose=1, random_state=0) dict_learning_online(X, n_components=n_components, alpha=1, verbose=2, random_state=0) finally: sys.stdout = old_stdout assert dico.components_.shape == (n_components, n_features)
def imageDenoisingTest01():
    from time import time
    import matplotlib.pyplot as plt
    import numpy as np
    from scipy.misc import lena
    from sklearn.decomposition import MiniBatchDictionaryLearning
    from sklearn.feature_extraction.image import extract_patches_2d
    from sklearn.feature_extraction.image import reconstruct_from_patches_2d

    # Load image and extract patches
    lena = lena() / 256.0
    lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
    lena /= 4.0
    height, width = lena.shape

    # Distort the right half of the image
    print "distorting image"
    distorted = lena.copy()
    distorted[:, height//2:] += 0.075 * np.random.randn(width, height // 2)
    #plt.imshow(distorted[:, :height//2], cmap = plt.cm.gray, interpolation = "nearest")
    #plt.show()

    print "Extracting reference patches"
    # extract patches from the left half of the distorted image
    t0 = time()
    patch_size = (7, 7)
    data = extract_patches_2d(distorted[:, :height//2], patch_size)
    # data is a 30500 x 7 x 7 array
    #print data
    #print len(data)
    #print len(data[0][0])
    #plt.imshow(data[0], cmap = plt.cm.gray, interpolation = "nearest")
    #plt.show()
    #print distorted[:, height//2:].shape  # one half is 256 x 128

    # flatten each patch into a 1-D vector, then normalise
    data = data.reshape(data.shape[0], -1)
    data -= np.mean(data, axis = 0)
    data /= np.std(data, axis = 0)
    print 'done in ' + str(time() - t0)

    # Learn the dictionary from reference patches
    print "Learning the dictionary"
    t0 = time()
    # start learning from the patches: create a new model
    dico = MiniBatchDictionaryLearning(n_components = 100, alpha = 1, n_iter = 5000)
    print data.shape  # data is a 30500 x 49 matrix
    V = dico.fit(data).components_
    print V.shape  # V is a 100 x 49 matrix
    dt = time() - t0
    print "done in %.2fs." % dt

    plt.figure(figsize = (4.2, 4))
    for i, comp in enumerate(V[:100]):
        plt.subplot(10, 10, i + 1)
        plt.imshow(comp.reshape(patch_size), cmap = plt.cm.gray_r, interpolation = "nearest")
        plt.xticks(())
        plt.yticks(())
    plt.suptitle("Dictionary learned from lena patches\n" +
                 "Train time %.1fs on %d patches" % (dt, len(data)), fontsize = 16)
    plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

    def show_with_diff(image, reference, title):
        plt.figure(figsize = (5, 3.3))
        plt.subplot(1, 2, 1)
        plt.title('Image')
        plt.imshow(image, vmin = 0, vmax = 1, cmap = plt.cm.gray, interpolation = "nearest")
        plt.xticks(())
        plt.yticks(())
        plt.subplot(1, 2, 2)
        difference = image - reference
        plt.title("difference (norm: %.2f)" % np.sqrt(np.sum(difference ** 2)))
        plt.imshow(difference, vmin = -0.5, vmax = 0.5, cmap = plt.cm.PuOr, interpolation = "nearest")
        plt.xticks(())
        plt.yticks(())
        plt.suptitle(title, size = 16)
        plt.subplots_adjust(0.02, 0.02, 0.98, 0.79, 0.02, 0.02)

    show_with_diff(distorted, lena, "Distorted Image")
    #plt.show()

    # Extract noisy patches and reconstruct them using the dictionary
    # extract patches from the right half
    print('Extracting noisy patches...')
    t0 = time()
    data = extract_patches_2d(distorted[:, height//2:], patch_size)
    data = data.reshape(data.shape[0], -1)
    intercept = np.mean(data, axis = 0)
    data -= intercept
    print "done in %.2fs. " % (time() - t0)

    transform_algorithms = [('Orthogonal Matching Pursuit\n1 atom', 'omp',
                             {'transform_n_nonzero_coefs': 1}),
                            ('Orthogonal Matching Pursuit\n2 atoms', 'omp',
                             {'transform_n_nonzero_coefs': 2}),
                            ('Least-angle regression\n5 atoms', 'lars',
                             {'transform_n_nonzero_coefs': 5}),
                            ('Thresholding\n alpha = 0.1', 'threshold',
                             {'transform_alpha': 0.1})]

    reconstructions = {}
    for title, transform_algorithm, kwargs in transform_algorithms:
        print title + "..."
        reconstructions[title] = lena.copy()
        t0 = time()
        dico.set_params(transform_algorithm = transform_algorithm, **kwargs)
        code = dico.transform(data)
        # use the previously trained model to obtain the representation coefficients -- code
        patches = np.dot(code, V)

        if transform_algorithm == "threshold":
            patches -= patches.min()
            patches /= patches.max()

        patches += intercept
        patches = patches.reshape(len(data), *patch_size)

        if transform_algorithm == "threshold":
            patches -= patches.min()
            patches /= patches.max()

        reconstructions[title][:, height // 2:] = reconstruct_from_patches_2d(patches, (width, height // 2))
        dt = time() - t0
        print "done in %.2fs." % dt
        show_with_diff(reconstructions[title], lena, title + '(time: %.1fs)' % dt)

    plt.show()
def test_dict_learning_online_overcomplete():
    n_components = 12
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20,
                                       random_state=0).fit(X)
    assert dico.components_.shape == (n_components, n_features)
patches = patchify(img_train_gray, patch_size, step) initial_patch_size = patches.shape patches = patches.reshape(-1, patch_size[0] * patch_size[1]) patches_recto.append(patches) # Change the size of patches patches_recto = np.asarray(patches_recto) patches_recto = patches_recto.reshape(-1, m * m) # Do the normalisation here patches_recto -= np.mean(patches_recto, axis=0) # remove the mean patches_recto /= np.std(patches_recto, axis=0) # normalise each patch dico_recto = MiniBatchDictionaryLearning( n_components=dict_components, alpha=0.7, n_iter=400) #TODO:check with different parameters V_recto = dico_recto.fit(patches_recto).components_ """ # plot the dictionary plt.figure(figsize=(8, 6)) for i, comp in enumerate(V_recto[:100]): plt.subplot(10, 10, i + 1) plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r,interpolation='nearest') plt.xticks(()) plt.yticks(()) plt.suptitle('Recto dictionary learned from patches') plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) """ print('Learning the dictionary for verso images...')
def test_dict_learning_online_estimator_shapes():
    n_components = 5
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, random_state=0)
    dico.fit(X)
    assert_true(dico.components_.shape == (n_components, n_features))
class BoVWFeature(TransformerMixin): """ Extract BoVW Feature Parameters ---------- codebook_size : int the size of codebook, default:1000 method : str codebook's compute method , value: 'sc','km' """ def __init__(self, codebook_size=512, method='sc'): self.codebook_size = codebook_size self.method = method self.patch_num = 40000 self.patch_size = 8 self.sample = 'random' self.feature = 'raw' # raw, surf, hog def fit(self, X, y=None): # compute the codes print 'Extracting patchs...' patchs = [] num = self.patch_num // X.size for x in X: img = imread(str(x[0])) tmp = extract_patches_2d(img, (self.patch_size,self.patch_size), \ max_patches=num, random_state=np.random.RandomState()) patchs.append(tmp) data = np.vstack(patchs) data = data.reshape(data.shape[0], -1) data -= np.mean(data, axis=0) data = data/np.std(data, axis=0) print 'Learning codebook...' if self.method == 'sc': self.dico = MiniBatchDictionaryLearning(n_components=self.codebook_size, \ alpha=1, n_iter=100, batch_size =100, verbose=True) self.dico.fit(data) elif self.method=='km': # self.dico = MiniBatchKMeans(n_clusters=self.codebook_size) pass return self def transform(self, X): """ Parameters ---------- X : {array-like}, shape = [n_samples, 1] Training vectors, where n_samples is the number of samples and 1 is image path. Returns ------- array-like = [n_samples, features] Class labels predicted by each classifier. """ print 'Extracting feature...' # setting the dictionary self.dico.set_params(transform_algorithm='lars') results = [] for sample in X: img = imread(str(sample[0])) tmp = extract_patches_2d(img, (self.patch_size,self.patch_size), \ max_patches=300, random_state=np.random.RandomState()) data = tmp.reshape(tmp.shape[0], -1) data = data-np.mean(data, axis=0) data = data/np.std(data, axis=0) code = self.dico.transform(data) results.append(code.sum(axis=0)) return np.vstack(results) def get_params(self, deep=True): return {"codebook_size": self.codebook_size}
def test_dict_learning_online_verbosity(): # test verbosity for better coverage n_components = 5 from io import StringIO import sys old_stdout = sys.stdout try: sys.stdout = StringIO() # convergence monitoring verbosity dico = MiniBatchDictionaryLearning(n_components, batch_size=4, max_iter=5, verbose=1, tol=0.1, random_state=0) dico.fit(X) dico = MiniBatchDictionaryLearning( n_components, batch_size=4, max_iter=5, verbose=1, max_no_improvement=2, random_state=0, ) dico.fit(X) # higher verbosity level dico = MiniBatchDictionaryLearning(n_components, batch_size=4, max_iter=5, verbose=2, random_state=0) dico.fit(X) # function API verbosity dict_learning_online( X, n_components=n_components, batch_size=4, alpha=1, verbose=1, random_state=0, ) dict_learning_online( X, n_components=n_components, batch_size=4, alpha=1, verbose=2, random_state=0, ) finally: sys.stdout = old_stdout assert dico.components_.shape == (n_components, n_features)
    return(data)

print('extracting patches')
t0 = time()
data = constructPatches(img2, patch_size, False)
t1 = time() - t0
print('extraction time: %.2fs' % t1)

print('building the dictionary and fitting it on the data')
# we need n_components > number of image columns
# initialise a dictionary
# n_components: size of the dictionary
# fit the dictionary on the normalised base image
t0 = time()
dico = MiniBatchDictionaryLearning(n_components=2*img2.shape[1], alpha=1, n_iter=100)
V = dico.fit(data).components_
t1 = time() - t0
print('dictionary fit time: %.fs ' % t1)

# define the transform algorithms (OMP with 1 and 2 atoms, LAR regression with 5 atoms, and others)
transform_algorithms = [('omp5', 'omp', {'transform_n_nonzero_coefs': 5})]

# build several reconstructed images, stored in a dict
def reconstructImages(transform_algorithms):
    reconstructions = {}
    for title, transform_algorithm, kwargs in transform_algorithms:
        reconstructions[title] = img2.copy()
        dico.set_params(transform_algorithm=transform_algorithm, **kwargs)
        code = dico.transform(data)
#print("Patches before reshaping:", patches.shape) patches = patches.reshape(-1, patch_size[0] * patch_size[1]) #print("Patches after reshaping:", patches.shape) patches -= np.mean(patches, axis=0) patches /= np.std(patches, axis=0) print('done in %.2fs.' % (time() - t0)) print(patches.shape) # Learn the dictionary from reference patches print('Learning the dictionary...') t0 = time() dico = MiniBatchDictionaryLearning( n_components=400, alpha=0.5, n_iter=400) #TODO:check with different parameters V = dico.fit(patches).components_ dt = time() - t0 print('done in %.2fs.' % dt) plt.figure(figsize=(4.2, 4)) for i, comp in enumerate(V[:100]): plt.subplot(10, 10, i + 1) plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r, interpolation='nearest') plt.xticks(()) plt.yticks(()) plt.suptitle('Dictionary learned from patches\n' + 'Train time %.1fs on %d patches' % (dt, len(patches)),
def train(image_paths): results = np.zeros((atoms, lowPatchSize[0] * lowPatchSize[1] + highPatchSize[0] * highPatchSize[1])) init_dict = True for image_path in image_paths: # import image try: print(image_path) img = img_tl.importImage(image_path) except IOError: print( "Error, an image could not be found in the images directory. Did you move it while the machine was training?" ) sys.exit() img_low, img = img_tl.halfImageResolutionForTraining( img, lowPatchSize, highPatchSize) img = np.array(img) img_low = np.array(img_low) # for each colour channel in the image """ see if we can just train and make only 1 model.""" for channel in range(img.shape[2]): # convert to patches high_data = img[:, :, channel] low_data = img_low[:, :, channel] high_data = img_tl.convertImageDataToPatches( high_data, highPatchSize, 2) low_data = img_tl.convertImageDataToPatches(low_data, lowPatchSize) high_data_size = high_data.shape[1] low_data_size = low_data.shape[1] # mathematically reduce values to fit algorithm high_data *= 1 / math.sqrt(high_data_size) low_data *= 1 / math.sqrt(low_data_size) # join the high and low res data data = np.concatenate((high_data, low_data), axis=1) # train trainer = None if (init_dict): trainer = MiniBatchDictionaryLearning( n_components=atoms, alpha=lmbda * (1 / high_data_size + 1 / low_data_size), n_iter=iterations, n_jobs=-1, verbose=1) else: trainer = MiniBatchDictionaryLearning( n_components=atoms, alpha=lmbda * (1 / high_data_size + 1 / low_data_size), n_iter=iterations, n_jobs=-1, verbose=1, dict_init=results) model = trainer.fit(data).components_ results = model init_dict = False # save the result resultHigh = results[:, :highPatchSize[0] * highPatchSize[1]] resultLow = results[:, highPatchSize[0] * highPatchSize[1]:] np.save("models/sparseHigh.npy", resultHigh) np.save("models/sparseLow.npy", resultLow) plt.figure(figsize=(4.2, 4)) for i, comp in enumerate(resultHigh[:100]): plt.subplot(10, 10, i + 1) plt.imshow(comp.reshape(highPatchSize), cmap=plt.cm.gray_r, interpolation='nearest') plt.xticks(()) plt.yticks(()) plt.show() for i, comp in enumerate(resultLow[:100]): plt.subplot(10, 10, i + 1) plt.imshow(comp.reshape(lowPatchSize), cmap=plt.cm.gray_r, interpolation='nearest') plt.xticks(()) plt.yticks(()) plt.show()
# So here UV, ||U||_1,1 and sum(||V_k||_2) are verified # instead of comparing directly U and V. assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol) assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol) assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol) # verify an obtained solution is not degenerate assert np.mean(U_64 != 0.0) > 0.05 assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0) @pytest.mark.parametrize( "estimator", [ SparseCoder(X.T), DictionaryLearning(), MiniBatchDictionaryLearning(batch_size=4, max_iter=10), ], ids=lambda x: x.__class__.__name__, ) def test_get_feature_names_out(estimator): """Check feature names for dict learning estimators.""" estimator.fit(X) n_components = X.shape[1] feature_names_out = estimator.get_feature_names_out() estimator_name = estimator.__class__.__name__.lower() assert_array_equal( feature_names_out, [f"{estimator_name}{i}" for i in range(n_components)], )
from sklearn.decomposition import MiniBatchDictionaryLearning, DictionaryLearning from sklearn.svm import SVC from sklearn.externals import joblib from sklearn.preprocessing import normalize f = h5py.File(sys.argv[1],'r') features = [] V = [] X = f['data'] Y = f['label'] X = np.float64(X) print "learn the dictionary" # dico = DictionaryLearning(n_components=512, alpha=1, max_iter=20, verbose = 20) # dico.fit(X) dico = MiniBatchDictionaryLearning(n_components=512, alpha=1, batch_size = 32, n_iter=3000) for i in range(100): dico.partial_fit(X) print "epoch " +str(i) + " done" code = dico.transform(X) error = X - np.dot(code, dico.components_) print "error = ", np.sum(error) joblib.dump(dico, 'model/dico_model_batch_iter' + str(i) + '.pkl') #dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500, verbose = 20) #dico.fit(X) print "learn over" V = dico.components_ # f_out = h5py.File(sys.argv[2],'w') # f_out['dico'] = V # f_out.close() joblib.dump(dico, 'dico_model_batch.pkl')
for a in alpha_range: for n in n_range: if a > n: continue; it = 100; best_c = 100 # previously calculated through cross validation code for logisctic regression ## train_data if not os.path.isfile(fraud_data_path + 'train_data_sparse_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it)): print('## Creating train_data_sparse_a{:d}_c{:d}_it{:d}'.format(a,n,it)) print >> results_file, '## Creating train_data_sparse_a{:d}_c{:d}_it{:d}'.format(a,n,it) train_data_std = preprocessing.scale(train_data.values) train_data = pd.DataFrame(train_data_std, index=train_data.index.values) miniBatch = MiniBatchDictionaryLearning(n_components=n, alpha=a, n_iter=100) dictionary = miniBatch.fit(train_data.values).components_ dictionary_df = pd.DataFrame(dictionary) dictionary_df.to_csv(fraud_data_path + 'train_data_dictionary_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it), index=True) sparseCode = miniBatch.transform(train_data.values) sparseCode_df = pd.DataFrame(sparseCode, index=train_data.index.values) sparseCode_df.to_csv(fraud_data_path + 'train_data_sparse_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it), index=True) denoised = np.dot(sparseCode, dictionary) denoised_df = pd.DataFrame(denoised, index=train_data.index.values) denoised_df.to_csv(fraud_data_path + 'train_data_denoised_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it), index=True) ## test_data if not os.path.isfile(fraud_data_path + 'test_data_sparse_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it)): print('## Creating test_data_denoised_a{:d}_c{:d}_it{:d}'.format(a,n,it)) print >> results_file, '## Creating test_data_denoised_a{:d}_c{:d}_it{:d}'.format(a,n,it) test_data_std = preprocessing.scale(test_data.values)
def test_dict_learning_online_estimator_shapes():
    n_components = 5
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, random_state=0)
    dico.fit(X)
    assert dico.components_.shape == (n_components, n_features)
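# Illustrative sketch (not part of the test file above): the same estimator
# configured with the newer parameter spelling, where `max_iter` (full passes
# over the data) replaces the older `n_iter` used in several snippets in this
# collection. Exact availability depends on the installed scikit-learn version.
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 30)
dico = MiniBatchDictionaryLearning(n_components=5, batch_size=8,
                                   max_iter=10, random_state=0)
codes = dico.fit_transform(X_demo)
assert dico.components_.shape == (5, 30)
assert codes.shape == (200, 5)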
# Extract all reference patches from the left half of the image print('Extracting reference patches...') t0 = time() patch_size = (7, 7) data = extract_patches_2d(distorted[:, :width // 2], patch_size) data = data.reshape(data.shape[0], -1) data -= np.mean(data, axis=0) data /= np.std(data, axis=0) print('done in %.2fs.' % (time() - t0)) # ############################################################################# # Learn the dictionary from reference patches print('Learning the dictionary...') t0 = time() dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500) V = dico.fit(data).components_ dt = time() - t0 print('done in %.2fs.' % dt) plt.figure(figsize=(4.2, 4)) for i, comp in enumerate(V[:100]): plt.subplot(10, 10, i + 1) plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r, interpolation='nearest') plt.xticks(()) plt.yticks(()) plt.suptitle('Dictionary learned from face patches\n' + 'Train time %.1fs on %d patches' % (dt, len(data)), fontsize=16)
class SparseApproxSpectrum(object): def __init__(self, n_components=49, patch_size=(8,8), max_samples=1000000, **kwargs): self.omp = OrthogonalMatchingPursuit() self.n_components = n_components self.patch_size = patch_size self.max_samples = max_samples self.D = None self.data = None self.components = None self.standardize=False def _extract_data_patches(self, X): self.X = X data = extract_patches_2d(X, self.patch_size) data = data.reshape(data.shape[0], -1) if len(data)>self.max_samples: data = np.random.permutation(data)[:self.max_samples] print data.shape if self.standardize: self.mn = np.mean(data, axis=0) self.std = np.std(data, axis=0) data -= self.mn data /= self.std self.data = data def extract_codes(self, X, standardize=False): self.standardize=standardize self._extract_data_patches(X) self.dico = MiniBatchDictionaryLearning(n_components=self.n_components, alpha=1, n_iter=500) print "Dictionary learning from data..." self.D = self.dico.fit(self.data) return self def plot_codes(self, cbar=False, **kwargs): #plt.figure(figsize=(4.2, 4)) N = int(np.ceil(np.sqrt(self.n_components))) kwargs.setdefault('cmap', pl.cm.gray_r) kwargs.setdefault('origin','bottom') kwargs.setdefault('interpolation','nearest') for i, comp in enumerate(self.D.components_): plt.subplot(N, N, i + 1) comp = comp * self.std + self.mn if self.standardize else comp plt.imshow(comp.reshape(self.patch_size), **kwargs) if cbar: plt.colorbar() plt.xticks(()) plt.yticks(()) plt.suptitle('Dictionary learned from spectrum patches\n', fontsize=16) plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23) def extract_audio_dir_codes(self, dir_expr='/home/mkc/exp/FMRI/stimuli/Wav6sRamp/*.wav',**kwargs): flist=glob.glob(dir_expr) self.X = np.vstack([feature_scale(LogFrequencySpectrum(f, nbpo=24, nhop=1024).X,normalize=1).T for f in flist]).T self.D = extract_codes(self.X, **kwargs) self.plot_codes(**kwargs) return self def _get_approximation_coefs(self,data, components): w = np.array([self.omp.fit(components.T, d.T).coef_ for d in data]) return w def reconstruct_spectrum(self, w=None, randomize=False): data = self.data components = self.D.components_ if w is None: self.w = self._get_approximation_coefs(data, components) w = self.w if self.standardize: for comp in components: comp = comp * self.std + self.mn if randomize: components = np.random.permutation(components) recon = np.dot(w, components).reshape(-1,self.patch_size[0],self.patch_size[1]) self.X_hat = reconstruct_from_patches_2d(recon, self.X.shape) return self def reconstruct_individual_spectra(self, w=None, randomize=False, plotting=False, **kwargs): self.reconstruct_spectrum(w,randomize) w, components = self.w, self.D.components_ self.X_hat_l = [] for i in range(len(self.w.T)): r=np.array((np.matrix(w)[:,i]*np.matrix(components)[i,:])).reshape(-1,self.patch_size[0],self.patch_size[1]) self.X_hat_l.append(reconstruct_from_patches_2d(r, self.X.shape)) if plotting: plt.figure() for k in range(self.n_components): plt.subplot(self.n_components**0.5,self.n_components**0.5,k+1) feature_plot(self.X_hat_l[k],nofig=1,**kwargs) return self
ax.set_title("Separation of Observations using " + algoName) #---------------------------------------------------------------------------------------------------- # Mini-batch dictionary learning from sklearn.decomposition import MiniBatchDictionaryLearning n_components = 50 alpha = 1 batch_size = 200 n_iter = 25 random_state = 2018 miniBatchDictLearning = MiniBatchDictionaryLearning( \ n_components=n_components, alpha=alpha, \ batch_size=batch_size, n_iter=n_iter, \ random_state=random_state) miniBatchDictLearning.fit(X_train.loc[:, :10000]) X_train_miniBatchDictLearning = miniBatchDictLearning.fit_transform(X_train) X_train_miniBatchDictLearning = pd.DataFrame( \ data=X_train_miniBatchDictLearning, index=train_index) X_validation_miniBatchDictLearning = \ miniBatchDictLearning.transform(X_validation) X_validation_miniBatchDictLearning = \ pd.DataFrame(data=X_validation_miniBatchDictLearning, \ index=validation_index) scatterPlot(X_train_miniBatchDictLearning, y_train, \ "Mini-batch Dictionary Learning")
def test_minibatch_dict_learning_wrong_params(param, match):
    # Check that errors are raised with a clear error message when wrong values
    # are passed for the parameters of MiniBatchDictionaryLearning
    with pytest.raises(ValueError, match=match):
        MiniBatchDictionaryLearning(**param).fit(X)
def fit(self, X=None, y=None):
    if self.patch_file is None:
        num = self.patch_num // X.size
        data = []
        for item in X:
            img = imread(str(item[0]))
            img = img_as_ubyte(rgb2gray(img))
            #img = self.binary(img)  # binarise
            tmp = extract_patches_2d(img, self.patch_size, max_patches=num,
                                     random_state=np.random.RandomState())
            data.append(tmp)

        data = np.vstack(data)
        data = data.reshape(data.shape[0], -1)
        data = np.asarray(data, 'float32')
    else:
        data = np.load(self.patch_file, 'r+')  # load npy file; note the mode, the data is modified later
        data = np.require(data, dtype=np.float32)

    # Standardization
    #logging.info("Pre-processing : Standardization...")
    #self.standard = StandardScaler()
    #data = self.standard.fit_transform(data)

    # whiten
    #logging.info("Pre-processing : PCA Whiten...")
    #self.pca = RandomizedPCA(copy=True, whiten=True)
    #data = self.pca.fit_transform(data)

    # whiten
    logging.info("Pre-processing : ZCA Whiten...")
    self.zca = ZCA()
    data = self.zca.fit_transform(data)

    # 0-1 scaling (could also be done with the preprocessing module)
    #self.minmax = MinMaxScaler()
    #data = self.minmax.fit_transform(data)

    """k-means
    self.kmeans = MiniBatchKMeans(n_clusters=self.n_components, init='k-means++', \
        max_iter=self.n_iter, batch_size=self.batch_size, verbose=1,\
        tol=0.0, max_no_improvement=100,\
        init_size=None, n_init=3, random_state=np.random.RandomState(0),\
        reassignment_ratio=0.0001)
    logging.info("Sparse coding : Phase 1 - Codebook learning (K-means).")
    self.kmeans.fit(data)

    logging.info("Sparse coding : Phase 2 - Define coding method (omp,lars...).")
    self.coder = SparseCoder(dictionary=self.kmeans.cluster_centers_,
                             transform_n_nonzero_coefs=256,
                             transform_alpha=None,
                             transform_algorithm='lasso_lars',
                             n_jobs=1)
    """

    #'''generic
    logging.info("Sparse coding...")
    self.coder = MiniBatchDictionaryLearning(n_components=self.n_components,
                                             alpha=self.alpha, n_iter=self.n_iter,
                                             batch_size=self.batch_size, verbose=True)
    self.coder.fit(data)
    self.coder.transform_algorithm = 'omp'
    self.coder.transform_alpha = 0.1  # for omp, this is the reconstruction error tolerance
    #'''
    return self
def codebook(self):
    self.mbdl = MiniBatchDictionaryLearning(self.N_codebook)
    self.mbdl.fit(self.raw_features)
class Sparsecode(BaseEstimator, TransformerMixin): def __init__(self, patch_file=None, patch_num=10000, patch_size=(16, 16),\ n_components=384, alpha = 1, n_iter=1000, batch_size=200): self.patch_num = patch_num self.patch_size = patch_size self.patch_file = patch_file self.n_components = n_components self.alpha = alpha #sparsity controlling parameter self.n_iter = n_iter self.batch_size = batch_size def fit(self, X=None, y=None): if self.patch_file is None: num = self.patch_num // X.size data = [] for item in X: img = imread(str(item[0])) img = img_as_ubyte(rgb2gray(img)) #img = self.binary(img) # 二值化 tmp = extract_patches_2d(img, self.patch_size, max_patches = num,\ random_state=np.random.RandomState()) data.append(tmp) data = np.vstack(data) data = data.reshape(data.shape[0], -1) data = np.asarray(data, 'float32') else: data = np.load(self.patch_file,'r+') # load npy file, 注意模式,因为后面需要修改 data = np.require(data, dtype=np.float32) # Standardization #logging.info("Pre-processing : Standardization...") #self.standard = StandardScaler() #data = self.standard.fit_transform(data) # whiten #logging.info("Pre-processing : PCA Whiten...") #self.pca = RandomizedPCA(copy=True, whiten=True) #data = self.pca.fit_transform(data) # whiten logging.info("Pre-processing : ZCA Whiten...") self.zca = ZCA() data = self.zca.fit_transform(data) # 0-1 scaling 都可以用preprocessing模块实现 #self.minmax = MinMaxScaler() #data = self.minmax.fit_transform(data) """k-means self.kmeans = MiniBatchKMeans(n_clusters=self.n_components, init='k-means++', \ max_iter=self.n_iter, batch_size=self.batch_size, verbose=1,\ tol=0.0, max_no_improvement=100,\ init_size=None, n_init=3, random_state=np.random.RandomState(0),\ reassignment_ratio=0.0001) logging.info("Sparse coding : Phase 1 - Codebook learning (K-means).") self.kmeans.fit(data) logging.info("Sparse coding : Phase 2 - Define coding method (omp,lars...).") self.coder = SparseCoder(dictionary=self.kmeans.cluster_centers_, transform_n_nonzero_coefs=256, transform_alpha=None, transform_algorithm='lasso_lars', n_jobs = 1) """ #'''genertic logging.info("Sparse coding...") self.coder = MiniBatchDictionaryLearning(n_components=self.n_components, \ alpha=self.alpha, n_iter=self.n_iter, \ batch_size =self.batch_size, verbose=True) self.coder.fit(data) self.coder.transform_algorithm = 'omp' self.coder.transform_alpha = 0.1 # omp情况下,代表重建的误差 #''' return self def transform(self, X): #whiten #X_whiten = self.pca.transform(X) logging.info("Compute the sparse coding of X.") X = np.require(X, dtype=np.float32) #TODO: 是否一定需要先fit,才能transform #X = self.minmax.fit_transform(X) # -mean/std and whiten #X = self.standard.transform(X) #X = self.pca.transform(X) # ZCA X = self.zca.transform(X) # MiniBatchDictionaryLearning # return self.dico.transform(X_whiten) # k-means # TODO: sparse coder method? problem... return self.coder.transform(X) def get_params(self, deep=True): return {"patch_num": self.patch_num, "patch_size":self.patch_size, "alpha":self.alpha, "n_components":self.n_components, "n_iter":self.n_iter, "batch_size":self.batch_size} def set_params(self, **parameters): for parameter, value in parameters.items(): self.__setattr__(parameter, value) return self
def dictionary_learning_MHOF(flow_hist_H_400):
    from sklearn.decomposition import MiniBatchDictionaryLearning
    dico = MiniBatchDictionaryLearning(n_components=400, alpha=1, n_iter=500)
    dic = dico.fit(flow_hist_H_400).components_
    #coeffs = dico.transform(flow_hist_H_400)
    return dic
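# Illustrative sketch (not from the original MHOF code): calling
# dictionary_learning_MHOF above on synthetic histogram-of-optical-flow style
# features. The shape (1000 samples x 320 bins = 4*5 blocks * 16 bins) is an
# assumption chosen only so the example runs end-to-end; real MHOF histograms
# would come from MHOF_histogram_block.
import numpy as np

rng = np.random.RandomState(0)
fake_hists = np.abs(rng.randn(1000, 4 * 5 * 16))  # stand-in for real MHOF histograms
dictionary = dictionary_learning_MHOF(fake_hists)
print(dictionary.shape)  # (400, 320): one learned atom per row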
# and (column permutated U*, row permutated V*) are also optional # as long as holding UV. # So here UV, ||U||_1,1 and sum(||V_k||_2) are verified # instead of comparing directly U and V. assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol) assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol) assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol) # verify an obtained solution is not degenerate assert np.mean(U_64 != 0.0) > 0.05 assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0) @pytest.mark.parametrize( "estimator", [SparseCoder(X.T), DictionaryLearning(), MiniBatchDictionaryLearning()], ids=lambda x: x.__class__.__name__, ) def test_get_feature_names_out(estimator): """Check feature names for dict learning estimators.""" estimator.fit(X) n_components = X.shape[1] feature_names_out = estimator.get_feature_names_out() estimator_name = estimator.__class__.__name__.lower() assert_array_equal( feature_names_out, [f"{estimator_name}{i}" for i in range(n_components)], )
em.lparams = model_params em.run() dlog.close(True) pprint("Done") # ### Mini-Batch Dictionary Learning # # Alternative, since the EM library gives numerical errors # In[20]: from sklearn.decomposition import MiniBatchDictionaryLearning mbdic = MiniBatchDictionaryLearning(n_components=30,verbose=True) mbdic.fit(patches_flat) # ### Visualize the dictionary atoms # In[21]: V = mbdic.components_ plt.figure(figsize=(15,12)) for i,comp in enumerate(V): plt.subplot(10,10,i+1) plt.imshow(comp.reshape(patchsize).T,origin='lower',interpolation='nearest',aspect='auto',cmap='viridis')
# filter by a class if p.class_num is None: X = X_test Y = Y_test else: idxs = Y_test == p.class_num X = X_test[idxs] Y = Y_test[idxs] X_d = X.reshape(X.shape[0], -1) print(X_d.shape) n_iter = int(1000 / p.batch_size) dico = MiniBatchDictionaryLearning(n_components=p.num_bases, alpha=p.alpha, n_iter=n_iter, n_jobs=1, batch_size=p.batch_size) save_freq = 100 for i in tqdm(range(50000)): V = dico.fit(X_d) if i % save_freq == 0: s = '_alpha=' + str(p.alpha) + '_ncomps=' + str( p.num_bases) + '_class=' + str(p.class_num) fname1 = 'bases/bases_iters=' + str(i) + s + '.npy' np.save(fname1, V.components_) fname2 = 'bases/bases_iters=' + str(i - save_freq) + s + '.npy' viz_weights.plot_weights(V.components_, dset='rgb') fname3 = 'bases_figs/bases_iters=' + str(i - save_freq) + s + '.png' plt.savefig('bases_figs/bases_iters=' + str(i) + s + '.png', dpi=200,
# So here UV, ||U||_1,1 and sum(||V_k||_2) are verified # instead of comparing directly U and V. assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol) assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol) assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol) # verify an obtained solution is not degenerate assert np.mean(U_64 != 0.0) > 0.05 assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0) @pytest.mark.parametrize( "estimator", [ SparseCoder(X.T), DictionaryLearning(), MiniBatchDictionaryLearning(batch_size=4) ], ids=lambda x: x.__class__.__name__, ) def test_get_feature_names_out(estimator): """Check feature names for dict learning estimators.""" estimator.fit(X) n_components = X.shape[1] feature_names_out = estimator.get_feature_names_out() estimator_name = estimator.__class__.__name__.lower() assert_array_equal( feature_names_out, [f"{estimator_name}{i}" for i in range(n_components)], )
def run_dimension_reductions(): global mean for dataset in [Diabetes(), Adult()]: processor = Processor3() processor.latext_start_figure() X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans') pca = PCA(n_components=0.95) pca.fit(X_train) n_components = pca.components_.shape[0] print(f"n_components: {n_components}") whiten = True random_state = 0 dr_models = [ PCA(n_components=n_components, random_state=0), FastICA(n_components=n_components, random_state=0), MiniBatchDictionaryLearning(n_components=n_components, alpha=1, batch_size=200, n_iter=10, random_state=random_state), SparseRandomProjection(random_state=0, n_components=n_components) ] for pca in dr_models: X_train = pd.DataFrame(X_train) y_train = pd.DataFrame(y_train) if isinstance(pca, SparseRandomProjection): X_train_PCA = pca.fit_transform(X_train) X_train_PCA = pd.DataFrame(data=X_train_PCA, index=X_train.index) X_train_PCA_inverse = np.array(X_train_PCA).dot( pca.components_.todense()) X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse, index=X_train.index) scatterPlot(X_train_PCA, y_train, pca.__class__.__name__) elif isinstance(pca, MiniBatchDictionaryLearning): X_train_PCA = pca.fit_transform(X_train) X_train_PCA = pd.DataFrame(data=X_train_PCA, index=X_train.index) X_train_PCA_inverse = np.array(X_train_PCA).dot( pca.components_) + np.array(X_train.mean(axis=0)) X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse, index=X_train.index) scatterPlot(X_train_PCA, y_train, pca.__class__.__name__) else: X_train_PCA = pca.fit_transform(X_train) X_train_PCA = pd.DataFrame(data=X_train_PCA, index=X_train.index) X_train_PCA_inverse = pca.inverse_transform(X_train_PCA) X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse, index=X_train.index) scatterPlot(X_train_PCA, y_train, pca.__class__.__name__) # plt.show() anomalyScoresPCA = anomalyScores(X_train, X_train_PCA_inverse) mean = np.mean(anomalyScoresPCA) print(mean) preds = plotResults(y_train, anomalyScoresPCA, True, pca.__class__.__name__, dataset.__class__.__name__, mean) processor.latex_end_figure( caption=f"{dataset.__class__.__name__} Precision-Recall Curve", fig=f"pr_{dataset.__class__.__name__}")
class Layer(object): def __init__(self, hierarchy, depth, patch_size, num_features, num_patches, multiplier): """ * depth - hierarchy level (1, 2, 3, etc.) * patch_size - number of pixels representing side of the square patch. like, 8 (8x8 patches) * num_features - how many components to learn * multiplier - num of subpatches we break patch into (0 for the first level). if 3, patch will contant 3x3 subpatches. """ self.hierarchy = hierarchy self.depth = depth self.basement_size = patch_size self.num_features = num_features self.num_patches = num_patches self.multiplier = multiplier self.learning = MiniBatchDictionaryLearning( n_components=num_features, n_iter=3000, transform_algorithm='lasso_lars', transform_alpha=0.5, n_jobs=2) self.ready = False def get_data(self, data, max_patches=None): """ Extracts raw data from patches. """ max_patches = max_patches or self.num_patches if isinstance(data, np.ndarray): # one image patches = extract_patches_2d( data, (self.basement_size, self.basement_size), max_patches=max_patches) else: patches = [] # multiple images for i in xrange(max_patches): idx = np.random.randint(len(data)) # selecting random image dx = dy = self.basement_size if data[idx].shape[0] <= dx or data[idx].shape[1] <= dy: continue x = np.random.randint(data[idx].shape[0] - dx) y = np.random.randint(data[idx].shape[1] - dy) patch = data[idx][x: x + dx, y: y + dy] patches.append(patch.reshape(-1)) patches = np.vstack(patches) patches = patches.reshape(patches.shape[0], self.basement_size, self.basement_size) print 'patches', patches.shape patches = preprocessing.scale(patches) return patches def learn(self, data): data = data.reshape(data.shape[0], -1) self.learning.fit(data) self.ready = True @property def output_size(self): return int(np.sqrt(self.num_features)) @property def input_size(self): if self.depth == 0: return self.basement_size else: prev_layer = self.hierarchy.layers[self.depth - 1] r = prev_layer.output_size * self.multiplier return r return self._input_size @property def features(self): return self.learning.components_ # def get_features(self): # # going from up to down # result = [] # layers = self.hierarchy.layers[: self.depth][::-1] # if self.depth == 0: # return self.features # previous_layer = self.hierarchy.layers[self.depth - 1] # for feature in self.features: # multiplier = self.multiplier # feature = feature.reshape(self.multiplier * previous_layer.output_size, # self.multiplier * previous_layer.output_size,) # for other_layer in layers: # expressed_feature = np.empty((multiplier * other_layer.input_size, # multiplier * other_layer.input_size)) # enc_n = other_layer.output_size # n = other_layer.input_size # for dx in range(multiplier): # for dy in range(multiplier): # encoded_subfeature = feature[dx * enc_n: (dx + 1) * enc_n, # dy * enc_n: (dy + 1) * enc_n] # prev_patch = np.dot(encoded_subfeature.reshape(-1), other_layer.features) # expressed_feature[dx * n: (dx + 1) * n, dy * n: (dy + 1) * n] = prev_patch.reshape(n, n) # feature = expressed_feature # multiplier *= other_layer.multiplier # result.append(expressed_feature.reshape(-1)) # result = np.vstack(result) # return result def get_features(self): # going from down to up. 
        # these two methods look essentially the same
        if self.depth == 0:
            return self.features
        layers = self.hierarchy.layers[1: self.depth + 1]  # down --> up
        features = self.hierarchy.layers[0].features  # to express upper feature
        for i, layer in enumerate(layers, start=1):
            previous_layer = self.hierarchy.layers[i - 1]
            expressed_features = []
            for feature in layer.features:
                n = previous_layer.output_size
                m = int(np.sqrt(features.shape[1]))
                feature = feature.reshape((layer.input_size, layer.input_size))
                expressed_feature = np.empty((layer.multiplier * m, layer.multiplier * m))
                for dx in range(layer.multiplier):
                    for dy in range(layer.multiplier):
                        subfeature = feature[dx * n: (dx + 1) * n, dy * n: (dy + 1) * n]
                        # now that's previous_layer's code. replace it with reconstruction
                        expressed_subfeature = np.dot(subfeature.reshape(-1), features)
                        expressed_feature[dx * m: (dx + 1) * m,
                                          dy * m: (dy + 1) * m] = expressed_subfeature.reshape((m, m))
                expressed_features.append(expressed_feature.reshape(-1))
            features = np.vstack(expressed_features)
        return features
# Dimensionality reduction with PCA pca = PCA(n_components=49) # 47 changed to 49 for display purposes pca.fit(patches_l) patches_l = pca.transform(patches_l) ######################################################## ################## DICTIONARY LEARNING ################# ######################################################## # Dictionary learning on low resolution patches print('Learning the low resolution dictionary...') t0 = time.time() #dico = DictionaryLearning(n_components=1000, alpha=0.1, max_iter=40) dico = MiniBatchDictionaryLearning(n_components=1000, alpha=0.1, n_iter=40) # try with less iterations #np.save('C:/Users/Nikolina Mileva/Documents/Sparseland/dico.npy', dico) #V = dico.fit(patches_l).components_ #np.save('C:/Users/Nikolina Mileva/Documents/Sparseland/dictionary_l.npy', V) # Load and visualize the low resolution dictionary #V = np.load('C:/Users/Nikolina Mileva/Documents/Sparseland/dictionary_l.npy') #patch_size = (7,7) # patch size after PCA #plt.figure(figsize=(4.2, 4)) #for i, comp in enumerate(V[:100]): # plt.subplot(10, 10, i + 1) # plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r, # interpolation='nearest') # plt.xticks(()) # plt.yticks(())
targets = target_int # dimensionality scaling #pca_feat = imagenet_features pca = PCA(n_components=np.size(features,1)) pca.fit(imagenet_features) pca_feat = pca.transform(imagenet_features) # Shufflinig ind = range(len(imagenet_targets)) np.random.shuffle(ind) imagenet_targets=imagenet_targets[ind] pca_feat=pca_feat[ind,:] # Dictionary Learning on Source dict_sparse = MiniBatchDictionaryLearning(alpha=1, n_components=sparse_components, verbose=3, batch_size=10, n_iter = 1000) dict_sparse.fit(pca_feat) Ds_0 = dict_sparse.components_ # Dictionary Learning on Target dict_sparse = DictionaryLearning(alpha=1, n_components=sparse_components, max_iter=3, verbose=3) dict_sparse.fit(features) Dt_0 = dict_sparse.components_ coder = SparseCoder(dictionary=Dt_0) Rt_0 = coder.transform(features) # Target Reconstruction Xt_1 = np.mat(Rt_0) * np.mat(Ds_0) dict_sparse = DictionaryLearning(alpha=1, n_components=sparse_components, max_iter=3, verbose=3) dict_sparse.fit(Xt_1) Dt_1 = dict_sparse.components_
ax.set_title("Separation of Observations using " + algoName) #--------------------------------------------------------------------------------------- # Mini-batch dictionary learning from sklearn.decomposition import MiniBatchDictionaryLearning n_components = 28 alpha = 1 batch_size = 200 n_iter = 10 random_state = 2018 miniBatchDictLearning = MiniBatchDictionaryLearning( \ n_components=n_components, alpha=alpha, batch_size=batch_size, \ n_iter=n_iter, random_state=random_state) miniBatchDictLearning.fit(X_train) X_test_miniBatchDictLearning = miniBatchDictLearning.transform(X_test) X_test_miniBatchDictLearning = \ pd.DataFrame(data=X_test_miniBatchDictLearning, index=X_test.index) scatterPlot(X_test_miniBatchDictLearning, y_test, \ "Mini-batch Dictionary Learning") plt.show() # 再構成 X_test_miniBatchDictLearning_inverse = \ np.array(X_test_miniBatchDictLearning). \ dot(miniBatchDictLearning.components_)
# Extract all reference patches from the left half of the image print('Extracting reference patches...') t0 = time() patch_size = (7, 7) data = extract_patches_2d(distorted[:, :height // 2], patch_size) data = data.reshape(data.shape[0], -1) data -= np.mean(data, axis=0) data /= np.std(data, axis=0) print('done in %.2fs.' % (time() - t0)) ############################################################################### # Learn the dictionary from reference patches print('Learning the dictionary...') t0 = time() dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500) V = dico.fit(data).components_ dt = time() - t0 print('done in %.2fs.' % dt) plt.figure(figsize=(4.2, 4)) for i, comp in enumerate(V[:100]): plt.subplot(10, 10, i + 1) plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r, interpolation='nearest') plt.xticks(()) plt.yticks(()) plt.suptitle('Dictionary learned from Lena patches\n' + 'Train time %.1fs on %d patches' % (dt, len(data)), fontsize=16) plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
def learn_dictionary_and_encode(data, n_atoms=20, alpha=0.5, n_iter=200, random_seed=42, n_jobs=1, fit_algorithm='cd', transform_algorithm='lasso_cd'): r""" Will learn a dictionary for the data (row-wise) and encode it, with the specified parameters. Returns the dictionary, components as a dataframe, and encoded data. By default, allows 20 words with alpha of 0.5. More info at : https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchDictionaryLearning.html >>> from pygtftk.stats.intersect.modl.dict_learning import test_data_for_modl >>> from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode >>> import numpy as np >>> np.random.seed(42) >>> flags_matrix = test_data_for_modl(nflags = 1000, number_of_sets = 6) >>> import time >>> start_time = time.time() >>> U_df, V_df, error = learn_dictionary_and_encode(flags_matrix, n_atoms = 20, alpha = 0.5) >>> stop_time = time.time() """ # TODO: Many operations here, such as recreating an object of a pandas # dataframe, might not be necessary ? data = np.array(data) # Force cast as array # Cannot ask for more rows than there are unique lines n_atoms = min(n_atoms, len(np.unique(data, axis=0))) dico = MiniBatchDictionaryLearning(n_components=n_atoms, n_iter=n_iter, alpha=alpha, fit_algorithm=fit_algorithm, transform_algorithm=transform_algorithm, transform_alpha=alpha, positive_dict=True, positive_code=True, random_state=random_seed, n_jobs=n_jobs) # NOTE fit_algorithm is used during the learning and transform_algorithm # transforms the data once the estimator has been fitted. # We are using coordinate descent (CD) as LARS has troubles with correlated # features. Also because we want to be able to enforce positivity in the # dictionary and the code for interpretability. dico.fit(data) # Fit the data # Get components (the dictionary). # NOTE: Use a try-except for future proofing, as sklearn (v 0.24) seems to # deprecate 'components_' across the board and replaced it with 'dictionary' try: V = dico.components_ except: V = dico.dictionary V_df = pd.DataFrame(V) # If the alpha is inadapted, the learned dictionary may have failed to converge # and contain NaNs. If that happens, return it now and bypass the next steps # NOTE LARS is less vulnerable to it, use it as a fallback before calling it quits. if np.isnan(V).any(): message( "Learned dictionary by Coordinate descent contained NaNs. Alpha may have been indadapted. Defaulting to LARS.", type='DEBUG') dico = MiniBatchDictionaryLearning(n_components=n_atoms, n_iter=n_iter, alpha=alpha, transform_alpha=alpha, random_state=random_seed, n_jobs=n_jobs, fit_algorithm='lars', transform_algorithm='lasso_lars') dico.fit(data) try: V = dico.components_ except: V = dico.dictionary V_df = pd.DataFrame(V) # Only abort if it still does not work if np.isnan(V).any(): message("Fallback still contains NaNs. Aborting.", type='DEBUG') return None, V_df, None # Return "U", V, "error" # Re-encode the data with this dictionary encoded_data = dico.transform(data) ed_df = pd.DataFrame(encoded_data) # Compute associated normalized L2 loss for reference reconstructed_features = np.matmul(encoded_data, V) error = np.sum((reconstructed_features - data)**2) / np.sum(data**2) return ed_df, V_df, error # Return U, V, error