def test_dict_learning_online_positivity(transform_algorithm,
                                         positive_code,
                                         positive_dict):
    rng = np.random.RandomState(0)
    n_components = 8

    dico = MiniBatchDictionaryLearning(
        n_components, transform_algorithm=transform_algorithm, random_state=0,
        positive_code=positive_code, positive_dict=positive_dict).fit(X)
    code = dico.transform(X)
    if positive_dict:
        assert_true((dico.components_ >= 0).all())
    else:
        assert_true((dico.components_ < 0).any())
    if positive_code:
        assert_true((code >= 0).all())
    else:
        assert_true((code < 0).any())

    code, dictionary = dict_learning_online(X, n_components=n_components,
                                            alpha=1, random_state=rng,
                                            positive_dict=positive_dict,
                                            positive_code=positive_code)
    if positive_dict:
        assert_true((dictionary >= 0).all())
    else:
        assert_true((dictionary < 0).any())
    if positive_code:
        assert_true((code >= 0).all())
    else:
        assert_true((code < 0).any())
def dictionary_learning_MHOF_online(training_samples_num=400):
    from MHOF_Extraction import MHOF_Extraction
    from MHOF_histogram_block import MHOF_histogram_block
    from sklearn.decomposition import MiniBatchDictionaryLearning
    import numpy as np
    import cv2
    import video
    cam=video.create_capture('Crowd-Activity-All.avi')
    height_block_num=4
    width_block_num=5
    bin_num=16
    ret,prev=cam.read()
    ret,img=cam.read()
    flow_H=MHOF_Extraction(prev,img)
    flow_hist_H=MHOF_histogram_block(flow_H,height_block_num,width_block_num,bin_num)
    flow_hist_H=np.reshape(flow_hist_H,[1,flow_hist_H.size])
    # FIXME: fits a 1-atom dictionary on a single histogram sample
    dico=MiniBatchDictionaryLearning(1,alpha=1,n_iter=500)
    dic=dico.fit(flow_hist_H).components_
    for i in range(training_samples_num):
        ret,img=cam.read()
        flow_H=MHOF_Extraction(prev,img)
        flow_hist_H=MHOF_histogram_block(flow_H,height_block_num,width_block_num,bin_num)
        flow_hist_H=np.reshape(flow_hist_H,[1,flow_hist_H.size])
        dico=MiniBatchDictionaryLearning(i+1,alpha=1,n_iter=500,dict_init=dic)
        dic=dico.fit(flow_hist_H).components_
    return dic
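
# The loop above grows the dictionary by one atom per frame and refits it from
# scratch on a single histogram. A minimal sketch, assuming the flattened
# histograms are simply streamed one frame at a time (synthetic data stands in
# for the MHOF histograms), of the same online update with a fixed-size
# dictionary and partial_fit:
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning

rng = np.random.RandomState(0)
histograms = rng.rand(400, 4 * 5 * 16)             # 400 frames of flattened MHOF histograms
online_dico = MiniBatchDictionaryLearning(n_components=20, alpha=1, random_state=0)
for frame in histograms:
    online_dico.partial_fit(frame[np.newaxis, :])  # one-sample online update
learned_dict = online_dico.components_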

        
def main(games_path = None):
    
    if games_path is None:
        games_path = 'specmine/data/go_games/2010-01.pickle.gz'

    with specmine.util.openz(games_path) as games_file:
        games = pickle.load(games_file)

    boards = None # numpy array nx9x9 
    for game in games:
        if boards is None:
            boards = games[game].grids
        else:
            boards = numpy.vstack((boards,games[game].grids))

    print 'boards shape: ', boards.shape

    boards = boards.reshape((boards.shape[0],-1))

    print 'boards reshaped: ', boards.shape

    print 'Learning the dictionary... '
    t0 = time()
    dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500)
    V = dico.fit(boards).components_
    dt = time() - t0
    print 'done in %.2fs.' % dt

    #pl.figure(figsize=(4.2, 4))
    for i, comp in enumerate(V[:100]):
        pl.subplot(10, 10, i + 1)
        pl.imshow(comp.reshape((9, 9)), cmap=pl.cm.gray_r)  # interpolation='nearest'
        pl.xticks(())
        pl.yticks(())
Example #4
def scskl_dico_learning(list_pickled_array,n_atoms,maxepoch=5,maxiter=100):
  D = None
  for e in range(maxepoch):
    for a in list_pickled_array:
      data = joblib.load(a)
      dico = MiniBatchDictionaryLearning(n_components=n_atoms, n_iter=maxiter, dict_init=D)
      D = dico.fit(data).components_.astype(np.float32)
  return D      
Example #5
def sklearn_check(img, patch_size, dic_size, T=1000):
    patch_shape = (patch_size, patch_size)
    patches = extract_patches_2d(img, patch_shape)
    patches = patches.reshape(patches.shape[0], -1)
    patches = center(patches)
    dl = MiniBatchDictionaryLearning(dic_size, n_iter=T)
    dl.fit(patches)
    D = dl.components_.T
    return D
def to_sparse(X,dim):

	sparse_dict = MiniBatchDictionaryLearning(dim)
	sparse_dict.fit(X)
	sparse_vectors = sparse_encode(X, sparse_dict.components_)

	for i in sparse_vectors:
		print i

	return sparse_vectors
def create_dictionaries(n_codewords=20):
	dataset_features = np.load('MSR_Features_hog-hof-skel1360423760.27.dat')
	hogs = []
	hofs = []
	skels = []
	for n in dataset_features.keys():
		hogs +=	dataset_features[n]['hog']
		hofs +=	dataset_features[n]['hof']
		skels += [normalize_skeleton(dataset_features[n]['skel_world'])]

	''' Input should be features[n_samples, n_features] '''
	hogs = np.vstack(hogs)
	hofs = np.vstack(hofs)
	skels = np.vstack(skels)

	hog_dict = MiniBatchDictionaryLearning(n_codewords, n_jobs=-1, verbose=True, transform_algorithm='lasso_lars')
	hog_dict.fit(hogs)
	hof_dict = MiniBatchDictionaryLearning(n_codewords, n_jobs=-1, verbose=True, transform_algorithm='lasso_lars')
	hof_dict.fit(hofs)
	skels_dict = MiniBatchDictionaryLearning(n_codewords, n_jobs=-1, verbose=True, transform_algorithm='lasso_lars')
	skels_dict.fit(skels)

	feature_dictionaries = {'hog':hog_dict, 'hof':hof_dict, 'skel':skels_dict}

	with open('MSR_Dictionaries_hog-hof-skel_%f.dat'%time.time(), 'wb') as outfile:
	    pickle.dump(feature_dictionaries, outfile, protocol=pickle.HIGHEST_PROTOCOL)
Example #8
class BOW_sparsecoding(BOW):

	def codebook(self):
		self.mbdl =  MiniBatchDictionaryLearning(self.N_codebook)
		self.mbdl.fit(self.raw_features)
		

	def bow_feature_extract(self, path):
		des = self.raw_feature_extract(path)
		out = sum(sparse_encode(des, self.mbdl.components_))
		out = np.array([out])
		return out
Example #9
def buildmodel2():
    "生成有眼镜-无眼镜pair模型"
    modelrec = np.load('cut_rec.npy')
    modelglass = np.load('glassline.npy')[:modelrec.shape[0]]

    linkedmodel = np.empty((modelrec.shape[0],modelrec.shape[1]+modelglass.shape[1]),'f')
    linkedmodel[:,:modelrec.shape[1]]=modelrec
    linkedmodel[:,modelrec.shape[1]:]=modelglass

    #Train
    from sklearn.decomposition import MiniBatchDictionaryLearning
    learning = MiniBatchDictionaryLearning(500,verbose=True)
    learning.fit(linkedmodel)
    import cPickle
    cPickle.dump(learning,file('sparselinked','wb'),-1)
Example #10
 def extract_codes(self, X, standardize=False):
     self.standardize=standardize
     self._extract_data_patches(X)
     self.dico = MiniBatchDictionaryLearning(n_components=self.n_components, alpha=1, n_iter=500)
     print "Dictionary learning from data..."
     self.D = self.dico.fit(self.data)
     return self
Example #11
 def fit(self, X, y=None):
     # compute the codes
     print 'Extracting patches...'
     patchs = []
     num = self.patch_num // X.size
     for x in X:
         img = imread(str(x[0]))
         tmp = extract_patches_2d(img, (self.patch_size,self.patch_size), \
                                  max_patches=num, random_state=np.random.RandomState())
         patchs.append(tmp)
     data = np.vstack(patchs)
     data = data.reshape(data.shape[0], -1)
     
     data -= np.mean(data, axis=0)
     data = data/np.std(data, axis=0)
     
     print 'Learning codebook...'
     if self.method == 'sc':
         self.dico = MiniBatchDictionaryLearning(n_components=self.codebook_size, \
                                            alpha=1, n_iter=100, batch_size =100, verbose=True)
         self.dico.fit(data)
     elif self.method=='km':
         # self.dico = MiniBatchKMeans(n_clusters=self.codebook_size)
         pass
     
     return self
def learning_sparse_coding(X, components=None):
    """
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.DictionaryLearning.html
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.sparse_encode.html
    """
    if components is None:
        print('Learning the dictionary...')
        t0 = time()
        diclearner = MiniBatchDictionaryLearning(n_components=100, verbose=True)
        components = diclearner.fit(X).components_
        np.savetxt('components_of_convfeat.txt', components)
        dt = time() - t0
        print('done in %.2fs.' % dt)

    codes = sparse_encode(X, components)
    np.savetxt('sparse_codes_of_convfeat.txt', codes)
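
# For reference, a minimal self-contained sketch of the same learn-then-encode
# round trip on synthetic data (the array shapes here are illustrative
# assumptions, not the convfeat data used above); the product of the codes and
# the components approximates the input:
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning, sparse_encode

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 64)
components_demo = MiniBatchDictionaryLearning(n_components=32, alpha=1,
                                              random_state=0).fit(X_demo).components_
codes_demo = sparse_encode(X_demo, components_demo)
X_demo_hat = np.dot(codes_demo, components_demo)   # approximate reconstruction of X_demo
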
def test_dict_learning_online_partial_fit():
    # this test was not actually passing before!
    raise SkipTest
    n_components = 12
    rng = np.random.RandomState(0)
    V = rng.randn(n_components, n_features)  # random init
    V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
    dico1 = MiniBatchDictionaryLearning(n_components, n_iter=10, batch_size=1,
                                        shuffle=False, dict_init=V,
                                        random_state=0).fit(X)
    dico2 = MiniBatchDictionaryLearning(n_components, n_iter=1, dict_init=V,
                                        random_state=0)
    for ii, sample in enumerate(X):
        dico2.partial_fit(sample, iter_offset=ii * dico2.n_iter)
        # if ii == 1: break
    assert_true(not np.all(sparse_encode(X, dico1.components_, alpha=100) ==
                           0))
    assert_array_equal(dico1.components_, dico2.components_)
def train_sparse_coding(feature_list, patch_list, dict_size=256, transform_alpha=0.5, n_iter=50):
    """
    Train sparse coding with mini batches.
    #feature_list: list of features to train on
    #patch_list: list of target patches

    :return sc_list
    """
    sc_list = []
    i = 0
    for feature, patch in zip(feature_list, patch_list):
        i = i + 1
        '''
        Because the concatenated values differ in scale, sparse coding may ignore the
        smaller features; the x10 scaling below should be replaced by a proper feature
        normalization method.
        The stronger the correlation, the more useful every vector is, so training
        needs to run for longer.
        '''
        dico = None
        X = np.concatenate((feature, patch), axis=1)

        if len(X) > 100000:
            np.random.shuffle(X)
            X = X[:90000]

        if len(X) < 5000:
            print "进入DictionaryLearning状态"
            dico = MiniBatchDictionaryLearning(batch_size=1000, transform_algorithm='lasso_lars', fit_algorithm='lars',
                                               transform_n_nonzero_coefs=5, n_components=len(X)/50,
                                               dict_init=X[:len(X)/50],
                                               n_iter=n_iter, transform_alpha=transform_alpha, verbose=10, n_jobs=-1)
        else:
            print "进入MiniBatchDictionaryLearning状态"
            dico = MiniBatchDictionaryLearning(batch_size=1000, transform_algorithm='lasso_lars', fit_algorithm='lars',
                                               transform_n_nonzero_coefs=5, n_components=len(X)/50,
                                               dict_init=X[:len(X)/50],
                                               n_iter=n_iter, transform_alpha=transform_alpha, verbose=10, n_jobs=-1)
        V = dico.fit(X).components_
        sc_list.append(V)

        file_name = "./tmp_file/_tmp_sc_list_new_clsd_raw_%d.pickle" % (i)
        sc_file = open(file_name, 'wb')
        cPickle.dump(sc_list, sc_file, 1)
        sc_file.close()

    return sc_list
def test_dict_learning_online_partial_fit():
    n_components = 12
    rng = np.random.RandomState(0)
    V = rng.randn(n_components, n_features)  # random init
    V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
    dict1 = MiniBatchDictionaryLearning(n_components, n_iter=10 * len(X),
                                        batch_size=1,
                                        alpha=1, shuffle=False, dict_init=V,
                                        random_state=0).fit(X)
    dict2 = MiniBatchDictionaryLearning(n_components, alpha=1,
                                        n_iter=1, dict_init=V,
                                        random_state=0)
    for i in range(10):
        for sample in X:
            dict2.partial_fit(sample[np.newaxis, :])

    assert not np.all(sparse_encode(X, dict1.components_, alpha=1) == 0)
    assert_array_almost_equal(dict1.components_, dict2.components_,
                              decimal=2)
 def extract_codes(self, X, n_components=16, zscore=True, log_amplitude=True, **mbl_args):
     """Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data
     inputs:
         X - spectrogram data (frequency x time)
         n_components - how many components to extract [16]
         zscore - whether to zscore the ensemble of 2D patches [True]
         log_amplitude - whether to apply log(1+X) scaling of spectrogram data [True]
         **mbl_args - keyword arguments for MiniBatchDictionaryLearning.fit(...) [None]
     outputs:
         self.data - 2D patches of input spectrogram
         self.D.components_ - dictionary of learned 2D atoms for sparse coding
     """
     self._extract_data_patches(X, zscore, log_amplitude)
     self.n_components = n_components
     self.dico = MiniBatchDictionaryLearning(n_components=self.n_components, **mbl_args)
     print "Dictionary learning from data..."
     self.D = self.dico.fit(self.data)
Example #17
 def __init__(self, hierarchy, depth, patch_size, num_features, num_patches, multiplier):
     """
      * depth - hierarchy level (1, 2, 3, etc.)
      * patch_size - number of pixels representing side of the square patch.
        like, 8 (8x8 patches)
      * num_features - how many components to learn
      * multiplier - num of subpatches we break patch into
        (0 for the first level). if 3, patch will contain 3x3 subpatches.
     """
     self.hierarchy = hierarchy
     self.depth = depth
     self.basement_size = patch_size
     self.num_features = num_features
     self.num_patches = num_patches
     self.multiplier = multiplier
     self.learning = MiniBatchDictionaryLearning(
         n_components=num_features, n_iter=3000, transform_algorithm='lasso_lars', transform_alpha=0.5, n_jobs=2)
     self.ready = False
def test_dict_learning_online_verbosity():
    n_components = 5
    # test verbosity
    from cStringIO import StringIO
    import sys
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=1)
    dico.fit(X)
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=2)
    dico.fit(X)
    dict_learning_online(X, n_components=n_components, alpha=1, verbose=1)
    dict_learning_online(X, n_components=n_components, alpha=1, verbose=2)
    sys.stdout = old_stdout
    assert_true(dico.components_.shape == (n_components, n_features))
def test_dict_learning_online_verbosity():
    n_components = 5
    # test verbosity
    from sklearn.externals.six.moves import cStringIO as StringIO
    import sys
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=1,
                                       random_state=0)
    dico.fit(X)
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=2,
                                       random_state=0)
    dico.fit(X)
    dict_learning_online(X, n_components=n_components, alpha=1, verbose=1,
                         random_state=0)
    dict_learning_online(X, n_components=n_components, alpha=1, verbose=2,
                         random_state=0)
    sys.stdout = old_stdout
    assert_true(dico.components_.shape == (n_components, n_features))
def test_dict_learning_online_verbosity():
    n_components = 5
    # test verbosity
    from io import StringIO
    import sys

    old_stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=1,
                                           random_state=0)
        dico.fit(X)
        dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=2,
                                           random_state=0)
        dico.fit(X)
        dict_learning_online(X, n_components=n_components, alpha=1, verbose=1,
                             random_state=0)
        dict_learning_online(X, n_components=n_components, alpha=1, verbose=2,
                             random_state=0)
    finally:
        sys.stdout = old_stdout

    assert dico.components_.shape == (n_components, n_features)
def imageDenoisingTest01():
	from time import time
	import matplotlib.pyplot as plt
	import numpy as np

	from scipy.misc import lena

	from sklearn.decomposition import MiniBatchDictionaryLearning
	from sklearn.feature_extraction.image import extract_patches_2d
	from sklearn.feature_extraction.image import reconstruct_from_patches_2d

	#Load image and extract patches
	lena = lena() / 256.0




	lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
	lena /= 4.0

	height, width = lena.shape

	#Distort the right half of the image
	print "distorting image"

	distorted = lena.copy()
	distorted[:, height//2:] += 0.075 * np.random.randn(width, height // 2)

	#plt.imshow(distorted[:, :height//2], cmap = plt.cm.gray, interpolation = "nearest")
	#plt.show()

	print "Extacting reference patches"
	#这里是从distorted的左半边抽取patches
	t0 = time()
	patch_size = (7, 7)
	data = extract_patches_2d(distorted[:, :height//2], patch_size)

	# data is a 30500 x 7 x 7 array
	#print data
	#print len(data)
	#print len(data[0][0])

	#plt.imshow(data[0], cmap = plt.cm.gray, interpolation = "nearest")
	#plt.show()

	#print distorted[:, height//2:].shape  # one half is 256 x 128




	# flatten each patch into a 1-D vector, then normalize
	data = data.reshape(data.shape[0], -1)
	data -= np.mean(data, axis = 0)
	data /= np.std(data, axis = 0)

	print 'done in ' + str(time() - t0)


	# Learn the dictionary from reference patches
	print "Learning the dictionary"
	t0 = time()
	# this step learns the dictionary from the patches
	# create the model
	dico = MiniBatchDictionaryLearning(n_components = 100, alpha = 1, n_iter = 5000)

	print data.shape  # data is a 30500 x 49 matrix
	V = dico.fit(data).components_

	print V.shape  # V is a 100 x 49 matrix
	dt = time() - t0

	print "done in %.2fs." % dt

	plt.figure(figsize = (4.2, 4))
	for i, comp in enumerate(V[:100]):
		plt.subplot(10, 10, i + 1)
		plt.imshow(comp.reshape(patch_size), cmap = plt.cm.gray_r, interpolation = "nearest")
		plt.xticks(())
		plt.yticks(())

	plt.suptitle("Dictionary learned from lena patches\n" + "Train time %.1fs on %d patches" % (dt, len(data)), fontsize = 16)

	plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

	def show_with_diff(image, reference, title):
		plt.figure(figsize = (5, 3.3))
		plt.subplot(1, 2, 1)
		plt.title('Image')
		plt.imshow(image, vmin = 0, vmax = 1, cmap = plt.cm.gray, interpolation = "nearest")

		plt.xticks(())
		plt.yticks(())
		plt.subplot(1,2,2)

		difference = image - reference

		plt.title("difference (norm: %.2f)" % np.sqrt(np.sum(difference ** 2)))

		plt.imshow(difference, vmin = -0.5, vmax = 0.5, cmap = plt.cm.PuOr, interpolation = "nearest")
		plt.xticks(())
		plt.yticks(())
		plt.suptitle(title, size = 16)

		plt.subplots_adjust(0.02, 0.02, 0.98, 0.79, 0.02, 0.02)


	show_with_diff(distorted, lena, "Distorted Image")




	#plt.show()

	#Extract noisy patches and reconstruct them using the dictionary
	# extract patches from the right half
	print('Extracting noisy patches...')
	t0 = time()
	data = extract_patches_2d(distorted[:, height//2:], patch_size)
	data = data.reshape(data.shape[0], -1)
	intercept = np.mean(data, axis = 0)
	data -= intercept

	print "done in %.2fs. " % (time() - t0)

	transform_algorithms = [('Orthogonal Matching Pursuit\n1 atom', 'omp',
							{'transform_n_nonzero_coefs': 1}),
							('Orthogonal Matching Pursuit\n2 atoms', 'omp',
							{'transform_n_nonzero_coefs': 2}),
							('Least-angle regression\n5 atoms', 'lars',
							{'transform_n_nonzero_coefs': 5}),
							('Thresholding\n alpha = 0.1', 'threshold',
							{'transform_alpha': 0.1})]

	reconstructions = {}
	for title, transform_algorithm, kwargs in transform_algorithms:
		print title + "..."
		reconstructions[title] = lena.copy()
		t0 = time()
		dico.set_params(transform_algorithm = transform_algorithm, **kwargs)
		code = dico.transform(data)  # use the trained model to obtain the representation coefficients (the code)
		patches = np.dot(code, V)

		if transform_algorithm == "threshold":
			patches -= patches.min()
			patches /= patches.max()

		patches += intercept
		patches = patches.reshape(len(data), *patch_size)

		if transform_algorithm == "threshold":
			patches -= patches.min()
			patches /= patches.max()

		reconstructions[title][:, height // 2:] = reconstruct_from_patches_2d(patches, (width, height // 2))
		dt = time() - t0
		print "done in %.2fs." % dt
		show_with_diff(reconstructions[title], lena, title + '(time: %.1fs)' % dt)

	plt.show()
Example #22
def test_dict_learning_online_overcomplete():
    n_components = 12
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20,
                                       random_state=0).fit(X)
    assert dico.components_.shape == (n_components, n_features)
    patches = patchify(img_train_gray, patch_size, step)
    initial_patch_size = patches.shape
    patches = patches.reshape(-1, patch_size[0] * patch_size[1])

    patches_recto.append(patches)

# Change the size of patches
patches_recto = np.asarray(patches_recto)
patches_recto = patches_recto.reshape(-1, m * m)
# Do the normalisation here
patches_recto -= np.mean(patches_recto, axis=0)  # remove the mean
patches_recto /= np.std(patches_recto, axis=0)  # normalise each patch

dico_recto = MiniBatchDictionaryLearning(
    n_components=dict_components, alpha=0.7,
    n_iter=400)  #TODO:check with different parameters
V_recto = dico_recto.fit(patches_recto).components_
"""
# plot the dictionary
plt.figure(figsize=(8, 6))
for i, comp in enumerate(V_recto[:100]):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r,interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
plt.suptitle('Recto dictionary learned from patches')
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
"""

print('Learning the dictionary for verso images...')
def test_dict_learning_online_estimator_shapes():
    n_components = 5
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, random_state=0)
    dico.fit(X)
    assert_true(dico.components_.shape == (n_components, n_features))
Example #25
class BoVWFeature(TransformerMixin):
    """ 
    Extract BoVW Feature
        
    Parameters
    ----------
    codebook_size : int
      the size of codebook, default:1000
    
    method : str
      codebook's compute method , value: 'sc','km'
      
    """
    def __init__(self, codebook_size=512, method='sc'):
        self.codebook_size = codebook_size
        self.method = method
        self.patch_num = 40000
        self.patch_size = 8
        self.sample = 'random'
        self.feature = 'raw' # raw, surf, hog

    
    def fit(self, X, y=None):
        # compute the codes
        print 'Extracting patches...'
        patchs = []
        num = self.patch_num // X.size
        for x in X:
            img = imread(str(x[0]))
            tmp = extract_patches_2d(img, (self.patch_size,self.patch_size), \
                                     max_patches=num, random_state=np.random.RandomState())
            patchs.append(tmp)
        data = np.vstack(patchs)
        data = data.reshape(data.shape[0], -1)
        
        data -= np.mean(data, axis=0)
        data = data/np.std(data, axis=0)
        
        print 'Learning codebook...'
        if self.method == 'sc':
            self.dico = MiniBatchDictionaryLearning(n_components=self.codebook_size, \
                                               alpha=1, n_iter=100, batch_size =100, verbose=True)
            self.dico.fit(data)
        elif self.method=='km':
            # self.dico = MiniBatchKMeans(n_clusters=self.codebook_size)
            pass
        
        return self
    
    def transform(self, X):
        """         
        Parameters
        ----------
        X : {array-like}, shape = [n_samples, 1]
            Input vectors, where n_samples is the number of samples and the
            single column holds the image path.
      
        Returns
        -------

          array-like = [n_samples, features]
            Bag-of-visual-words feature vector for each sample.
        
        """
        print 'Extracting feature...'
        # setting the dictionary
        self.dico.set_params(transform_algorithm='lars')
        results = []
        for sample in X:
            img = imread(str(sample[0]))
            tmp = extract_patches_2d(img, (self.patch_size,self.patch_size), \
                                     max_patches=300, random_state=np.random.RandomState())
            data = tmp.reshape(tmp.shape[0], -1)
            data = data-np.mean(data, axis=0)
            data = data/np.std(data, axis=0)
            code = self.dico.transform(data)
            results.append(code.sum(axis=0))
        return np.vstack(results)
    
    def get_params(self, deep=True):
        return {"codebook_size": self.codebook_size}
Example #26
def test_dict_learning_online_verbosity():
    # test verbosity for better coverage
    n_components = 5
    from io import StringIO
    import sys

    old_stdout = sys.stdout
    try:
        sys.stdout = StringIO()

        # convergence monitoring verbosity
        dico = MiniBatchDictionaryLearning(n_components,
                                           batch_size=4,
                                           max_iter=5,
                                           verbose=1,
                                           tol=0.1,
                                           random_state=0)
        dico.fit(X)
        dico = MiniBatchDictionaryLearning(
            n_components,
            batch_size=4,
            max_iter=5,
            verbose=1,
            max_no_improvement=2,
            random_state=0,
        )
        dico.fit(X)
        # higher verbosity level
        dico = MiniBatchDictionaryLearning(n_components,
                                           batch_size=4,
                                           max_iter=5,
                                           verbose=2,
                                           random_state=0)
        dico.fit(X)

        # function API verbosity
        dict_learning_online(
            X,
            n_components=n_components,
            batch_size=4,
            alpha=1,
            verbose=1,
            random_state=0,
        )
        dict_learning_online(
            X,
            n_components=n_components,
            batch_size=4,
            alpha=1,
            verbose=2,
            random_state=0,
        )
    finally:
        sys.stdout = old_stdout

    assert dico.components_.shape == (n_components, n_features)
    
    return(data)

print('extracting patches')
t0 = time()
data = constructPatches(img2, patch_size, False)
t1 = time() - t0
print('extraction time: %.2fs' % t1)

print('building the dictionary and fitting it on the data')
# n_components must be larger than the number of columns of the images
# initialize a dictionary
# n_components: size of the dictionary
# fit the dictionary on the normalized base image
t0 = time()
dico = MiniBatchDictionaryLearning(n_components=2*img2.shape[1], alpha=1, n_iter=100)
V = dico.fit(data).components_
t1 = time() - t0
print('dictionary fit time: %.fs ' % t1)

# define the transform algorithms (OMP with 1 and 2 atoms, LARS regression with 5 atoms, and others)
transform_algorithms = [('omp5', 'omp',{'transform_n_nonzero_coefs': 5})]

# create several reconstructed images, stored in a dict
def reconstructImages(transform_algorithms):
    reconstructions = {}
    
    for title, transform_algorithm, kwargs in transform_algorithms:
        reconstructions[title] = img2.copy()
        dico.set_params(transform_algorithm=transform_algorithm, **kwargs)
        code = dico.transform(data)
Example #28
#print("Patches before reshaping:", patches.shape)

patches = patches.reshape(-1, patch_size[0] * patch_size[1])
#print("Patches after reshaping:", patches.shape)

patches -= np.mean(patches, axis=0)
patches /= np.std(patches, axis=0)
print('done in %.2fs.' % (time() - t0))
print(patches.shape)

# Learn the dictionary from reference patches

print('Learning the dictionary...')
t0 = time()
dico = MiniBatchDictionaryLearning(
    n_components=400, alpha=0.5,
    n_iter=400)  #TODO:check with different parameters
V = dico.fit(patches).components_
dt = time() - t0
print('done in %.2fs.' % dt)

plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(V[:100]):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape(patch_size),
               cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
plt.suptitle('Dictionary learned from patches\n' +
             'Train time %.1fs on %d patches' % (dt, len(patches)),
def train(image_paths):

    results = np.zeros((atoms, lowPatchSize[0] * lowPatchSize[1] +
                        highPatchSize[0] * highPatchSize[1]))
    init_dict = True

    for image_path in image_paths:
        # import image
        try:
            print(image_path)
            img = img_tl.importImage(image_path)
        except IOError:
            print(
                "Error, an image could not be found in the images directory. Did you move it while the machine was training?"
            )
            sys.exit()

        img_low, img = img_tl.halfImageResolutionForTraining(
            img, lowPatchSize, highPatchSize)
        img = np.array(img)
        img_low = np.array(img_low)

        # for each colour channel in the image
        """ see if we can just train and make only 1 model."""
        for channel in range(img.shape[2]):

            # convert to patches
            high_data = img[:, :, channel]
            low_data = img_low[:, :, channel]
            high_data = img_tl.convertImageDataToPatches(
                high_data, highPatchSize, 2)
            low_data = img_tl.convertImageDataToPatches(low_data, lowPatchSize)

            high_data_size = high_data.shape[1]
            low_data_size = low_data.shape[1]

            # mathematically reduce values to fit algorithm
            high_data *= 1 / math.sqrt(high_data_size)
            low_data *= 1 / math.sqrt(low_data_size)

            # join the high and low res data
            data = np.concatenate((high_data, low_data), axis=1)

            # train
            trainer = None
            if (init_dict):
                trainer = MiniBatchDictionaryLearning(
                    n_components=atoms,
                    alpha=lmbda * (1 / high_data_size + 1 / low_data_size),
                    n_iter=iterations,
                    n_jobs=-1,
                    verbose=1)
            else:
                trainer = MiniBatchDictionaryLearning(
                    n_components=atoms,
                    alpha=lmbda * (1 / high_data_size + 1 / low_data_size),
                    n_iter=iterations,
                    n_jobs=-1,
                    verbose=1,
                    dict_init=results)

            model = trainer.fit(data).components_

            results = model

        init_dict = False

    # save the result
    resultHigh = results[:, :highPatchSize[0] * highPatchSize[1]]
    resultLow = results[:, highPatchSize[0] * highPatchSize[1]:]
    np.save("models/sparseHigh.npy", resultHigh)
    np.save("models/sparseLow.npy", resultLow)

    plt.figure(figsize=(4.2, 4))
    for i, comp in enumerate(resultHigh[:100]):
        plt.subplot(10, 10, i + 1)
        plt.imshow(comp.reshape(highPatchSize),
                   cmap=plt.cm.gray_r,
                   interpolation='nearest')
        plt.xticks(())
        plt.yticks(())
    plt.show()
    for i, comp in enumerate(resultLow[:100]):
        plt.subplot(10, 10, i + 1)
        plt.imshow(comp.reshape(lowPatchSize),
                   cmap=plt.cm.gray_r,
                   interpolation='nearest')
        plt.xticks(())
        plt.yticks(())
    plt.show()
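
# The joint dictionary learned above couples high- and low-resolution patches
# through shared sparse codes. A minimal sketch of how the two saved halves
# would be used at inference time (the arrays below stand in for the saved
# models and for real image patches; names and shapes are illustrative
# assumptions):
import numpy as np
from sklearn.decomposition import sparse_encode

rng = np.random.RandomState(0)
D_low = rng.rand(128, 25)     # stands in for models/sparseLow.npy  (atoms x low-res patch dim)
D_high = rng.rand(128, 100)   # stands in for models/sparseHigh.npy (atoms x high-res patch dim)
low_patches = rng.rand(10, 25)
codes = sparse_encode(low_patches, D_low)    # sparse codes w.r.t. the low-res dictionary
high_patches = np.dot(codes, D_high)         # predicted high-res patches share those codes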
Example #30
    # So here UV, ||U||_1,1 and sum(||V_k||_2) are verified
    # instead of comparing directly U and V.
    assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol)
    assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol)
    assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol)
    # verify an obtained solution is not degenerate
    assert np.mean(U_64 != 0.0) > 0.05
    assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0)


@pytest.mark.parametrize(
    "estimator",
    [
        SparseCoder(X.T),
        DictionaryLearning(),
        MiniBatchDictionaryLearning(batch_size=4, max_iter=10),
    ],
    ids=lambda x: x.__class__.__name__,
)
def test_get_feature_names_out(estimator):
    """Check feature names for dict learning estimators."""
    estimator.fit(X)
    n_components = X.shape[1]

    feature_names_out = estimator.get_feature_names_out()
    estimator_name = estimator.__class__.__name__.lower()
    assert_array_equal(
        feature_names_out,
        [f"{estimator_name}{i}" for i in range(n_components)],
    )
Example #31
from sklearn.decomposition import MiniBatchDictionaryLearning, DictionaryLearning
from sklearn.svm import SVC
from sklearn.externals import joblib
from sklearn.preprocessing import normalize

f = h5py.File(sys.argv[1],'r')

features = []
V = []
X = f['data']
Y = f['label']
X = np.float64(X)
print "learn the dictionary"
# dico = DictionaryLearning(n_components=512, alpha=1, max_iter=20, verbose = 20)
# dico.fit(X)
dico = MiniBatchDictionaryLearning(n_components=512, alpha=1, batch_size = 32, n_iter=3000)
for i in range(100):
	dico.partial_fit(X)
	print "epoch " +str(i) + " done"
	code = dico.transform(X)
	error = X - np.dot(code, dico.components_)
	print "error = ", np.sum(error)
	joblib.dump(dico, 'model/dico_model_batch_iter' + str(i) + '.pkl')
#dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500, verbose = 20)
#dico.fit(X)
print "learn over"
V = dico.components_
# f_out = h5py.File(sys.argv[2],'w')
# f_out['dico'] = V
# f_out.close()
joblib.dump(dico, 'dico_model_batch.pkl') 
for a in alpha_range:
	for n in n_range:

		if a > n:
			continue;

		it = 100;
		best_c = 100 # previously calculated through cross-validation code for logistic regression

		## train_data
		if not os.path.isfile(fraud_data_path + 'train_data_sparse_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it)):
			print('## Creating train_data_sparse_a{:d}_c{:d}_it{:d}'.format(a,n,it))
			print >> results_file, '## Creating train_data_sparse_a{:d}_c{:d}_it{:d}'.format(a,n,it)
			train_data_std = preprocessing.scale(train_data.values)
			train_data = pd.DataFrame(train_data_std, index=train_data.index.values)
			miniBatch = MiniBatchDictionaryLearning(n_components=n, alpha=a, n_iter=100)
			dictionary = miniBatch.fit(train_data.values).components_
			dictionary_df = pd.DataFrame(dictionary)
			dictionary_df.to_csv(fraud_data_path + 'train_data_dictionary_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it), index=True)
			sparseCode = miniBatch.transform(train_data.values)
			sparseCode_df = pd.DataFrame(sparseCode, index=train_data.index.values)
			sparseCode_df.to_csv(fraud_data_path + 'train_data_sparse_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it), index=True)
			denoised = np.dot(sparseCode, dictionary)
			denoised_df = pd.DataFrame(denoised, index=train_data.index.values)
			denoised_df.to_csv(fraud_data_path + 'train_data_denoised_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it), index=True)
		
		## test_data
		if not os.path.isfile(fraud_data_path + 'test_data_sparse_a{:d}_c{:d}_it{:d}.csv'.format(a,n,it)):
			print('## Creating test_data_denoised_a{:d}_c{:d}_it{:d}'.format(a,n,it))
			print >> results_file, '## Creating test_data_denoised_a{:d}_c{:d}_it{:d}'.format(a,n,it)
			test_data_std = preprocessing.scale(test_data.values)
Example #33
def test_dict_learning_online_estimator_shapes():
    n_components = 5
    dico = MiniBatchDictionaryLearning(n_components, n_iter=20, random_state=0)
    dico.fit(X)
    assert dico.components_.shape == (n_components, n_features)
# Extract all reference patches from the left half of the image
print('Extracting reference patches...')
t0 = time()
patch_size = (7, 7)
data = extract_patches_2d(distorted[:, :width // 2], patch_size)
data = data.reshape(data.shape[0], -1)
data -= np.mean(data, axis=0)
data /= np.std(data, axis=0)
print('done in %.2fs.' % (time() - t0))

# #############################################################################
# Learn the dictionary from reference patches

print('Learning the dictionary...')
t0 = time()
dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500)
V = dico.fit(data).components_
dt = time() - t0
print('done in %.2fs.' % dt)

plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(V[:100]):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape(patch_size),
               cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
plt.suptitle('Dictionary learned from face patches\n' +
             'Train time %.1fs on %d patches' % (dt, len(data)),
             fontsize=16)
Example #35
class SparseApproxSpectrum(object):
    def __init__(self, n_components=49, patch_size=(8,8), max_samples=1000000, **kwargs):
        self.omp = OrthogonalMatchingPursuit()
        self.n_components = n_components
        self.patch_size = patch_size
        self.max_samples = max_samples
        self.D = None
        self.data = None
        self.components = None
        self.standardize=False

    def _extract_data_patches(self, X):
        self.X = X
        data = extract_patches_2d(X, self.patch_size)
        data = data.reshape(data.shape[0], -1)
        if len(data)>self.max_samples:
            data = np.random.permutation(data)[:self.max_samples]
        print data.shape
        if self.standardize:
            self.mn = np.mean(data, axis=0) 
            self.std = np.std(data, axis=0)
            data -= self.mn
            data /= self.std
        self.data = data

    def extract_codes(self, X, standardize=False):
        self.standardize=standardize
        self._extract_data_patches(X)
        self.dico = MiniBatchDictionaryLearning(n_components=self.n_components, alpha=1, n_iter=500)
        print "Dictionary learning from data..."
        self.D = self.dico.fit(self.data)
        return self

    def plot_codes(self, cbar=False, **kwargs):
        #plt.figure(figsize=(4.2, 4))
        N = int(np.ceil(np.sqrt(self.n_components)))
        kwargs.setdefault('cmap', pl.cm.gray_r)
        kwargs.setdefault('origin', 'lower')
        kwargs.setdefault('interpolation','nearest')
        for i, comp in enumerate(self.D.components_):
            plt.subplot(N, N, i + 1)
            comp  = comp * self.std + self.mn if self.standardize else comp
            plt.imshow(comp.reshape(self.patch_size), **kwargs)
            if cbar:
                plt.colorbar()
            plt.xticks(())
            plt.yticks(())
        plt.suptitle('Dictionary learned from spectrum patches\n', fontsize=16)
        plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

    def extract_audio_dir_codes(self, dir_expr='/home/mkc/exp/FMRI/stimuli/Wav6sRamp/*.wav',**kwargs):
        flist=glob.glob(dir_expr)
        self.X = np.vstack([feature_scale(LogFrequencySpectrum(f, nbpo=24, nhop=1024).X,normalize=1).T for f in flist]).T
        self.extract_codes(self.X, **kwargs)
        self.plot_codes(**kwargs)
        return self

    def _get_approximation_coefs(self,data, components):
        w = np.array([self.omp.fit(components.T, d.T).coef_ for d in data])
        return w

    def reconstruct_spectrum(self, w=None, randomize=False):
        data = self.data
        components = self.D.components_
        if w is None:
            self.w = self._get_approximation_coefs(data, components)
            w = self.w
        if self.standardize:
            components = components * self.std + self.mn
        if randomize:
            components = np.random.permutation(components)
        recon = np.dot(w, components).reshape(-1,self.patch_size[0],self.patch_size[1])
        self.X_hat = reconstruct_from_patches_2d(recon, self.X.shape)
        return self

    def reconstruct_individual_spectra(self, w=None, randomize=False, plotting=False, **kwargs):
        self.reconstruct_spectrum(w,randomize)
        w, components = self.w, self.D.components_
        self.X_hat_l = []
        for i in range(len(self.w.T)):
            r=np.array((np.matrix(w)[:,i]*np.matrix(components)[i,:])).reshape(-1,self.patch_size[0],self.patch_size[1])
            self.X_hat_l.append(reconstruct_from_patches_2d(r, self.X.shape))
        if plotting:
            plt.figure()            
            for k in range(self.n_components):
                plt.subplot(int(self.n_components**0.5), int(self.n_components**0.5), k+1)
                feature_plot(self.X_hat_l[k],nofig=1,**kwargs)
        return self
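
# The reconstruct_spectrum method above relies on reconstruct_from_patches_2d
# being the (averaging) inverse of extract_patches_2d. A minimal self-contained
# sketch of that round trip on a random "spectrogram" (array and patch size are
# illustrative assumptions):
import numpy as np
from sklearn.feature_extraction.image import extract_patches_2d, reconstruct_from_patches_2d

rng = np.random.RandomState(0)
spectrum = rng.rand(64, 64)
patches = extract_patches_2d(spectrum, (8, 8))               # shape (3249, 8, 8)
restored = reconstruct_from_patches_2d(patches, spectrum.shape)
assert np.allclose(restored, spectrum)                       # exact when patches are unmodified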
    ax.set_title("Separation of Observations using " + algoName)


#----------------------------------------------------------------------------------------------------

# Mini-batch dictionary learning
from sklearn.decomposition import MiniBatchDictionaryLearning

n_components = 50
alpha = 1
batch_size = 200
n_iter = 25
random_state = 2018

miniBatchDictLearning = MiniBatchDictionaryLearning( \
                        n_components=n_components, alpha=alpha, \
                        batch_size=batch_size, n_iter=n_iter, \
                        random_state=random_state)

miniBatchDictLearning.fit(X_train.loc[:, :10000])
X_train_miniBatchDictLearning = miniBatchDictLearning.fit_transform(X_train)
X_train_miniBatchDictLearning = pd.DataFrame( \
    data=X_train_miniBatchDictLearning, index=train_index)

X_validation_miniBatchDictLearning = \
    miniBatchDictLearning.transform(X_validation)
X_validation_miniBatchDictLearning = \
    pd.DataFrame(data=X_validation_miniBatchDictLearning, \
    index=validation_index)

scatterPlot(X_train_miniBatchDictLearning, y_train, \
            "Mini-batch Dictionary Learning")
Example #37
def test_minibatch_dict_learning_wrong_params(param, match):
    # Check that error are raised with clear error message when wrong values
    # are passed for the parameters of MiniBatchDictionaryLearning
    with pytest.raises(ValueError, match=match):
        MiniBatchDictionaryLearning(**param).fit(X)
Example #38
 def fit(self, X=None, y=None):
     if self.patch_file is None:
         num = self.patch_num // X.size
         data = []
         for item in X:
             img = imread(str(item[0]))
             img = img_as_ubyte(rgb2gray(img))
             #img = self.binary(img) # binarize
             tmp = extract_patches_2d(img, self.patch_size, max_patches = num,\
                                     random_state=np.random.RandomState())
             data.append(tmp)
         
         data = np.vstack(data)
         data = data.reshape(data.shape[0], -1)
         data = np.asarray(data, 'float32')
     else:
         data = np.load(self.patch_file,'r+') # load the npy file; note the mode, since the data is modified later
     
     data = np.require(data, dtype=np.float32)
     
     # Standardization
     #logging.info("Pre-processing : Standardization...")
     #self.standard = StandardScaler()
     #data = self.standard.fit_transform(data)
         
     # whiten
     #logging.info("Pre-processing : PCA Whiten...")
     #self.pca = RandomizedPCA(copy=True, whiten=True)
     #data = self.pca.fit_transform(data)
     
     # whiten
     logging.info("Pre-processing : ZCA Whiten...")
     self.zca = ZCA()
     data = self.zca.fit_transform(data)
     
     # 0-1 scaling can also be implemented with the preprocessing module
     #self.minmax = MinMaxScaler()
     #data = self.minmax.fit_transform(data)
     
     """k-means
     self.kmeans = MiniBatchKMeans(n_clusters=self.n_components, init='k-means++', \
                                 max_iter=self.n_iter, batch_size=self.batch_size, verbose=1,\
                                 tol=0.0, max_no_improvement=100,\
                                 init_size=None, n_init=3, random_state=np.random.RandomState(0),\
                                 reassignment_ratio=0.0001)
     logging.info("Sparse coding : Phase 1 - Codebook learning (K-means).")
     self.kmeans.fit(data)
     
     logging.info("Sparse coding : Phase 2 - Define coding method (omp,lars...).")
     self.coder = SparseCoder(dictionary=self.kmeans.cluster_centers_, 
                              transform_n_nonzero_coefs=256,
                              transform_alpha=None, 
                              transform_algorithm='lasso_lars',
                              n_jobs = 1)
     """
     #''' generic
     logging.info("Sparse coding...")
     self.coder = MiniBatchDictionaryLearning(n_components=self.n_components, \
                                        alpha=self.alpha, n_iter=self.n_iter, \
                                        batch_size =self.batch_size, verbose=True)
     self.coder.fit(data)
     self.coder.transform_algorithm = 'omp'
     self.coder.transform_alpha = 0.1 # with omp, this represents the reconstruction error
     #'''
     return self
Example #39
	def codebook(self):
		self.mbdl =  MiniBatchDictionaryLearning(self.N_codebook)
		self.mbdl.fit(self.raw_features)
Example #40
class Sparsecode(BaseEstimator, TransformerMixin):
    def __init__(self, patch_file=None, patch_num=10000, patch_size=(16, 16),\
                n_components=384,  alpha = 1, n_iter=1000, batch_size=200):
        self.patch_num = patch_num
        self.patch_size = patch_size
        self.patch_file = patch_file
        
        self.n_components = n_components
        self.alpha = alpha #sparsity controlling parameter
        self.n_iter = n_iter
        self.batch_size = batch_size

    
    def fit(self, X=None, y=None):
        if self.patch_file is None:
            num = self.patch_num // X.size
            data = []
            for item in X:
                img = imread(str(item[0]))
                img = img_as_ubyte(rgb2gray(img))
                #img = self.binary(img) # binarize
                tmp = extract_patches_2d(img, self.patch_size, max_patches = num,\
                                        random_state=np.random.RandomState())
                data.append(tmp)
            
            data = np.vstack(data)
            data = data.reshape(data.shape[0], -1)
            data = np.asarray(data, 'float32')
        else:
            data = np.load(self.patch_file,'r+') # load the npy file; note the mode, since the data is modified later
        
        data = np.require(data, dtype=np.float32)
        
        # Standardization
        #logging.info("Pre-processing : Standardization...")
        #self.standard = StandardScaler()
        #data = self.standard.fit_transform(data)
            
        # whiten
        #logging.info("Pre-processing : PCA Whiten...")
        #self.pca = RandomizedPCA(copy=True, whiten=True)
        #data = self.pca.fit_transform(data)
        
        # whiten
        logging.info("Pre-processing : ZCA Whiten...")
        self.zca = ZCA()
        data = self.zca.fit_transform(data)
        
        # 0-1 scaling can also be implemented with the preprocessing module
        #self.minmax = MinMaxScaler()
        #data = self.minmax.fit_transform(data)
        
        """k-means
        self.kmeans = MiniBatchKMeans(n_clusters=self.n_components, init='k-means++', \
                                    max_iter=self.n_iter, batch_size=self.batch_size, verbose=1,\
                                    tol=0.0, max_no_improvement=100,\
                                    init_size=None, n_init=3, random_state=np.random.RandomState(0),\
                                    reassignment_ratio=0.0001)
        logging.info("Sparse coding : Phase 1 - Codebook learning (K-means).")
        self.kmeans.fit(data)
        
        logging.info("Sparse coding : Phase 2 - Define coding method (omp,lars...).")
        self.coder = SparseCoder(dictionary=self.kmeans.cluster_centers_, 
                                 transform_n_nonzero_coefs=256,
                                 transform_alpha=None, 
                                 transform_algorithm='lasso_lars',
                                 n_jobs = 1)
        """
        #''' generic
        logging.info("Sparse coding...")
        self.coder = MiniBatchDictionaryLearning(n_components=self.n_components, \
                                           alpha=self.alpha, n_iter=self.n_iter, \
                                           batch_size =self.batch_size, verbose=True)
        self.coder.fit(data)
        self.coder.transform_algorithm = 'omp'
        self.coder.transform_alpha = 0.1 # with omp, this represents the reconstruction error
        #'''
        return self
    
    def transform(self, X):
        #whiten
        #X_whiten = self.pca.transform(X)
        logging.info("Compute the sparse coding of X.")
        X = np.require(X, dtype=np.float32)
        
        #TODO: is it really necessary to fit before calling transform?
        #X = self.minmax.fit_transform(X)
        
        # -mean/std and whiten
        #X = self.standard.transform(X)
        #X = self.pca.transform(X)
        
        # ZCA
        X = self.zca.transform(X)

        # MiniBatchDictionaryLearning
        # return self.dico.transform(X_whiten)
        
        # k-means
        # TODO: sparse coder method? problem...
        return self.coder.transform(X)
        
    
    def get_params(self, deep=True):
        return {"patch_num": self.patch_num,
                "patch_size":self.patch_size,
                "alpha":self.alpha,
                "n_components":self.n_components,
                "n_iter":self.n_iter,
                "batch_size":self.batch_size}
                
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self.__setattr__(parameter, value)
        return self
def dictionary_learning_MHOF(flow_hist_H_400):
    from sklearn.decomposition import MiniBatchDictionaryLearning
    dico=MiniBatchDictionaryLearning(n_components=400,alpha=1,n_iter=500)
    dic=dico.fit(flow_hist_H_400).components_
    #coeffs=dico.transform(flow_hist_H_400)
    return dic
    # and (column permuted U*, row permuted V*) is also optimal
    # as long as the product UV is preserved.
    # So here UV, ||U||_1,1 and sum(||V_k||_2) are verified
    # instead of comparing directly U and V.
    assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol)
    assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol)
    assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol)
    # verify an obtained solution is not degenerate
    assert np.mean(U_64 != 0.0) > 0.05
    assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0)


@pytest.mark.parametrize(
    "estimator",
    [SparseCoder(X.T),
     DictionaryLearning(),
     MiniBatchDictionaryLearning()],
    ids=lambda x: x.__class__.__name__,
)
def test_get_feature_names_out(estimator):
    """Check feature names for dict learning estimators."""
    estimator.fit(X)
    n_components = X.shape[1]

    feature_names_out = estimator.get_feature_names_out()
    estimator_name = estimator.__class__.__name__.lower()
    assert_array_equal(
        feature_names_out,
        [f"{estimator_name}{i}" for i in range(n_components)],
    )
Example #43
em.lparams = model_params
em.run()

dlog.close(True)
pprint("Done")


# ### Mini-Batch Dictionary Learning
# 
# Alternative, since the EM library gives numerical errors

# In[20]:

from sklearn.decomposition import MiniBatchDictionaryLearning

mbdic = MiniBatchDictionaryLearning(n_components=30,verbose=True)
mbdic.fit(patches_flat)


# ### Visualize the dictionary atoms

# In[21]:

V = mbdic.components_
plt.figure(figsize=(15,12))
for i,comp in enumerate(V):
    plt.subplot(10,10,i+1)
    plt.imshow(comp.reshape(patchsize).T,origin='lower',interpolation='nearest',aspect='auto',cmap='viridis')
    

Example #44
# filter by a class
if p.class_num is None:
    X = X_test
    Y = Y_test
else:
    idxs = Y_test == p.class_num
    X = X_test[idxs]
    Y = Y_test[idxs]

X_d = X.reshape(X.shape[0], -1)
print(X_d.shape)

n_iter = int(1000 / p.batch_size)
dico = MiniBatchDictionaryLearning(n_components=p.num_bases,
                                   alpha=p.alpha,
                                   n_iter=n_iter,
                                   n_jobs=1,
                                   batch_size=p.batch_size)
save_freq = 100
for i in tqdm(range(50000)):
    V = dico.fit(X_d)
    if i % save_freq == 0:
        s = '_alpha=' + str(p.alpha) + '_ncomps=' + str(
            p.num_bases) + '_class=' + str(p.class_num)
        fname1 = 'bases/bases_iters=' + str(i) + s + '.npy'
        np.save(fname1, V.components_)
        fname2 = 'bases/bases_iters=' + str(i - save_freq) + s + '.npy'
        viz_weights.plot_weights(V.components_, dset='rgb')
        fname3 = 'bases_figs/bases_iters=' + str(i - save_freq) + s + '.png'
        plt.savefig('bases_figs/bases_iters=' + str(i) + s + '.png',
                    dpi=200,
Example #45
    # So here UV, ||U||_1,1 and sum(||V_k||_2) are verified
    # instead of comparing directly U and V.
    assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol)
    assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol)
    assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol)
    # verify an obtained solution is not degenerate
    assert np.mean(U_64 != 0.0) > 0.05
    assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0)


@pytest.mark.parametrize(
    "estimator",
    [
        SparseCoder(X.T),
        DictionaryLearning(),
        MiniBatchDictionaryLearning(batch_size=4)
    ],
    ids=lambda x: x.__class__.__name__,
)
def test_get_feature_names_out(estimator):
    """Check feature names for dict learning estimators."""
    estimator.fit(X)
    n_components = X.shape[1]

    feature_names_out = estimator.get_feature_names_out()
    estimator_name = estimator.__class__.__name__.lower()
    assert_array_equal(
        feature_names_out,
        [f"{estimator_name}{i}" for i in range(n_components)],
    )
Example #46
def run_dimension_reductions():
    global mean
    for dataset in [Diabetes(), Adult()]:
        processor = Processor3()
        processor.latext_start_figure()
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
        pca = PCA(n_components=0.95)
        pca.fit(X_train)
        n_components = pca.components_.shape[0]
        print(f"n_components: {n_components}")

        whiten = True
        random_state = 0
        dr_models = [
            PCA(n_components=n_components, random_state=0),
            FastICA(n_components=n_components, random_state=0),
            MiniBatchDictionaryLearning(n_components=n_components,
                                        alpha=1,
                                        batch_size=200,
                                        n_iter=10,
                                        random_state=random_state),
            SparseRandomProjection(random_state=0, n_components=n_components)
        ]
        for pca in dr_models:
            X_train = pd.DataFrame(X_train)
            y_train = pd.DataFrame(y_train)

            if isinstance(pca, SparseRandomProjection):
                X_train_PCA = pca.fit_transform(X_train)
                X_train_PCA = pd.DataFrame(data=X_train_PCA,
                                           index=X_train.index)
                X_train_PCA_inverse = np.array(X_train_PCA).dot(
                    pca.components_.todense())
                X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse,
                                                   index=X_train.index)
                scatterPlot(X_train_PCA, y_train, pca.__class__.__name__)
            elif isinstance(pca, MiniBatchDictionaryLearning):
                X_train_PCA = pca.fit_transform(X_train)
                X_train_PCA = pd.DataFrame(data=X_train_PCA,
                                           index=X_train.index)
                X_train_PCA_inverse = np.array(X_train_PCA).dot(
                    pca.components_) + np.array(X_train.mean(axis=0))
                X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse,
                                                   index=X_train.index)
                scatterPlot(X_train_PCA, y_train, pca.__class__.__name__)
            else:
                X_train_PCA = pca.fit_transform(X_train)
                X_train_PCA = pd.DataFrame(data=X_train_PCA,
                                           index=X_train.index)
                X_train_PCA_inverse = pca.inverse_transform(X_train_PCA)
                X_train_PCA_inverse = pd.DataFrame(data=X_train_PCA_inverse,
                                                   index=X_train.index)
                scatterPlot(X_train_PCA, y_train, pca.__class__.__name__)

            # plt.show()

            anomalyScoresPCA = anomalyScores(X_train, X_train_PCA_inverse)
            mean = np.mean(anomalyScoresPCA)
            print(mean)
            preds = plotResults(y_train, anomalyScoresPCA, True,
                                pca.__class__.__name__,
                                dataset.__class__.__name__, mean)
        processor.latex_end_figure(
            caption=f"{dataset.__class__.__name__} Precision-Recall Curve",
            fig=f"pr_{dataset.__class__.__name__}")
Beispiel #47
0
class Layer(object):

    def __init__(self, hierarchy, depth, patch_size, num_features, num_patches, multiplier):
        """
         * depth - hierarchy level (1, 2, 3, etc.)
         * patch_size - number of pixels representing side of the square patch.
           like, 8 (8x8 patches)
         * num_features - how many components to learn
         * multiplier - num of subpatches we break patch into
           (0 for the first level). if 3, patch will contant 3x3 subpatches.
        """
        self.hierarchy = hierarchy
        self.depth = depth
        self.basement_size = patch_size
        self.num_features = num_features
        self.num_patches = num_patches
        self.multiplier = multiplier
        self.learning = MiniBatchDictionaryLearning(
            n_components=num_features, n_iter=3000, transform_algorithm='lasso_lars', transform_alpha=0.5, n_jobs=2)
        self.ready = False

    def get_data(self, data, max_patches=None):
        """
        Extracts raw data from patches.
        """
        max_patches = max_patches or self.num_patches
        if isinstance(data, np.ndarray):
            # one image
            patches = extract_patches_2d(
                data, (self.basement_size, self.basement_size), max_patches=max_patches)
        else:
            patches = []
            # multiple images
            for i in range(max_patches):
                idx = np.random.randint(len(data))  # selecting random image
                dx = dy = self.basement_size
                if data[idx].shape[0] <= dx or data[idx].shape[1] <= dy:
                    continue
                x = np.random.randint(data[idx].shape[0] - dx)
                y = np.random.randint(data[idx].shape[1] - dy)
                patch = data[idx][x: x + dx, y: y + dy]
                patches.append(patch.reshape(-1))
            patches = np.vstack(patches)
            patches = patches.reshape(patches.shape[0], self.basement_size, self.basement_size)
        print('patches', patches.shape)
        patches = preprocessing.scale(patches)
        return patches

    def learn(self, data):
        data = data.reshape(data.shape[0], -1)
        self.learning.fit(data)
        self.ready = True

    @property
    def output_size(self):
        return int(np.sqrt(self.num_features))

    @property
    def input_size(self):
        if self.depth == 0:
            return self.basement_size
        else:
            prev_layer = self.hierarchy.layers[self.depth - 1]
            return prev_layer.output_size * self.multiplier

    @property
    def features(self):
        return self.learning.components_

    # def get_features(self):
    #     # going from up to down
    #     result = []
    #     layers = self.hierarchy.layers[: self.depth][::-1]
    #     if self.depth == 0:
    #         return self.features

    #     previous_layer = self.hierarchy.layers[self.depth - 1]
    #     for feature in self.features:
    #         multiplier = self.multiplier
    #         feature = feature.reshape(self.multiplier * previous_layer.output_size,
    #                                   self.multiplier * previous_layer.output_size,)
    #         for other_layer in layers:
    #             expressed_feature = np.empty((multiplier * other_layer.input_size,
    #                                           multiplier * other_layer.input_size))
    #             enc_n = other_layer.output_size
    #             n = other_layer.input_size
    #             for dx in range(multiplier):
    #                 for dy in range(multiplier):
    #                     encoded_subfeature = feature[dx * enc_n: (dx + 1) * enc_n,
    #                                                  dy * enc_n: (dy + 1) * enc_n]
    #                     prev_patch = np.dot(encoded_subfeature.reshape(-1), other_layer.features)
    #                     expressed_feature[dx * n: (dx + 1) * n, dy * n: (dy + 1) * n] = prev_patch.reshape(n, n)
    #             feature = expressed_feature
    #             multiplier *= other_layer.multiplier
    #         result.append(expressed_feature.reshape(-1))
    #     result = np.vstack(result)
    #     return result

    def get_features(self):
        # going from down to up; this should produce the same result as the commented-out method above
        if self.depth == 0:
            return self.features
        layers = self.hierarchy.layers[1: self.depth + 1]  # down --> up
        features = self.hierarchy.layers[0].features  # to express upper feature

        for i, layer in enumerate(layers, start=1):
            previous_layer = self.hierarchy.layers[i - 1]
            expressed_features = []
            for feature in layer.features:
                n = previous_layer.output_size
                m = int(np.sqrt(features.shape[1]))
                feature = feature.reshape((layer.input_size, layer.input_size))
                expressed_feature = np.empty((layer.multiplier * m,
                                              layer.multiplier * m))
                for dx in range(layer.multiplier):
                    for dy in range(layer.multiplier):
                        subfeature = feature[dx * n: (dx + 1) * n, dy * n: (dy + 1) * n]
                        # now that's previous_layer's code. replace it with reconstruction
                        expressed_subfeature = np.dot(subfeature.reshape(-1), features)
                        expressed_feature[dx * m: (dx + 1) * m, dy * m: (dy + 1) * m] = expressed_subfeature.reshape((m, m))
                expressed_features.append(expressed_feature.reshape(-1))
            features = np.vstack(expressed_features)
        return features
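# A hypothetical usage sketch for the first level of the hierarchy (the
# SimpleNamespace stand-in and the random patches are illustrative assumptions,
# not part of the original code):
from types import SimpleNamespace

hierarchy = SimpleNamespace(layers=[])
layer0 = Layer(hierarchy, depth=0, patch_size=8, num_features=64,
               num_patches=1000, multiplier=0)
hierarchy.layers.append(layer0)
patches = np.random.rand(1000, 8, 8)   # stand-in for real image patches
layer0.learn(patches)                  # fits the mini-batch dictionary
print(layer0.features.shape)           # (64, 64): one row per learned component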
Beispiel #48
0
# Dimensionality reduction with PCA
pca = PCA(n_components=49)  # 47 changed to 49 for display purposes
pca.fit(patches_l)
patches_l = pca.transform(patches_l)

########################################################
################## DICTIONARY LEARNING #################
########################################################

# Dictionary learning on low resolution patches
print('Learning the low resolution dictionary...')
t0 = time.time()

#dico = DictionaryLearning(n_components=1000, alpha=0.1, max_iter=40)
dico = MiniBatchDictionaryLearning(n_components=1000, alpha=0.1,
                                   n_iter=40)  # try with fewer iterations
#np.save('C:/Users/Nikolina Mileva/Documents/Sparseland/dico.npy', dico)
#V = dico.fit(patches_l).components_
#np.save('C:/Users/Nikolina Mileva/Documents/Sparseland/dictionary_l.npy', V)

# Load and visualize the low resolution dictionary
#V = np.load('C:/Users/Nikolina Mileva/Documents/Sparseland/dictionary_l.npy')

#patch_size = (7,7) # patch size after PCA
#plt.figure(figsize=(4.2, 4))
#for i, comp in enumerate(V[:100]):
#    plt.subplot(10, 10, i + 1)
#    plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r,
#               interpolation='nearest')
#    plt.xticks(())
#    plt.yticks(())
Beispiel #49
0
    targets = target_int

    # Dimensionality reduction: project the ImageNet features down to the target feature count with PCA
    #pca_feat = imagenet_features
    pca = PCA(n_components=np.size(features,1))
    pca.fit(imagenet_features)
    pca_feat = pca.transform(imagenet_features)
    
    # Shuffling
    ind = np.random.permutation(len(imagenet_targets))
    imagenet_targets = imagenet_targets[ind]
    pca_feat = pca_feat[ind, :]
    
    # Dictionary Learning on Source   
    dict_sparse = MiniBatchDictionaryLearning(alpha=1,
                                              n_components=sparse_components,
                                              verbose=3, batch_size=10,
                                              n_iter=1000)
    dict_sparse.fit(pca_feat)
    Ds_0 = dict_sparse.components_
    
    # Dictionary Learning on Target
    dict_sparse = DictionaryLearning(alpha=1, n_components=sparse_components, max_iter=3, verbose=3)
    dict_sparse.fit(features)
    Dt_0 = dict_sparse.components_
    coder = SparseCoder(dictionary=Dt_0)
    Rt_0 = coder.transform(features)
    
    # Target Reconstruction
    Xt_1 = np.mat(Rt_0) * np.mat(Ds_0)
    dict_sparse = DictionaryLearning(alpha=1, n_components=sparse_components, max_iter=3, verbose=3)
    dict_sparse.fit(Xt_1)
    Dt_1 = dict_sparse.components_
    ax.set_title("Separation of Observations using " + algoName)


#---------------------------------------------------------------------------------------

# Mini-batch dictionary learning
from sklearn.decomposition import MiniBatchDictionaryLearning

n_components = 28
alpha = 1
batch_size = 200
n_iter = 10
random_state = 2018

miniBatchDictLearning = MiniBatchDictionaryLearning( \
    n_components=n_components, alpha=alpha, batch_size=batch_size, \
    n_iter=n_iter, random_state=random_state)

miniBatchDictLearning.fit(X_train)
X_test_miniBatchDictLearning = miniBatchDictLearning.transform(X_test)
X_test_miniBatchDictLearning = \
    pd.DataFrame(data=X_test_miniBatchDictLearning, index=X_test.index)

scatterPlot(X_test_miniBatchDictLearning, y_test, \
            "Mini-batch Dictionary Learning")
plt.show()

# Reconstruction: map the codes back to the original feature space via the learned dictionary
X_test_miniBatchDictLearning_inverse = \
    np.array(X_test_miniBatchDictLearning). \
    dot(miniBatchDictLearning.components_)
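# Optional follow-up (an assumption, not part of the original): summarize the
# reconstruction quality with the mean squared error between the test set and
# its dictionary-based reconstruction.
X_test_miniBatchDictLearning_inverse = \
    pd.DataFrame(data=X_test_miniBatchDictLearning_inverse, index=X_test.index)
reconstruction_mse = np.mean(
    np.sum((np.array(X_test) -
            np.array(X_test_miniBatchDictLearning_inverse)) ** 2, axis=1))
print('Mean squared reconstruction error: %.4f' % reconstruction_mse)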
# Extract all reference patches from the left half of the image
print('Extracting reference patches...')
t0 = time()
patch_size = (7, 7)
data = extract_patches_2d(distorted[:, :height // 2], patch_size)
data = data.reshape(data.shape[0], -1)
data -= np.mean(data, axis=0)
data /= np.std(data, axis=0)
print('done in %.2fs.' % (time() - t0))

###############################################################################
# Learn the dictionary from reference patches

print('Learning the dictionary...')
t0 = time()
dico = MiniBatchDictionaryLearning(n_components=100, alpha=1, n_iter=500)
V = dico.fit(data).components_
dt = time() - t0
print('done in %.2fs.' % dt)

plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(V[:100]):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape(patch_size), cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
plt.suptitle('Dictionary learned from Lena patches\n' +
             'Train time %.1fs on %d patches' % (dt, len(data)),
             fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
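# The denoising step that typically follows (a hedged sketch, reusing `distorted`,
# `height`, `patch_size`, `dico` and `V` from above; the patch helpers come from
# sklearn.feature_extraction.image):
from sklearn.feature_extraction.image import (extract_patches_2d,
                                              reconstruct_from_patches_2d)

noisy_patches = extract_patches_2d(distorted[:, height // 2:], patch_size)
noisy_patches = noisy_patches.reshape(noisy_patches.shape[0], -1)
intercept = np.mean(noisy_patches, axis=0)
noisy_patches -= intercept

code = dico.transform(noisy_patches)        # sparse code for each patch
denoised = np.dot(code, V) + intercept      # reconstruct patches from the dictionary
denoised = denoised.reshape(len(denoised), *patch_size)
reconstruction = reconstruct_from_patches_2d(
    denoised, distorted[:, height // 2:].shape)  # stitch patches back into an image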
Beispiel #52
0
def learn_dictionary_and_encode(data,
                                n_atoms=20,
                                alpha=0.5,
                                n_iter=200,
                                random_seed=42,
                                n_jobs=1,
                                fit_algorithm='cd',
                                transform_algorithm='lasso_cd'):
    r"""
    Will learn a dictionary for the data (row-wise) and encode it, with the specified parameters.
    Returns the dictionary, components as a dataframe, and encoded data.

    By default, allows 20 words with alpha of 0.5.

    More info at : https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchDictionaryLearning.html

    >>> from pygtftk.stats.intersect.modl.dict_learning import test_data_for_modl
    >>> from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
    >>> import numpy as np
    >>> np.random.seed(42)
    >>> flags_matrix = test_data_for_modl(nflags = 1000, number_of_sets = 6)
    >>> import time
    >>> start_time = time.time()
    >>> U_df, V_df, error = learn_dictionary_and_encode(flags_matrix, n_atoms = 20, alpha = 0.5)
    >>> stop_time = time.time()

    """

    # TODO: Some operations here, such as recreating pandas DataFrames,
    # may not be necessary.

    data = np.array(data)  # Force cast as array

    # Cannot ask for more atoms than there are unique rows in the data
    n_atoms = min(n_atoms, len(np.unique(data, axis=0)))

    dico = MiniBatchDictionaryLearning(n_components=n_atoms,
                                       n_iter=n_iter,
                                       alpha=alpha,
                                       fit_algorithm=fit_algorithm,
                                       transform_algorithm=transform_algorithm,
                                       transform_alpha=alpha,
                                       positive_dict=True,
                                       positive_code=True,
                                       random_state=random_seed,
                                       n_jobs=n_jobs)

    # NOTE: fit_algorithm is used during learning, while transform_algorithm
    # encodes the data once the estimator has been fitted.
    # We use coordinate descent (CD) because LARS has trouble with correlated
    # features, and because we want to enforce positivity in both the
    # dictionary and the code for interpretability.

    dico.fit(data)  # Fit the data

    # Get the components (the dictionary).
    # NOTE: Use a try-except for future proofing, as sklearn (v 0.24) seems to
    # be deprecating 'components_' across the board in favour of 'dictionary'.
    try:
        V = dico.components_
    except AttributeError:
        V = dico.dictionary
    V_df = pd.DataFrame(V)

    # If alpha is ill-suited, the learned dictionary may fail to converge and
    # contain NaNs. If that happens, retry before bypassing the next steps.
    # NOTE: LARS is less vulnerable to this, so use it as a fallback before giving up.
    if np.isnan(V).any():
        message(
            "Dictionary learned by coordinate descent contained NaNs. Alpha may have been ill-suited. Falling back to LARS.",
            type='DEBUG')

        dico = MiniBatchDictionaryLearning(n_components=n_atoms,
                                           n_iter=n_iter,
                                           alpha=alpha,
                                           transform_alpha=alpha,
                                           random_state=random_seed,
                                           n_jobs=n_jobs,
                                           fit_algorithm='lars',
                                           transform_algorithm='lasso_lars')

        dico.fit(data)
        try:
            V = dico.components_
        except AttributeError:
            V = dico.dictionary
        V_df = pd.DataFrame(V)

        # Only abort if it still does not work
        if np.isnan(V).any():
            message("Fallback still contains NaNs. Aborting.", type='DEBUG')
            return None, V_df, None  # Return "U", V, "error"

    # Re-encode the data with this dictionary
    encoded_data = dico.transform(data)
    ed_df = pd.DataFrame(encoded_data)

    # Compute associated normalized L2 loss for reference
    reconstructed_features = np.matmul(encoded_data, V)
    error = np.sum((reconstructed_features - data)**2) / np.sum(data**2)

    return ed_df, V_df, error  # Return U, V, error
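# Hypothetical follow-up, reusing the doctest variables from the docstring above
# (the names below are illustrative assumptions):
#
#   U_df, V_df, error = learn_dictionary_and_encode(flags_matrix,
#                                                   n_atoms=20, alpha=0.5)
#   words = V_df.round(2)              # each row is a learned combination of sets
#   usage = (U_df != 0).sum(axis=0)    # how often each word appears across regions
#   print(error)                       # normalized L2 reconstruction error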