def RunSparseCodingScikit(q):
    totalTimer = Timer()

    # Load input dataset.
    inputData = np.genfromtxt(self.dataset[0], delimiter=',')
    dictionary = np.genfromtxt(self.dataset[1], delimiter=',')

    # Get all the parameters (raw string avoids the invalid "\d" escape).
    l = re.search(r"-l (\d+)", options)
    l = 0 if not l else int(l.group(1))

    try:
        with totalTimer:
            # Perform Sparse Coding.
            model = SparseCoder(dictionary=dictionary, transform_algorithm='lars',
                                transform_alpha=l)
            code = model.transform(inputData)
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def sparse_code(self, X, W): ''' Given data matrix X and dictionary matrix W, find code matrix H such that W*H approximates X args: X (numpy array): data matrix with dimensions: data_dim (d) x samples (n) W (numpy array): dictionary matrix with dimensions: data_dim (d) x topics (r) returns: H (numpy array): code matrix with dimensions: topics (r) x samples(n) ''' if DEBUG: print('sparse_code') print('X.shape:', X.shape) print('W.shape:', W.shape, '\n') # extract matrix dimensions from X, W # and initialize H with appropriate dimensions d, n = np.shape(X) d, r = np.shape(W) H = np.zeros([n, r]) # initialize the SparseCoder with W as its dictionary # then find H such that X \approx W*H coder = SparseCoder(dictionary=W.T, transform_n_nonzero_coefs=None, transform_alpha=2, transform_algorithm='lasso_lars', positive_code=False) H = coder.transform(X.T) # transpose H before returning to undo the preceding transpose on X return H.T
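# Illustrative usage sketch for the transpose convention used by sparse_code()
# above (synthetic data; all sizes are made-up assumptions): SparseCoder expects
# one atom per row and one sample per row, so a column-oriented X (d x n) and
# W (d x r) are passed transposed.
import numpy as np
from sklearn.decomposition import SparseCoder

d, n, r = 20, 50, 8
rng = np.random.RandomState(0)
W = rng.rand(d, r)                      # dictionary: d features x r atoms
X = W @ rng.rand(r, n)                  # data: d features x n samples

coder = SparseCoder(dictionary=W.T, transform_algorithm='lasso_lars',
                    transform_alpha=0.1)
H = coder.transform(X.T).T              # code: r atoms x n samples, X ~ W @ H
print(H.shape)                          # (8, 50)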
def main():
    images, labels = load_labeled_training(flatten=True)
    images = standardize(images)
    unl = load_unlabeled_training(flatten=True)
    unl = standardize(unl)
    test = load_public_test(flatten=True)
    test = standardize(test)
    shuffle_in_unison(images, labels)

    #d = DictionaryLearning().fit(images)
    d = MiniBatchDictionaryLearning(n_components=500, n_iter=500, verbose=True).fit(images)
    s = SparseCoder(d.components_)
    proj_test = s.transform(images)
    pt = s.transform(test)

    #kpca = KernelPCA(kernel="rbf")
    #kpca.fit(unl)
    #test_proj = kpca.transform(images)
    #pt = kpca.transform(test)

    #spca = SparsePCA().fit(unl)
    #test_proj = spca.transform(images)
    #pt = spca.transform(test)

    svc = SVC()
    scores = cross_validation.cross_val_score(svc, proj_test, labels, cv=10)
    print(scores)
    print(np.mean(scores))
    print(np.var(scores))

    svc.fit(proj_test, labels)
    pred = svc.predict(pt)
    write_results(pred, '../svm_res.csv')
def RunSparseCodingScikit(): totalTimer = Timer() # Load input dataset. inputData = np.genfromtxt(self.dataset[0], delimiter=',') dictionary = np.genfromtxt(self.dataset[1], delimiter=',') # Get all the parameters. opts = {} if "lambda" in options: opts["transform_alpha"] = options.pop("lambda") if "max_iterations" in options: opts["max_iter"] = options.pop("max_iterations") opts["transform_algorithm"] = "lars" opts["dictionary"] = dictionary if len(options) > 0: Log.Fatal("Unknown parameters: " + str(options)) raise Exception("unknown parameters") try: with totalTimer: # Perform Sparse Coding. model = SparseCoder(**opts) code = model.transform(inputData) except Exception as e: return -1 return totalTimer.ElapsedTime()
def get_activations(stft, dico, n_nonzero_coefs=None): coder = SparseCoder( dictionary=dico.T, transform_n_nonzero_coefs=n_nonzero_coefs, transform_algorithm="lasso_cd", positive_code=True) return coder.transform(stft.T).T
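# Usage sketch for get_activations() above (assumes the helper and its
# SparseCoder import are in scope; the spectrogram and dictionary below are
# synthetic). 'dico' holds one atom per column, matching the .T inside the
# function, and positive_code=True keeps the activations nonnegative, the
# usual choice for magnitude spectra.
import numpy as np

rng = np.random.RandomState(0)
n_bins, n_frames, n_atoms = 64, 100, 16
dico = np.abs(rng.randn(n_bins, n_atoms))    # spectral atoms as columns
stft = np.abs(rng.randn(n_bins, n_frames))   # magnitude spectrogram
acts = get_activations(stft, dico)
print(acts.shape)                            # (16, 100): one activation row per atom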
def omp_sparse(dictionary, train_data): dictionary = dictionary.transpose() w = SparseCoder(dictionary, transform_algorithm='omp') t = w.transform(train_data) return t
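# Note and sketch (synthetic data, sizes are assumptions): with
# transform_algorithm='omp' and transform_n_nonzero_coefs left as None (as in
# omp_sparse() above), scikit-learn defaults to roughly 10% of n_features
# nonzero coefficients per sample. OMP also assumes unit-norm atoms, so the
# dictionary rows are typically L2-normalized first.
import numpy as np
from sklearn.decomposition import SparseCoder
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
dictionary = normalize(rng.randn(30, 64))    # 30 unit-norm atoms of length 64
signals = rng.randn(5, 64)
codes = SparseCoder(dictionary, transform_algorithm='omp',
                    transform_n_nonzero_coefs=3).transform(signals)
print(codes.shape)                           # (5, 30), at most 3 nonzeros per row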
def RunSparseCodingScikit(q): totalTimer = Timer() # Load input dataset. inputData = np.genfromtxt(self.dataset[0], delimiter=',') dictionary = np.genfromtxt(self.dataset[1], delimiter=',') # Get all the parameters. opts = {} if "lambda" in options: opts["transform_alpha"] = options.pop("lambda") if "max_iterations" in options: opts["max_iter"] = options.pop("max_iterations") opts["transform_algorithm"] = "lars" opts["dictionary"] = dictionary if len(options) > 0: Log.Fatal("Unknown parameters: " + str(options)) raise Exception("unknown parameters") try: with totalTimer: # Perform Sparse Coding. model = SparseCoder(**opts) code = model.transform(inputData) except Exception as e: q.put(-1) return -1 time = totalTimer.ElapsedTime() q.put(time) return time
def predict(self, X_test): n_samples = X_test.shape[1] X = X_test D = self.D W = self.W D = np.nan_to_num(D) D1 = np.sum(np.abs(D)**2, axis=0)**(1. / 2) for i in range(D.shape[1]): D[:, i] = D[:, i] / D1[i] #print(D,X) print("predicting") coder = SparseCoder(dictionary=np.transpose(D), transform_n_nonzero_coefs=self.n_nonzero, transform_algorithm='omp') Z = coder.transform(np.transpose(X)) #print(Z.shape, Z) #print(np.count_nonzero(Z)) #print(np.count_nonzero(Z, axis=0)) pred = np.zeros((n_samples, )) for i in range(n_samples): pred[i] = np.argmax(np.dot(W, Z[i, :])) return pred
def joint_sparse_code_tensor(self, X, W): ''' Given data matrix X and dictionary matrix W, find code matrix H such that W*H approximates X args: X (numpy array): data matrix with dimensions: features (d) x samples (n) W (numpy array): dictionary matrix with dimensions: features (d) x topics (r) returns: H (numpy array): code matrix with dimensions: topics (r) x samples(n) ''' if DEBUG: print('sparse_code') print('X.shape:', X.shape) print('W.shape:', W.shape, '\n') # initialize the SparseCoder with W as its dictionary # then find H such that X \approx W*H if self.alpha == None: coder = SparseCoder(dictionary=W.T, transform_n_nonzero_coefs=None, transform_alpha=2, transform_algorithm='lasso_lars', positive_code=True) else: coder = SparseCoder(dictionary=W.T, transform_n_nonzero_coefs=None, transform_alpha=self.alpha, transform_algorithm='lasso_lars', positive_code=True) # alpha = L1 regularization parameter. H = coder.transform(X.T) # transpose H before returning to undo the preceding transpose on X return H
def sparse_coding_with_LSC(Y_labelled,D,H,W,gamma,lamda): _Y = np.vstack((Y_labelled, np.sqrt(gamma) * H)) _D = np.vstack((D, np.sqrt(gamma) * W)) coder = SparseCoder(dictionary=_D.T,transform_alpha=lamda/2., transform_algorithm='lasso_cd') X =(coder.transform(_Y.T)).T return X
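# Shape sketch for sparse_coding_with_LSC() above (assumes the function and its
# numpy / SparseCoder imports are in scope; all sizes are made up). Stacking the
# label matrix H under the data Y couples the reconstruction term ||Y - D X||
# with the label-consistency term gamma * ||H - W X|| in a single LASSO solve.
import numpy as np

d, n, r, c = 20, 15, 10, 4           # features, samples, atoms, classes
rng = np.random.RandomState(0)
Y = rng.rand(d, n)                   # labelled data, one sample per column
D = rng.rand(d, r)                   # dictionary, one atom per column
H = rng.rand(c, n)                   # label matrix
W = rng.rand(c, r)                   # classifier acting on sparse codes
X = sparse_coding_with_LSC(Y, D, H, W, gamma=1.0, lamda=0.1)
print(X.shape)                       # (10, 15): one sparse code per sample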
def test_sparse_coder():
    db = coco.COCO('/media/zawlin/ssd/coco/annotations/captions_train2014.json')
    ilsvrc_word2vec = zl.load('ilsvrc_word2vec')
    test_word2vec = zl.load('test_word2vec')
    D = []
    idx2word = []
    for word in ilsvrc_word2vec.keys():
        idx2word.append(word)
        D.append(ilsvrc_word2vec[word])
    idx2word = np.array(idx2word)
    D = np.array(D)
    print('loading word2vec')
    model = gensim.models.Word2Vec.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)
    coder = SparseCoder(dictionary=D, transform_n_nonzero_coefs=5,
                        transform_alpha=None, transform_algorithm='lasso_cd')
    #random.shuffle(db.anns)
    for k in db.anns:
        cap = db.anns[k]['caption']
        splited = cap.split(' ')
        for s in splited:
            if s.lower() in model:
                y = model[s.lower()]
                x = coder.transform(y.reshape(1, -1))[0]
                print('%s = %s' % (s.lower(), idx2word[np.argsort(x)[::-1]]))
                print(x[np.argsort(x)[::-1]])
                c = input('press q to quit')
                if c == 'q':
                    exit(0)
def compute_residual(query, test): """Compute the residual of the sparse coding representation (RSCR) Args: query (torch.Tensor): Query LR test (torch.Tensor): Test LR Returns: float: RSCR """ D = test.squeeze().detach().numpy() Y = query.detach().numpy() X = query if Y.ndim < 2: Y = Y.reshape(1, -1) if D.ndim < 2: D = D.reshape(1, -1) coder = SparseCoder(dictionary=D, transform_algorithm='lasso_lars') A = coder.transform(Y) A = torch.FloatTensor(A) D = torch.FloatTensor(D) RSCR = torch.norm(X - torch.mm(A, D)).item() return RSCR
def SparseRepresentation(X,D,A_mean,param,l1,transform_n_nonzero_coefs): if len(D.shape)>2: featureDim=D.shape[1] D=D.transpose(1,0,2).reshape((featureDim,-1)) # par = Params() # par.lambda1 = param.lambda1; # par.lambda2 = param.lambda2; # D=np.hstack((D[0],D[1],D[2],D[3],D[4])) # A_ini = np.ones((transform_n_nonzero_coefs, X.shape[1])) # par.A_mean = A_mean[0] # if D.shape[0] >= D.shape[1]: # w, v = LA.eig(D.T @ D) # par.c = 1.05 * w.max() # else: # w, v = LA.eig(D @ D.T) # par.c = 1.05 * w.max() # opts = Coef_Update_Test(X, D, A_ini, par) # A_test = opts.A # P = LA.inv(D.T @ D + l1 * np.eye(D.shape[1])) @ D.T # A_test = P @ X coder = SparseCoder(dictionary=D.T, transform_n_nonzero_coefs=transform_n_nonzero_coefs, transform_algorithm="omp") A_test = (coder.transform(X.T)).T A_test_nonzero=A_test # A_test_nonzero=np.empty((transform_n_nonzero_coefs, X.shape[1])) # for i in range(A_test.shape[1]): # A_test_nonzero[:,i] = A_test[:,i][A_test[:,i] != 0] return A_test_nonzero
def reconstruct_image_color(self, loading, recons_resolution=1, if_save=True): print('reconstructing given network...') ''' Reconstruct original color image using lerned CP dictionary atoms ''' A = self.data # A.shape = (row, col, 3) CPdict = self.out(loading) k = self.patch_size W = np.zeros(shape=(3 * k**2, self.n_components)) for j in np.arange(self.n_components): W[:, j] = CPdict.get('A' + str(j)).reshape(-1, 1)[:, 0] A_matrix = A.reshape(-1, A.shape[1]) # (row, col, 3) --> (3row, col) [m, n] = A_matrix.shape A_recons = np.zeros(shape=A.shape) A_overlap_count = np.zeros(shape=(A.shape[0], A.shape[1])) k = self.patch_size t0 = time() c = 0 num_rows = np.floor( (A_recons.shape[0] - k) / recons_resolution).astype(int) num_cols = np.floor( (A_recons.shape[1] - k) / recons_resolution).astype(int) for i in np.arange(0, A_recons.shape[0] - k, recons_resolution): for j in np.arange(0, A_recons.shape[1] - k, recons_resolution): patch = A[i:i + k, j:j + k, :] patch = patch.reshape((-1, 1)) coder = SparseCoder(dictionary=W.T, transform_n_nonzero_coefs=None, transform_alpha=1, transform_algorithm='lasso_lars', positive_code=True) # alpha = L1 regularization parameter. alpha=2 makes all codes zero (why?) code = coder.transform(patch.T) patch_recons = np.dot(W, code.T).T patch_recons = patch_recons.reshape(k, k, 3) # now paint the reconstruction canvas for x in itertools.product(np.arange(k), repeat=2): c = A_overlap_count[i + x[0], j + x[1]] A_recons[i + x[0], j + x[1], :] = (c * A_recons[i + x[0], j + x[1], :] + patch_recons[x[0], x[1], :]) / (c + 1) A_overlap_count[i + x[0], j + x[1]] += 1 # progress status print('reconstructing (%i, %i)th patch out of (%i, %i)' % (i / recons_resolution, j / recons_resolution, num_rows, num_cols)) print('Reconstructed in %.2f seconds' % (time() - t0)) print('A_recons.shape', A_recons.shape) if if_save: np.save('Video_dictionary/video_recons_color', A_recons) plt.imshow(A_recons) return A_recons
def parallel_sc(args): (dico, p) = args coder = SparseCoder(dictionary=dico, transform_algorithm='omp') #by default, number of non zero coefficients is 0.1 * n_features #Zeyde et al.: 3 non zero coefficients ! code = coder.transform(p).astype(np.float32) return code
def code1(self, data, max_iter=None, errors=False): ''' Sparse codes a single feature Requires that the dictionary is already trained ''' if self.codebook is None: self.codebook = SparseCoder(self.codebook_comps.T,n_jobs=4) return self.codebook.transform(data.reshape(1,-1)).ravel()
def test_sparse_coder_dtype_match(data_type, transform_algorithm): # Verify preserving dtype for transform in sparse coder n_components = 6 rng = np.random.RandomState(0) dictionary = rng.randn(n_components, n_features) coder = SparseCoder(dictionary.astype(data_type), transform_algorithm=transform_algorithm) code = coder.transform(X.astype(data_type)) assert code.dtype == data_type
def sparse_code_proximal(self, X, W, a1, a2): ''' Given data matrix X and dictionary matrix W, find code matrix H and noise matrix S such that H, S = argmin ||X - WH - S||_{F}^2 + \alpha ||H||_{1} + \beta ||S||_{1} Uses proximal gradient G = [H \\ S'] V = [W, b I] (so that VG = WH + bS') Then solve min_{G,V} |X-VG|_{F} + \alpha |G|_{1} = = min_{H,S'} |X - HW - bS'|_{F}^2 + \alpha |H|_{1} + \alpha |S'|_{1} = min_{H,S} |X - HW - S|_{F}^2 + \alpha |H|_{1} + (\alpha/b)|S|_{1} using constrained LASSO args: X (numpy array): data matrix with dimensions: features (d) x samples (n) W (numpy array): dictionary matrix with dimensions: features (d) x topics (r) returns: H (numpy array): code matrix with dimensions: topics (r) x samples(n) S (numpy array): noise matrix with dimensions: features (d) x samples (n) ''' if DEBUG: print('sparse_code') print('X.shape:', X.shape) print('W.shape:', W.shape, '\n') # initialize the SparseCoder with W as its dictionary # H_new = LASSO with W as dictionary # S_new = LASSO with id (d x d) as dictionary # Y_new = Y + (W H_new + S_new - S) : Dual variable ### Initialization d, n = X.shape r = self.n_components ### Augmented dictionary matrix for proximal gradient V = np.hstack((W, a2*np.identity(d))) ### Proximal sparse coding by constrained LASSO coder = SparseCoder(dictionary=V.T, transform_n_nonzero_coefs=None, transform_alpha=a1, transform_algorithm='lasso_lars', positive_code=True) G = coder.transform(X.T) G = G.T # transpose G before returning to undo the preceding transpose on X ### Read off H and S from V H = G[0:r, :] S = a2*G[r:, :] return H, S
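# Standalone sketch of the same augmented-dictionary trick used by
# sparse_code_proximal() above, with made-up sizes: stacking a scaled identity
# next to W lets a single LASSO solve return both the code H and a sparse
# noise estimate S.
import numpy as np
from sklearn.decomposition import SparseCoder

d, n, r = 12, 6, 4
rng = np.random.RandomState(0)
W = rng.rand(d, r)
X = W @ rng.rand(r, n) + 0.05 * (rng.rand(d, n) > 0.9)   # data plus sparse noise
a1, a2 = 0.1, 1.0
V = np.hstack((W, a2 * np.identity(d)))                  # augmented dictionary [W, a2*I]
G = SparseCoder(dictionary=V.T, transform_algorithm='lasso_lars',
                transform_alpha=a1, positive_code=True).transform(X.T).T
H, S = G[:r, :], a2 * G[r:, :]                           # code and sparse noise estimate
print(H.shape, S.shape)                                  # (4, 6) (12, 6)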
def sc_result_analysis():
    """
    Analyze the sparse coding results.
    :return:
    """
    sc_file = open('./tmp_file/30_dictionary.pickle', 'rb')
    sc_list = cPickle.load(sc_file)
    classified_file = open('./tmp_file/30_class_result.pickle', 'rb')
    (classified_feature, classified_patch) = cPickle.load(classified_file)
    model_file = open('./tmp_file/30_kmeans_pca_model.pickle', 'rb')
    (k_means, pca) = cPickle.load(model_file)
    sc_file.close()
    classified_file.close()
    model_file.close()
    # ========================================================================
    for i in range(5):
        k = i
        #v_feature = pca.transform(classified_feature[1][k]).reshape((-1,))
        v_feature = classified_feature[3][k]
        v_patch = classified_patch[3][k]
        feature_dict = sc_list[0][:, :144]
        patch_dict = sc_list[0][:, 144:]
        #v_feature = feature_dict[0]
        #v_patch = patch_dict[0]
        coder = SparseCoder(dictionary=feature_dict, transform_algorithm='omp',
                            transform_alpha=0.01, n_jobs=2,
                            transform_n_nonzero_coefs=1)
        weight = coder.transform(v_feature)
        v_patch = v_patch.reshape((9, 9))
        result = np.dot(weight, patch_dict).reshape((9, 9))
        mask = weight != 0
        print(weight[mask])
        mask = mask[0]
        print(len(patch_dict[mask]))
        print(len(patch_dict[mask]))
        patch_show(patch_dict[mask], [0, 0, 0.45, 0.45], 1)
        ax2 = plt.axes([0, 0.5, 0.45, 0.45])
        ax2.imshow(result, interpolation="none", cmap=cm.gray)
        ax2 = plt.axes([0.5, 0.5, 0.45, 0.45])
        ax2.imshow(v_patch, interpolation="none", cmap=cm.gray)
        plt.show()
def train(DWA_all, D_all, W_all, A_all, Cs, labels, file_paths, inds_of_file_path, train_number, start_init_number, update_times, update_index, n_classes, n_atoms, n_features, lambda_init, the_lambda, transform_n_nonzero_coefs, omp_tag): for j in range(n_classes): if j == 0: print(update_index) sys.stdout.flush() coder = SparseCoder( dictionary=D_all.T, transform_n_nonzero_coefs=transform_n_nonzero_coefs, transform_algorithm='omp') label_indexs_for_update = inds_of_file_path[j][:train_number] new_index = [ label_indexs_for_update[(update_index + start_init_number) % train_number] ] new_label = labels[new_index][0] lab_index = j im_vec = load_img(file_paths[new_index][0]) im_vec = im_vec / 255. new_y = np.array(im_vec, dtype=float) new_y = preprocessing.normalize(new_y.T, norm='l2').T new_y = norm_Ys(new_y) new_y = new_y.reshape(n_features, 1) new_h = np.zeros((n_classes, 1)) new_h[lab_index, 0] = 1 new_q = np.zeros((n_atoms * n_classes, 1)) new_q[n_atoms * lab_index:n_atoms * (lab_index + 1), 0] = 1 new_yhq = np.vstack((new_y, new_h, new_q)) new_x = None if omp_tag == "true": new_x = (coder.transform(new_y.T)).T if omp_tag == "wzz": new_x = transform(D_all, new_y, transform_n_nonzero_coefs) the_C = Cs the_u = (1 / the_lambda) * np.dot(the_C, new_x) gamma = 1 / (1 + np.dot(new_x.T, the_u)) the_r = new_yhq - np.dot(DWA_all, new_x) new_C = (1 / the_lambda) * the_C - gamma * np.dot(the_u, the_u.T) new_DWA = DWA_all + gamma * np.dot(the_r, the_u.T) DWA_all = new_DWA part_lambda = (1 - update_index / update_times) the_lambda = 1 - (1 - lambda_init) * part_lambda * part_lambda * part_lambda D_all = DWA_all[0:D_all.shape[0], :] W_all = DWA_all[D_all.shape[0]:D_all.shape[0] + W_all.shape[0], :] A_all = DWA_all[D_all.shape[0] + W_all.shape[0]:, :] D_all = preprocessing.normalize(D_all.T, norm='l2').T W_all = preprocessing.normalize(W_all.T, norm='l2').T A_all = preprocessing.normalize(A_all.T, norm='l2').T DWA_all = np.vstack((D_all, W_all, A_all)) return DWA_all, D_all, W_all, A_all, the_lambda
def predict_svm(self, img): gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) gray = cv2.resize(gray, (100,100)) gray = np.array(gray).reshape(1,100*100) #X = self.pca.transform(gray) D = np.array(self.pca.components_) dictionary = SparseCoder(D, transform_algorithm='omp', transform_n_nonzero_coefs=10, transform_alpha=None) features = dictionary.transform(gray) label = self.svm.predict(features) score = None return label, score
def sparse_codifier(y, D, transform_algo='omp', transform_n_nonzero_coefs=2): """ Encodes an input vector y into an output sparse vector x, based on a given dictionary D Retired, to delete. """ print('y.shape = %s ' % (y.shape, )) print('Dictionary shape = %s' % (D.shape, )) coder = SparseCoder(dictionary=D, transform_algorithm=transform_algo) x = coder.transform(y) print('x.shape = %s' % (x.shape, )) return x
def reconstruct_image_color(self, path, recons_resolution=1): print('reconstructing given network...') ''' Note: For WAN data, the algorithm reconstructs the normalized WAN matrix A/np.max(A). Scale the reconstructed matrix B by np.max(A) and compare with the original network. ''' A = self.read_img_as_array(path) # A.shape = (row, col, 3) A_matrix = A.reshape(-1, A.shape[1]) # (row, col, 3) --> (3row, col) [m, n] = A_matrix.shape A_recons = np.zeros(shape=A.shape) A_overlap_count = np.zeros(shape=(A.shape[0], A.shape[1])) k = self.patch_size t0 = time() c = 0 num_rows = np.floor( (A_recons.shape[0] - k) / recons_resolution).astype(int) num_cols = np.floor( (A_recons.shape[1] - k) / recons_resolution).astype(int) for i in np.arange(0, A_recons.shape[0] - k, recons_resolution): for j in np.arange(0, A_recons.shape[1] - k, recons_resolution): patch = A[i:i + k, j:j + k, :] patch = patch.reshape((-1, 1)) # print('patch.shape', patch.shape) coder = SparseCoder(dictionary=self.W.T, transform_n_nonzero_coefs=None, transform_alpha=1, transform_algorithm='lasso_lars', positive_code=True) # alpha = L1 regularization parameter. alpha=2 makes all codes zero (why?) code = coder.transform(patch.T) patch_recons = np.dot(self.W, code.T).T patch_recons = patch_recons.reshape(k, k, 3) # now paint the reconstruction canvas for x in itertools.product(np.arange(k), repeat=2): c = A_overlap_count[i + x[0], j + x[1]] A_recons[i + x[0], j + x[1], :] = (c * A_recons[i + x[0], j + x[1], :] + patch_recons[x[0], x[1], :]) / (c + 1) A_overlap_count[i + x[0], j + x[1]] += 1 # progress status print('reconstructing (%i, %i)th patch out of (%i, %i)' % (i / recons_resolution, j / recons_resolution, num_rows, num_cols)) print('Reconstructed in %.2f seconds' % (time() - t0)) print('A_recons.shape', A_recons.shape) np.save('Image_dictionary/img_recons_color', A_recons) plt.imshow(A_recons) return A_recons
def __sparse_encode(self, D, test_X): """ Z.shape = (test_size, atoms_size=dict_size) """ coder = SparseCoder(dictionary=D, transform_algorithm='lasso_cd', transform_alpha=self.coder_alpha) coder.fit(test_X) Z = coder.transform(test_X) return Z
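# Note (self-contained sketch, synthetic data): SparseCoder is stateless -- the
# dictionary is fixed at construction and fit() is a no-op that returns the
# estimator unchanged -- so the coder.fit(test_X) call above is harmless but
# unnecessary; transform() alone gives the same codes.
import numpy as np
from sklearn.decomposition import SparseCoder

rng = np.random.RandomState(0)
D = rng.randn(8, 16)
X = rng.randn(4, 16)
coder = SparseCoder(D, transform_algorithm='lasso_cd', transform_alpha=0.5)
np.testing.assert_allclose(coder.fit(X).transform(X), coder.transform(X))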
def reconstruct_network(self, path, recons_iter=100): print('reconstructing given network...') ''' Note: For WAN data, the algorithm reconstructs the normalized WAN matrix A/np.max(A). Scale the reconstructed matrix B by np.max(A) and compare with the original network. ''' A = self.A [N, N] = A.shape A_recons = np.zeros(shape=(N, N)) A_overlap_count = np.zeros(shape=(N, N)) B = self.path_adj(self.k1, self.k2) k = self.k1 + self.k2 + 1 # size of the network patch x0 = np.random.choice(np.arange(0, N)) emb = self.tree_sample(B, x0) t0 = time() c = 0 for t in np.arange(recons_iter): patch, emb = self.get_single_patch_glauber(B, emb) coder = SparseCoder(dictionary=self.W.T, transform_n_nonzero_coefs=None, transform_alpha=0, transform_algorithm='lasso_lars', positive_code=True) # alpha = L1 regularization parameter. alpha=2 makes all codes zero (why?) # This only occurs when sparse coding a single array code = coder.transform(patch.T) patch_recons = np.dot(self.W, code.T).T patch_recons = patch_recons.reshape(k, k) for x in itertools.product(np.arange(k), repeat=2): a = emb[x[0]] b = emb[x[1]] j = A_overlap_count[a, b] A_recons[a, b] = (j * A_recons[a, b] + patch_recons[x[0], x[1]]) / (j + 1) # A_recons[a,b] = A_recons[a,b] + patch_recons[x[0], x[1]] A_overlap_count[a, b] += 1 # progress status if 100 * t / recons_iter % 1 == 0: print(t / recons_iter * 100) print('Reconstructed in %.2f seconds' % (time() - t0)) np.save( 'Network_dictionary/WAN/twain_recons' + "_" + str(self.k1) + str(self.k2) + "_" + str(self.n_components), A_recons) return A_recons
def F(self, x, B, A=None): ''' Calculates a gradient-derived matrix (A-prime) based on a known dictionary and a known signal matrix Parameters ----------- x : numpy array: base data used for reconstruction B : numpy array: Dictionary A : numpy array: Singal matrix Return ------ acts: numpy array: derived matrix (A-prime) ''' # 4b B = np.asarray(B) A = np.asarray(A) coder = SparseCoder(dictionary=B.T, transform_alpha=self.rp, transform_algorithm='lasso_cd') comps, acts = librosa.decompose.decompose(x, transformer=coder) acts = self._pos_constraint(acts) return acts
def generateWalk2vecSC(D, times, n, vs, l): coder = SparseCoder(dictionary=D.components_, transform_alpha=l) path = input("Enter filename to save datasets...\n") with open(path, 'w') as f: try: for i in range(times): v = coder.transform(vs[i * n:(i + 1) * n]) v = np.average(v, axis=0) f.write(str(v.tolist()) + '\n') sys.stdout.write("\r{}/{} finished".format(i + 1, times)) except: os.remove(path) traceback.print_exc() exit(1) print("\rData successfully generated")
def test_sparse_coder_estimator_clone(): n_components = 12 rng = np.random.RandomState(0) V = rng.randn(n_components, n_features) # random init V /= np.sum(V**2, axis=1)[:, np.newaxis] coder = SparseCoder(dictionary=V, transform_algorithm='lasso_lars', transform_alpha=0.001) cloned = clone(coder) assert id(cloned) != id(coder) np.testing.assert_allclose(cloned.dictionary, coder.dictionary) assert id(cloned.dictionary) != id(coder.dictionary) assert cloned.n_components_ == coder.n_components_ assert cloned.n_features_in_ == coder.n_features_in_ data = np.random.rand(n_samples, n_features).astype(np.float32) np.testing.assert_allclose(cloned.transform(data), coder.transform(data))
def predict_joint_single(self, data, a1): k = self.patch_size L = self.prediction_length A = data # A.shape = (self.data.shape[0], k-L, self.data.shape[2]) # A_recons = np.zeros(shape=(A.shape[0], k, A.shape[2])) # W_tensor = self.W.reshape((k, A.shape[0], -1)) # print('A.shape', A.shape) W_tensor = self.W.reshape( (self.data.shape[0], k, self.data.shape[2], -1)) # print('W.shape', W_tensor.shape) # for missing data, not needed for the COVID-19 data set # extract only rows of nonnegative values (disregarding missing entries) (negative = N/A) J = np.where(np.min(A, axis=(0, 1)) >= -1) A_pos = A[:, :, J] # print('A_pos', A_pos) # print('np.min(A)', np.min(A)) W_tensor = W_tensor[:, :, J, :] W_trimmed = W_tensor[:, 0:k - L, :, :] W_trimmed = W_trimmed.reshape((-1, self.n_components)) patch = A_pos # print('patch', patch) patch = patch.reshape((-1, 1)) # print('patch.shape', patch.shape) # print('patch', patch) coder = SparseCoder(dictionary=W_trimmed.T, transform_n_nonzero_coefs=None, transform_alpha=a1, transform_algorithm='lasso_lars', positive_code=True) # alpha = L1 regularization parameter code = coder.transform(patch.T) patch_recons = np.dot( self.W, code.T).T # This gives prediction on the last L missing entries patch_recons = patch_recons.reshape(-1, k, A.shape[2]) # now paint the reconstruction canvas # only add the last predicted value A_recons = patch_recons[:, k - 1, :] return A_recons[:, np.newaxis, :]
def sparse_code_affine(self, X, W, a1, num_blocks): ''' Given data matrix X and dictionary matrix W, find code matrix H and affine translations for each blocks in X so that X \approx WH + block-translation. For the case when X has a single block, this is X \approx WH + bI. Use alternating optimization -- fix b, find H by LASSO; fix H, find b by MSE, which gives b = mean(X-WH). args: X (numpy array): data matrix with dimensions: features (d) x samples (n) W (numpy array): dictionary matrix with dimensions: features (d) x topics (r) returns: H (numpy array): code matrix with dimensions: topics (r) x samples(n) S (numpy array): noise matrix with dimensions: features (d) x samples (n). (d) rows are partitioned into "num_blocks" blocks, in which the entries of S are constant. ''' if DEBUG: print('sparse_code') print('X.shape:', X.shape) print('W.shape:', W.shape, '\n') # initialize the SparseCoder with W as its dictionary # H_new = LASSO with W as dictionary # S_new = LASSO with id (d x d) as dictionary # Y_new = Y + (W H_new + S_new - S) : Dual variable block_iter = 1 nb = num_blocks H = [] S = np.zeros(shape=X.shape) for step in np.arange(block_iter): ### Optimize H by constrained LASSO coder = SparseCoder(dictionary=W.T, transform_n_nonzero_coefs=None, transform_alpha=a1, transform_algorithm='lasso_lars', positive_code=True) H = coder.transform((X - S).T) H = H.T ### Optimiez S by solving MSE l = np.floor(X.shape[0]/nb).astype(int) for i in np.arange(l): Y = X - W @ H print('Y', np.sum(Y)) print('S', np.sum(S)) S[nb*i : nb*(i+1), 0] = np.mean(Y[nb*i : nb*(i+1), 0]) # solution to block-MSE S[nb * l:, 0] = np.mean((X - W @ H)[nb * l:, 0]) # solution to block-MSE return H, S
def test_sparse_coder_parallel_mmap(): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/5956 # Test that SparseCoder does not error by passing reading only # arrays to child processes rng = np.random.RandomState(777) n_components, n_features = 40, 64 init_dict = rng.rand(n_components, n_features) # Ensure that `data` is >2M. Joblib memory maps arrays # if they are larger than 1MB. The 4 accounts for float32 # data type n_samples = int(2e6) // (4 * n_features) data = np.random.rand(n_samples, n_features).astype(np.float32) sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) sc.fit_transform(data)
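# Sketch (synthetic data, illustrative sizes): n_jobs only parallelizes
# transform() across chunks of samples, so the codes match the single-process
# result.
import numpy as np
from sklearn.decomposition import SparseCoder
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
D = normalize(rng.rand(20, 32))
X = rng.rand(200, 32).astype(np.float32)
serial = SparseCoder(D, transform_algorithm='omp').transform(X)
parallel = SparseCoder(D, transform_algorithm='omp', n_jobs=2).transform(X)
np.testing.assert_allclose(serial, parallel, rtol=1e-5, atol=1e-7)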
def cluster_sk_sparse_coder(content):
    """ x """
    # Note: SparseCoder takes a fixed dictionary and accepts none of the
    # arguments below, nor does it expose components_/error_/n_iter_. The
    # parameter set used here matches sklearn's SparsePCA, which is assumed to
    # be the intended estimator.
    _config = SparsePCA(n_components=content['n_components'],
                        alpha=content['alpha'],
                        ridge_alpha=content['ridge_alpha'],
                        max_iter=content['max_iter'],
                        tol=content['tol'],
                        method=content['sk_method'],
                        n_jobs=-1)
    _result = _config.fit_transform(content['data'])
    return httpWrapper(
        json.dumps({
            'result': _result.tolist(),
            'components': _config.components_.tolist(),
            'error': _config.error_,
            'iter': _config.n_iter_
        }))
def test_max_iter(): def ricker_function(resolution, center, width): """Discrete sub-sampled Ricker (Mexican hat) wavelet""" x = np.linspace(0, resolution - 1, resolution) x = ( (2 / (np.sqrt(3 * width) * np.pi**0.25)) * (1 - (x - center) ** 2 / width**2) * np.exp(-((x - center) ** 2) / (2 * width**2)) ) return x def ricker_matrix(width, resolution, n_components): """Dictionary of Ricker (Mexican hat) wavelets""" centers = np.linspace(0, resolution - 1, n_components) D = np.empty((n_components, resolution)) for i, center in enumerate(centers): D[i] = ricker_function(resolution, center, width) D /= np.sqrt(np.sum(D**2, axis=1))[:, np.newaxis] return D transform_algorithm = "lasso_cd" resolution = 1024 subsampling = 3 # subsampling factor n_components = resolution // subsampling # Compute a wavelet dictionary D_multi = np.r_[ tuple( ricker_matrix( width=w, resolution=resolution, n_components=n_components // 5 ) for w in (10, 50, 100, 500, 1000) ) ] X = np.linspace(0, resolution - 1, resolution) first_quarter = X < resolution / 4 X[first_quarter] = 3.0 X[np.logical_not(first_quarter)] = -1.0 X = X.reshape(1, -1) # check that the underlying model fails to converge with pytest.warns(ConvergenceWarning): model = SparseCoder( D_multi, transform_algorithm=transform_algorithm, transform_max_iter=1 ) model.fit_transform(X) # check that the underlying model converges w/o warnings with pytest.warns(None) as record: model = SparseCoder( D_multi, transform_algorithm=transform_algorithm, transform_max_iter=2000 ) model.fit_transform(X) assert not [w.message for w in record]
def genD_X(p): Site = p[0] Y = p[1][0] D = p[1][1] n_nonzero = 1 algo = 'omp' #Y_mod=np.reshape(Y, (1,N)) Y_mod = Y.T D_mod = D.T #D = np.random.randn(K,N) coder = SparseCoder(dictionary=D_mod, transform_n_nonzero_coefs=n_nonzero, transform_alpha=None, transform_algorithm=algo) X = coder.transform(Y_mod) X_mod = X.T #X-returned has shape (k,s) #shape of Y is (n,s) while D is (n,k) return (Site, (Y,D,X_mod))
def slp_train_svm(folder, output):
    path = str.format('{0}/*', folder)
    data = []
    cls = []
    c = 0
    for folder in glob.glob(path):
        for img_path in glob.glob(folder + '/*.jpg'):
            print('Loading:', img_path)
            img = cv2.imread(img_path, 0)
            img = np.array(img).reshape(img.shape[0] * img.shape[1])
            data.append(img)
            cls.append(c)
        c = c + 1  # next folder gets the next class label
    data = np.array(data)
    pca = PCA(n_components=10)
    pca.fit(data)
    D = np.array(pca.components_)
    dictionary = SparseCoder(D, transform_algorithm='omp',
                             transform_n_nonzero_coefs=10, transform_alpha=None)
    features = dictionary.transform(data)
    svm = LinearSVC()
    svm.fit(features, cls)
    pickle.dump(svm, open(output, 'wb'))
class SMHClassifier(BaseEstimator): """ SMH-based classifier. """ def __init__(self, tuple_size=3, n_tuples=692, wcc=None, ovr_thres=0.7): self.tuple_size = tuple_size if wcc: self.wcc = wcc self.n_tuples = log(0.5) / log(1.0 - pow(wcc, tuple_size)) else: self.n_tuples = n_tuples def discover_topics(self, X, tuple_size=3, n_tuples=692, weights=True, expand=True, thres=0.7, cutoff=3): """ Discovers topics from a text corpus. """ ifs = array_to_listdb(X) mined = ifs.mine(tuple_size=tuple_size, num_tuples=n_tuples, weights=weights, expand=expand) mined.cutoff(min=cutoff) models = mined.cluster_mhlink(thres=thres) return models def fit(self, X, tuple_size=3, n_tuples=692, weights=True, expand=True, thres=0.7, cutoff=3): """ Discovers topics and used them as a dictionary for sparse-coding. """ models = self.discover_topics(X, tuple_size=tuple_size, n_tuples=n_tuples, weights=weights, expand=expand, thres=thres, cutoff=cutoff) self.coder = SparseCoder(dictionary=normalize(models.toarray()), transform_algorithm='lasso_lars', split_sign=True, n_jobs=4) def fit_transform(self, X, tuple_size=3, n_tuples=692, weights=None, expand=None, thres=0.7, cutoff=3): """ Discovers topics and used them as a dictionary to sparse-code the documents. """ self.fit(X, tuple_size=tuple_size, n_tuples=n_tuples, weights=weights, expand=expand, thres=thres, cutoff=cutoff) return self.coder.fit_transform(X.todense()) def transform(self, X): """ Sparse-code a given set of documents from the discovered topics. """ return self.coder.transform(X.todense())
def scskl_reconstruction(data,mask,D): output = np.zeros(data.shape) fmap = np.zeros((D.shape[0])) #fdata = np.zeros((data.shape[0],data.shape[1],data.shape[2],D.shape[0])) px = np.int(np.around(np.power(D.shape[1],1/3))) #patch size (is assumed to be isotropic) hpx = np.floor(px/2).astype(int) nblock = 2 # number of block per dimension subsize = np.ceil(np.array(data.shape) / nblock).astype(int) med = np.median(data) currentblock = 1 for x in range(np.ceil(data.shape[0]/subsize[0]).astype(int)): xmin = x*subsize[0] xmax = np.min((data.shape[0],(x+1)*subsize[0])) for y in range(np.ceil(data.shape[1]/subsize[1]).astype(int)): ymin = y*subsize[1] ymax = np.min((data.shape[1],(y+1)*subsize[1])) for z in range(np.ceil(data.shape[2]/subsize[2]).astype(int)): zmin = z*subsize[2] zmax = np.min((data.shape[2],(z+1)*subsize[2])) print('Processing block : ',currentblock) currentblock+=1 #Enlarge subimage to take into account block effect due to non-overlapping patches xmin2 = np.max((0,xmin-hpx)) xmax2 = np.min((data.shape[0],xmax+hpx)) ymin2 = np.max((0,ymin-hpx)) ymax2 = np.min((data.shape[1],ymax+hpx)) zmin2 = np.max((0,zmin-hpx)) zmax2 = np.min((data.shape[2],zmax+hpx)) subdata = data[xmin2:xmax2,ymin2:ymax2,zmin2:zmax2] submask = mask[xmin2:xmax2,ymin2:ymax2,zmin2:zmax2] p = mp.array_to_patches(subdata,patch_shape=(px,px,px),normalization=False) pm = mp.array_to_patches(submask,patch_shape=(px,px,px),normalization=False) #remove patch we dont want to process index = ~np.all(pm==0,axis=1) subp = p[index] subp -= med if subp.shape[0] > 0: print('Number of patches to process: ',subp.shape[0]) #Currently, there is a bug when using n_jobs>1 (https://github.com/scikit-learn/scikit-learn/issues/5956) coder = SparseCoder(dictionary=D, transform_algorithm='omp') code = coder.transform(subp).astype(np.float32) fmap += np.sum((np.fabs(code)>0),axis=0) subp = np.dot(code, D) subp += med p[index] = subp suboutput = mp.patches_to_array(patches=p, patch_shape=(px,px,px), array_shape=subdata.shape) tmpoutput = np.empty(data.shape) tmpoutput[xmin2:xmax2,ymin2:ymax2,zmin2:zmax2]= suboutput output[xmin:xmax,ymin:ymax,zmin:zmax] = tmpoutput[xmin:xmax,ymin:ymax,zmin:zmax] # for a in range(D.shape[0]): # for s in range(subp.shape[0]): # subp[s,:] = code[s,a] # p.fill(0) # p[index] = subp # fa = mp.patches_to_array(patches=p, patch_shape=(px,px,px), array_shape=subdata.shape) # to = np.empty(data.shape) # to[xmin2:xmax2,ymin2:ymax2,zmin2:zmax2]= fa # fdata[xmin:xmax,ymin:ymax,zmin:zmax,a] = to[xmin:xmax,ymin:ymax,zmin:zmax] #plt.bar(range(0,D.shape[0]), fmap) #print('points in mask: ',np.sum(mask!=0)) #print('Number of non zero elements: ',np.sum(fmap)/np.sum(mask!=0)) #plt.show() # return (output,fdata) return output
from scipy.io import wavfile from scikits.talkbox import segment_axis resolution = 160 step = 8 b = 1.019 n_channels = 64 overlap = 80 # Compute a multiscale dictionary D_multi = np.r_[tuple(gammatone_matrix(b, fc, resolution, step) for fc in erb_space(150, 8000, n_channels))] # Load test signal fs, y = wavfile.read('/home/jfsantos/data/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV') y = y / 2.0**15 Y = segment_axis(y, resolution, overlap=overlap, end='pad') Y = np.hanning(resolution) * Y # segments should be windowed and overlap coder = SparseCoder(dictionary=D_multi, transform_n_nonzero_coefs=None, transform_alpha=1., transform_algorithm='omp') X = coder.transform(Y) density = len(np.flatnonzero(X)) out= np.zeros((np.ceil(len(y)/resolution)+1)*resolution) for k in range(0, len(X)): idx = range(k*(resolution-overlap),k*(resolution-overlap) + resolution) out[idx] += np.dot(X[k], D_multi) squared_error = np.sum((y - out[0:len(y)]) ** 2) wavfile.write('reconst_%d_%d.wav'%(resolution,overlap), fs, np.asarray(out, dtype=np.float32))
class TIMITSparseGenerator(Dataset): """ Frame-based TIMIT dataset """ _default_seed = (17, 2, 946) # Mean and standard deviation of the acoustic samples from the whole # dataset (train, valid, test). _mean = 0.0035805809921434142 _std = 542.48824133746177 def __init__( self, which_set, frame_length, overlap=0.5, frames_per_example=1, start=0, stop=None, audio_only=True, n_prev_phones=0, n_next_phones=0, samples_to_predict=1, filter_fn=None, rng=_default_seed, b=1.019, step=64, n_channels=64, ): """ Parameters ---------- which_set : str Either "train", "valid" or "test" frame_length : int Number of acoustic samples contained in a frame overlap : int, optional Number of overlapping acoustic samples for two consecutive frames. Defaults to 0, meaning frames don't overlap. frames_per_example : int, optional Number of frames in a training example. Defaults to 1. start : int, optional Starting index of the sequences to use. Defaults to 0. stop : int, optional Ending index of the sequences to use. Defaults to `None`, meaning sequences are selected all the way to the end of the array. audio_only : bool, optional Whether to load only the raw audio and no auxiliary information. Defaults to `False`. rng : object, optional A random number generator used for picking random indices into the design matrix when choosing minibatches. """ self.frame_length = frame_length if overlap < 1.0: self.overlap = overlap * frame_length else: self.overlap = overlap self.frames_per_example = frames_per_example self.offset = self.frame_length - self.overlap self.audio_only = audio_only self.n_prev_phones = n_prev_phones self.n_next_phones = n_next_phones self.samples_to_predict = samples_to_predict self.b = b self.step = step self.n_channels = n_channels print "Frame length %d, overlap %d" % (self.frame_length, self.overlap) # Initializing the dictionary self.D = numpy.r_[ tuple( gammatone_matrix(self.b, fc, self.frame_length, self.step) for fc in erb_space(150, 8000, self.n_channels) ) ] print "Using dictionary with shape", self.D.shape self.coder = SparseCoder( dictionary=self.D, transform_n_nonzero_coefs=None, transform_alpha=None, transform_algorithm="omp" ) # RNG initialization if hasattr(rng, "random_integers"): self.rng = rng else: self.rng = numpy.random.RandomState(rng) # Load data from disk self._load_data(which_set) examples_per_sequence = [0] for sequence_id, samples_sequence in enumerate(self.raw_wav): print "Sentence %d/%d" % (sequence_id, len(self.raw_wav)) X = segment_axis(samples_sequence, frame_length, overlap, end="pad") X = numpy.hanning(self.frame_length) * X self.raw_wav[sequence_id] = scipy.sparse.csr_matrix(self.coder.transform(X)) # TODO: change me # Generate features/targets/phones/phonemes/words map num_frames = self.raw_wav[sequence_id].shape[0] num_examples = num_frames - self.frames_per_example examples_per_sequence.append(num_examples) self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence) self.samples_sequences = self.raw_wav numpy.save("%s_sparse_frames.npy" % which_set, self.samples_sequences) self.num_examples = self.cumulative_example_indexes[-1] # DataSpecs features_space = VectorSpace(dim=self.D.shape[0] * self.frames_per_example) features_source = "features" def features_map_fn(indexes): rval = [] for sequence_index, example_index in self._fetch_index(indexes): rval.append( self.samples_sequences[sequence_index][ example_index : example_index + self.frames_per_example ].todense() ) return rval targets_space = VectorSpace(dim=self.D.shape[0]) targets_source = 
"targets" def targets_map_fn(indexes): rval = [] for sequence_index, example_index in self._fetch_index(indexes): rval.append(self.samples_sequences[sequence_index][example_index + self.frames_per_example].todense()) return rval space_components = [features_space, targets_space] source_components = [features_source, targets_source] map_fn_components = [features_map_fn, targets_map_fn] batch_components = [None, None] space = CompositeSpace(space_components) source = tuple(source_components) self.data_specs = (space, source) self.map_functions = tuple(map_fn_components) self.batch_buffers = batch_components # Defaults for iterators self._iter_mode = resolve_iterator_class("shuffled_sequential") self._iter_data_specs = (CompositeSpace((features_space, targets_space)), (features_source, targets_source)) def _fetch_index(self, indexes): digit = numpy.digitize(indexes, self.cumulative_example_indexes) - 1 return zip(digit, numpy.array(indexes) - self.cumulative_example_indexes[digit]) def _load_data(self, which_set): """ Load the TIMIT data from disk. Parameters ---------- which_set : str Subset of the dataset to use (either "train", "valid" or "test") """ # Check which_set if which_set not in ["train", "valid", "test"]: raise ValueError( which_set + " is not a recognized value. " + "Valid values are ['train', 'valid', 'test']." ) # Create file paths timit_base_path = os.path.join(os.environ["PYLEARN2_DATA_PATH"], "timit/readable") raw_wav_path = os.path.join(timit_base_path, which_set + "_x_raw.npy") # Load data. For now most of it is not used, as only the acoustic # samples are provided, but this is bound to change eventually. # Set-related data self.raw_wav = serial.load(raw_wav_path) # self.scaler = serial.load(scaler_path) def _validate_source(self, source): """ Verify that all sources in the source tuple are provided by the dataset. Raise an error if some requested source is not available. Parameters ---------- source : `tuple` of `str` Requested sources """ for s in source: try: self.data_specs[1].index(s) except ValueError: raise ValueError("the requested source named '" + s + "' " + "is not provided by the dataset") def get_data_specs(self): """ Returns the data_specs specifying how the data is internally stored. This is the format the data returned by `self.get_data()` will be. .. note:: Once again, this is very hacky, as the data is not stored that way internally. However, the data that's returned by `TIMIT.get()` _does_ respect those data specs. """ return self.data_specs def get(self, source, indexes): """ .. todo:: WRITEME """ if type(indexes) is slice: indexes = numpy.arange(indexes.start, indexes.stop) self._validate_source(source) rval = [] for so in source: batch = self.map_functions[self.data_specs[1].index(so)](indexes) batch_buffer = self.batch_buffers[self.data_specs[1].index(so)] dim = self.data_specs[0].components[self.data_specs[1].index(so)].dim if batch_buffer is None or batch_buffer.shape != (len(batch), dim): batch_buffer = numpy.zeros((len(batch), dim), dtype=batch[0].dtype) for i, example in enumerate(batch): batch_buffer[i] = example rval.append(batch_buffer) return tuple(rval) @functools.wraps(Dataset.iterator) def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None, data_specs=None, return_tuple=False): """ .. todo:: WRITEME """ if data_specs is None: data_specs = self._iter_data_specs # If there is a view_converter, we have to use it to convert # the stored data for "features" into one that the iterator # can return. 
space, source = data_specs if isinstance(space, CompositeSpace): sub_spaces = space.components sub_sources = source else: sub_spaces = (space,) sub_sources = (source,) convert = [] for sp, src in safe_zip(sub_spaces, sub_sources): convert.append(None) # TODO: Refactor if mode is None: if hasattr(self, "_iter_subset_class"): mode = self._iter_subset_class else: raise ValueError("iteration mode not provided and no default " "mode set for %s" % str(self)) else: mode = resolve_iterator_class(mode) if batch_size is None: batch_size = getattr(self, "_iter_batch_size", None) if num_batches is None: num_batches = getattr(self, "_iter_num_batches", None) if rng is None and mode.stochastic: rng = self.rng return FiniteDatasetIterator( self, mode(self.num_examples, batch_size, num_batches, rng), data_specs=data_specs, return_tuple=return_tuple, convert=convert, )
# List the different sparse coding methods in the following format: # (title, transform_algorithm, transform_alpha, transform_n_nozero_coefs) estimators = [('OMP', 'omp', None, 15, 'navy'), ('Lasso', 'lasso_cd', 2, None, 'turquoise'), ] lw = 2 plt.figure(figsize=(13, 6)) for subplot, (D, title) in enumerate(zip((D_fixed, D_multi), ('fixed width', 'multiple widths'))): plt.subplot(1, 2, subplot + 1) plt.title('Sparse coding against %s dictionary' % title) plt.plot(y, lw=lw, linestyle='--', label='Original signal') # Do a wavelet approximation for title, algo, alpha, n_nonzero, color in estimators: coder = SparseCoder(dictionary=D, transform_n_nonzero_coefs=n_nonzero, transform_alpha=alpha, transform_algorithm=algo) x = coder.transform(y.reshape(1, -1)) density = len(np.flatnonzero(x)) x = np.ravel(np.dot(x, D)) squared_error = np.sum((y - x) ** 2) plt.plot(x, color=color, lw=lw, label='%s: %s nonzero coefs,\n%.2f error' % (title, density, squared_error)) # Soft thresholding debiasing coder = SparseCoder(dictionary=D, transform_algorithm='threshold', transform_alpha=20) x = coder.transform(y.reshape(1, -1)) _, idx = np.where(x != 0) x[0, idx], _, _, _ = np.linalg.lstsq(D[idx, :].T, y) x = np.ravel(np.dot(x, D))
def encode_kmeans_sparsecode(df, km, algo='lasso_cd', alpha=1, split=False): centroids = km.cluster_centers_ D = [centroids[i]/np.linalg.norm(centroids[i]) for i in range(len(centroids))] D = np.array(D) sc = SparseCoder(D, transform_algorithm=algo, transform_alpha=alpha, split_sign=split) return pd.DataFrame(sc.transform(df))
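# End-to-end sketch for encode_kmeans_sparsecode() above (assumes the function
# and its pandas / numpy / SparseCoder imports are in scope; data is synthetic):
# fit k-means, then express every row as a sparse combination of the
# L2-normalized centroids.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(100, 12))
km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(df)
codes = encode_kmeans_sparsecode(df, km, algo='lasso_cd', alpha=0.5)
print(codes.shape)                       # (100, 8): one code row per sample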
def getSparseCodes(dataset, Dict):
    print(Dict.shape)
    print(dataset.shape)
    coder = SparseCoder(Dict, transform_algorithm='lasso_lars')
    return coder.transform(dataset)
class SparseCoding(object): def __init__(self, log_lev='INFO', sparse_dim_rat=None, name='', dist_beta=0.1, dist_sigma=0.005, display=0): LOG.setLevel(log_lev) self.name = name self.codebook_comps = None self.active_set = None self.min_coeff = max([1, co.CONST['sparse_fss_min_coeff']]) self.min_coeff_rat = co.CONST['sparse_fss_min_coeff_rat'] self.gamma = co.CONST['sparse_fss_gamma'] self.rat = None if isinstance(self.gamma, str): if self.gamma.starts_with('var'): try: self.rat = [float(s) for s in str.split() if co.type_conv.isfloat(s)][0] except IndexError: self.rat = None self.inp_features = None self.sparse_features = None self.basis_constraint = 1 self.inv_codebook_comps = None self.res_codebook_comps = None self.max_iter = 500 self.dict_max_iter = 300 self.display = display self.prev_err = 0 self.curr_error = 0 self.allow_big_vals = False self.sparse_dim_rat = sparse_dim_rat if sparse_dim_rat is None: self.sparse_dim_rat = co.CONST['sparse_dim_rat'] self.theta = None self.prev_sparse_feats = None self.flush_flag = False self.sparse_feat_list = None self.inp_feat_list = None self.codebook = None self.time = [] def flush_variables(self): ''' Empty variables ''' self.active_set = None self.theta = None self.codebook_comps = None self.inp_features = None self.inp_feat_list = None self.sparse_features = None self.flush_flag = True self.res_codebook_comps = None self.prev_err = 0 self.curr_error = 0 self.lbds = 0.5*np.ones(self.sparse_dim) def initialize(self, feat_dim, init_codebook_comps=None): ''' Initialises B dictionary and s ''' self.sparse_dim = self.sparse_dim_rat * feat_dim if init_codebook_comps is not None: if (init_codebook_comps.shape[0] == feat_dim and init_codebook_comps.shape[1] == self.sparse_dim_rat * feat_dim): self.codebook_comps = init_codebook_comps.copy() else: raise Exception('Wrong input of initial B matrix, the dimensions' + ' should be ' + str(feat_dim) + 'x' + str(self.sparse_dim) + ', not ' + str(init_codebook_comps.shape[0]) + 'x' + str(init_codebook_comps.shape[1])) if (self.codebook_comps is None) or self.flush_flag: LOG.warning('Non existent codebook, manufactuning a random one') self.codebook_comps = random.random((feat_dim, self.sparse_dim)) if (self.sparse_features is None) or self.flush_flag: self.sparse_features = zeros((self.sparse_dim, 1)) self.theta = zeros(self.sparse_dim) self.active_set = zeros((self.sparse_dim), bool) self.sparse_features = zeros((self.sparse_dim, 1)) self.flush_flag = False self.is_trained = False def object_val_calc(self, codebook_comps, ksi, gamma, theta, vecs): ''' Calculate objective function value ''' _bs_ = np.dot(codebook_comps, vecs) square_term = 0.5 * npsum((ksi - _bs_)**2, axis=0) res = (square_term + gamma * dot(theta.T, vecs)).ravel() return res def feature_sign_search_algorithm(self, inp_features, acondtol=1e-3, ret_error=False, display_error=False, max_iter=0, single=False, timed=True, starting_points=None, training=False): ''' Returns sparse features representation ''' self.min_coeff_rat = co.CONST['sparse_fss_min_coeff_rat'] self.min_coeff = max([self.min_coeff, self.min_coeff_rat * np.size(inp_features)]) if self.inp_feat_list is not None: self.inp_feat_list.append(inp_features.ravel()) else: self.inp_feat_list = [inp_features.ravel()] self.inp_features = inp_features.copy().reshape((-1,1)) # Step 1 btb = dot(self.codebook_comps.T, self.codebook_comps) btf = dot(self.codebook_comps.T, self.inp_features) if self.rat is not None: self.gamma = np.max(np.abs(-2 * btf)) * self.rat gamma = self.gamma if 
starting_points is not None: self.sparse_features = starting_points.reshape((self.sparse_dim, 1)) self.theta = np.sign(self.sparse_features) self.active_set[:] = False self.active_set[self.sparse_features.ravel()!=0] = True step2 = 0 else: step2 = 1 count = 0 prev_objval = 0 if max_iter == 0: max_iter = self.max_iter else: self.max_iter = max_iter self.prev_sparse_feats = None prev_error = 0 initial_energy = compute_lineq_error(inp_features, 0, 0) interm_error = initial_energy SPLOG.info('Initial Signal Energy: ' + str(initial_energy)) SPLOG.info('Initial nonzero elements number: ' + str(np.sum(inp_features!=0))) converged = False for count in range(self.max_iter): # Step 2 if step2: zero_coeffs = (self.sparse_features == 0) qp_der_outfeati = 2 * \ (dot(btb, self.sparse_features) - btf) * zero_coeffs.reshape((-1,1)) i = argmax(npabs(qp_der_outfeati)) if (npabs(qp_der_outfeati[i]) > gamma or npsum(self.active_set) < self.min_coeff): self.theta[i] = -sign(qp_der_outfeati[i]) self.active_set[i] = True # Step 3 codebook_comps_h = self.codebook_comps[:, self.active_set] sparse_feat_h = self.sparse_features[self.active_set].reshape( (-1,1)) theta_h = self.theta[self.active_set].reshape((-1,1)) _q_ = dot(codebook_comps_h.T, self.inp_features) - gamma * theta_h / 2.0 codebook_comps_h2 = dot(codebook_comps_h.T, codebook_comps_h) rank = matrix_rank(codebook_comps_h2) zc_search = True if rank == codebook_comps_h2.shape[0]: new_sparse_f_h = np.linalg.solve(codebook_comps_h2, _q_) else: u,s,v = np.linalg.svd(codebook_comps_h2) col_space = u[:, :rank] null_space = u[:, rank:] #Check if q belongs in column space, ie the projection of #q in the column space is q itself q_proj = np.zeros_like(_q_).reshape(-1, 1) for i in range(col_space.shape[1]): col = col_space[:,i].reshape(-1, 1) q_proj+=((dot(_q_.reshape(1,-1),col) / np.dot(col.T, col).astype(float))*col) ''' LOG.info('q|Projection: ' + str(np.concatenate((_q_.reshape(-1,1),q_proj),axis=1))) LOG.info('Projection Energy: '+ str(np.sum(q_proj**2))) LOG.info('Distance between q and projection: '+str(np.linalg.norm(q_proj.ravel()-_q_.ravel()))) ''' if np.allclose(q_proj.ravel()-_q_.ravel(), 0, atol=1.e-6): new_sparse_f_h = dot(pinv(codebook_comps_h2),_q_) else: #direction z in nullspace of codebook_comps_h2 can not be #perpendicular to _q_, because then _q_ = C(codebook_comps_h2), #which was proven not to hold. 
#I take the principal vector that belongs in null_space of #codebook_comps_h2 and add it to the current sparse_feat_h #so that to search for zerocrossings #inside the line constructed # by this vector and sparse_feat_h, which has direction, # belonging to null_space of codebook_comps_h2 tmp_sparse_f_h = sparse_feat_h + dot(null_space, np.ones((null_space.shape[1],1))) zero_points_lin_par = sparse_feat_h / (sparse_feat_h - tmp_sparse_f_h).astype(float) # find _t_ that corresponds to the closest zero crossing to # sparse_feat_h _t_ind = np.argmin(np.abs(zero_points_lin_par[ np.isfinite(zero_points_lin_par)])) _t_ = zero_points_lin_par[ np.isfinite(zero_points_lin_par)][_t_ind] null_vec = _t_ * tmp_sparse_f_h + (1 - _t_) * sparse_feat_h new_sparse_f_h = null_vec zc_search = False if (np.prod(sign(sparse_feat_h) != sign(new_sparse_f_h)) and zc_search): zero_points_lin_par = sparse_feat_h / (sparse_feat_h - new_sparse_f_h).astype(float) zero_points_lin_par = concatenate((zero_points_lin_par[ ((zero_points_lin_par > 0) * (zero_points_lin_par < 1)).astype(bool)][:], array([1])), axis=0) _t_ = zero_points_lin_par null_vecs = _t_ * new_sparse_f_h + (1 - _t_) * sparse_feat_h objvals = self.object_val_calc(codebook_comps_h, self.inp_features, gamma, theta_h, null_vecs).flatten() objval_argmin = argmin(objvals) objval = np.min(objvals) new_sparse_f_h = null_vecs[:, objval_argmin][:, None].copy() else: objval = self.object_val_calc(codebook_comps_h, self.inp_features, gamma, theta_h, new_sparse_f_h) self.sparse_features[self.active_set] = new_sparse_f_h.copy() self.active_set[self.active_set] = np.logical_not( isclose(new_sparse_f_h, 0)) if npsum(self.active_set) < self.min_coeff: step2 = 1 continue self.theta = sign(self.sparse_features) # Step 4 nnz_coeff = self.sparse_features != 0 # a new_qp_der_outfeati = 2 * (dot(btb, self.sparse_features) - btf) cond_a = (new_qp_der_outfeati + gamma * sign(self.sparse_features)) * nnz_coeff ''' if np.abs(objval) - np.abs(prev_objval) > 100 and not\ self.allow_big_vals and not count == 0: if self.prev_sparse_feats is not None: SPLOG.info('Current Objective Function value: ' + str(np.abs(objval))) SPLOG.info('Previous Objective Function value: ' + str(np.abs(prev_objval))) SPLOG.info('Problem with big values of inv(B^T*B)' + ',you might want to increase atol' + ' or set flag allow_big_vals to true' + ' (this might cause' + ' problems)') SPLOG.info('Reverting to previous iteration result ' + 'and exiting loop..') self.sparse_features = self.prev_sparse_feats.ravel() break else: LOG.error('Current Objective Function value: ' + str(np.abs(objval))) LOG.error('Previous Objective Function value: ' + str(np.abs(prev_objval))) LOG.error('Problem with big values of inv(B^T*B),increase atol' + ' or set flag allow_big_vals to true (this might cause' + ' serious convergence problems)') LOG.error('Exiting as algorithm has not produced any' + ' output results.') exit() ''' prev_objval = objval self.prev_sparse_feats = self.sparse_features if allclose(cond_a, 0, atol=acondtol): # go to cond b: z_coeff = self.sparse_features == 0 cond_b = npabs(new_qp_der_outfeati * z_coeff) <= gamma if npsum(cond_b) == new_qp_der_outfeati.shape[0]: self.sparse_features = self.sparse_features.reshape((-1,1)) converged = True break else: # go to step 2 step2 = 1 else: # go to step 3 step2 = 0 if count % 10 == 0: interm_error = compute_lineq_error( self.inp_features, self.codebook_comps, self.sparse_features) if interm_error == prev_error or interm_error > initial_energy: converged=True break 
                else:
                    prev_error = interm_error
                SPLOG.info('\t Epoch: ' + str(count))
                SPLOG.info('\t\t Intermediate Error = ' + str(interm_error))
                if interm_error < 0.001:
                    converged = True
                    SPLOG.info('Small error, assuming convergence')
                    break
        '''
        if initial_energy < interm_error:
            if not training:
                LOG.warning('FSS Algorithm did not converge, using pseudoinverse' +
                            ' of provided codebook instead')
                if self.inv_codebook_comps is None:
                    self.inv_codebook_comps = pinv(self.codebook_comps)
                self.sparse_features = dot(self.inv_codebook_comps,
                                           self.inp_features).ravel()
            else:
                SPLOG.info('FSS Algorithm did not converge,' +
                           ' removing sample from training dataset...')
                self.sparse_features = None
                return (interm_error), False, initial_energy
        else:
        '''
        if not converged:
            SPLOG.info('FSS Algorithm did not converge' +
                       ' in the given iterations')
        else:
            SPLOG.info('Successful Convergence')
        SPLOG.info('\tFinal error: ' + str(interm_error))
        SPLOG.info('\tNumber of nonzero elements: ' +
                   str(np.sum(self.sparse_features != 0)))
        if not single:
            if self.sparse_feat_list is None:
                self.sparse_feat_list = [self.sparse_features.ravel()]
            else:
                self.sparse_feat_list.append(self.sparse_features.ravel())
        if ret_error:
            return (compute_lineq_error(self.inp_features, self.codebook_comps,
                                        self.sparse_features),
                    True, initial_energy)
        self.sparse_features = self.sparse_features.ravel()
        return None, True, None

    def lagrange_dual(self, lbds, ksi, _s_, basis_constraint):
        '''
        Lagrange dual function of the dictionary update problem.
        <ksi> is the input data matrix and <_s_> is the sparse codes matrix.
        '''
        lbds[lbds == 0] = 10**(-18)  # keep the diagonal invertible,
                                     # a known drawback of this method
        self.ksist = dot(ksi, _s_.T)
        interm_result = inv(
            dot(_s_, _s_.T) + diag(lbds.ravel()))
        LOG.debug('Computed Lagrange Coefficients:\n' + str(np.unique(lbds)))
        res = ((dot(ksi.T, ksi)).trace() -
               (dot(dot(self.ksist, interm_result), self.ksist.T)).trace() -
               (basis_constraint * diag(lbds.ravel())).trace())
        return -res  # minimizing the negative = maximizing the positive
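The analytic gradient used by the Newton-CG step further below can be sanity-checked against finite differences. A minimal self-contained sketch, separate from the class above, that mirrors lagrange_dual and lagrange_dual_grad; the names dual/dual_grad, the shapes, and the constraint value c=1.0 are made up for illustration:

import numpy as np
from scipy.optimize import check_grad

def dual(lbds, ksi, _s_, c):
    # negative Lagrange dual: tr(X^T X) - tr(XS^T (SS^T + L)^-1 (XS^T)^T) - c*tr(L)
    ksist = ksi.dot(_s_.T)
    inv_term = np.linalg.inv(_s_.dot(_s_.T) + np.diag(lbds))
    return -(np.trace(ksi.T.dot(ksi))
             - np.trace(ksist.dot(inv_term).dot(ksist.T))
             - c * np.sum(lbds))

def dual_grad(lbds, ksi, _s_, c):
    # squared column norms of XS^T (SS^T + L)^-1, minus the constraint, negated
    tmp = ksi.dot(_s_.T).dot(np.linalg.inv(_s_.dot(_s_.T) + np.diag(lbds)))
    return -(np.sum(tmp ** 2, axis=0) - c)

rng = np.random.RandomState(0)
ksi = rng.randn(6, 30)    # input data, d x n
_s_ = rng.randn(4, 30)    # sparse codes, r x n
print(check_grad(dual, dual_grad, np.ones(4), ksi, _s_, 1.0))  # should be close to 0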
    def lagrange_dual_grad(self, lbds, ksi, _s_, basis_constraint):
        '''
        Gradient of the lagrange dual function w.r.t. the Lagrange
        multipliers <lbds>
        '''
        # lbds=lbds.flatten()
        interm_result = inv(
            dot(_s_, _s_.T) + diag(lbds.ravel()))
        interm_result = dot(self.ksist, interm_result)
        interm_result = dot(interm_result.T, interm_result)
        res = diag(interm_result) - basis_constraint
        return -res  # minimizing the negative = maximizing the positive

    def lagrange_dual_hess(self, lbds, ksi, _s_, basis_constraint):
        '''
        Not used, but kept in case the solver is also given the hessian
        as input
        '''
        interm_result = inv(
            dot(_s_, _s_.T) + diag(lbds.ravel()))
        interm_result1 = dot(self.ksist, interm_result)
        res = -2 * dot(interm_result1.T, interm_result1) * interm_result
        return -res  # minimizing the negative = maximizing the positive
    # pylint: disable=no-member

    def conj_grad_dict_compute(self):
        '''
        Compute the codebook (dictionary) matrix by maximizing the
        Lagrange dual with the Newton-CG solver
        '''
        options = {'disp': True}
        '''
        if self.res_codebook_comps is None:
            self.res_codebook_comps = self.codebook_comps
            LOG.info(self.res_codebook_comps.shape)
        '''
        res = minimize(self.lagrange_dual, self.lbds.copy(),
                       method='Newton-CG',
                       jac=self.lagrange_dual_grad,
                       #hess=self.lagrange_dual_hess,
                       #bounds=np.array(([(10**(-18), 10**10)] *
                       #                 self.sparse_feat_list.shape[0])),
                       #stepmx=50.0,
                       #maxCGit=20,
                       #maxfun=100,
                       options=options,
                       #fmin=0.1,
                       #ftol=0.1,
                       #xtol=0.001,
                       #rescale=1.5,
                       args=(self.are_sparsecoded_inp.copy(),
                             self.sparse_feat_list.copy(),
                             self.basis_constraint))
        LOG.info(res)
        self.lbds = res.x
        LOG.info(np.unique(self.lbds))
        # diag(lbds) + S*S^T, matching the term that is inverted in lagrange_dual
        interm_result = (diag(self.lbds.ravel()) +
                         dot(self.sparse_feat_list, self.sparse_feat_list.T))
        LOG.info(np.linalg.matrix_rank(interm_result))
        codebook_comps = dot(inv(interm_result), self.ksist.T).T
        return codebook_comps
    # pylint: enable=no-member

    def train_sparse_dictionary(self, data, sp_opt_max_iter=200,
                                init_traindata_num=200, incr_rate=2,
                                min_iterations=3, init_codebook_comps=None,
                                log_lev=None, n_jobs=4):
        if log_lev is not None:
            LOG.setLevel(log_lev)
        self.codebook_comps = DictionaryLearning(
            n_components=self.sparse_dim_rat * data.shape[1],
            alpha=co.CONST['sparse_alpha'],
            verbose=1, n_jobs=n_jobs).fit(data).components_.T

    @timeit
    def code1(self, data, max_iter=None, errors=False):
        '''
        Sparse codes a single feature.
        Requires that the dictionary is already trained.
        '''
        if self.codebook is None:
            self.codebook = SparseCoder(self.codebook_comps.T, n_jobs=4)
        return self.codebook.transform(data.reshape(1, -1)).ravel()

    def train_sparse_dictionary1(self, data, sp_opt_max_iter=200,
                                 init_traindata_num=200, incr_rate=2,
                                 min_iterations=3, init_codebook_comps=None,
                                 debug=False):
        '''
        <data> is a numpy array holding all the features (of a single kind)
        that are required to train the sparse dictionary, with dimensions
        [n_features, n_samples]. The sparse dictionary is trained with a
        random subset of <data>, which increases in each iteration with rate
        <incr_rate>, along with the maximum iterations <sp_opt_max_iter> of
        the feature sign search algorithm. <min_iterations> is the least
        number of dictionary training iterations after the total data has
        been processed.
''' self.sparse_dim = min(data.shape) * self.sparse_dim_rat self.flush_variables() try: import progressbar except: LOG.warning('Install module progressbar2 to get informed about the' +' feature sign search algorithm progress') pass self.initialize(data.shape[0], init_codebook_comps=init_codebook_comps) iter_count = 0 retry_count = 0 LOG.info('Training dictionary: ' + self.name) LOG.info('Minimum Epochs number after total data is processed:' + str(min_iterations)) reached_traindata_num = False reached_traindata_count = 0 computed = data.shape[1] * [None] retry = False lar_approx = False while True: LOG.info('Epoch: ' + str(iter_count)) loaded = False self.sparse_feat_list = None self.inp_feat_list = None if debug and iter_count == 0: LOG.warning('Debug is on, loading data from first FSS execution') try: with open(self.name+' debug_sparse.pkl','r') as inp: (self.codebook_comps, self.sparse_feat_list, self.are_sparsecoded_inp) = pickle.load(inp) loaded=True except (IOError, EOFError): LOG.warning('Not existent '+self.name +' debug_sparse.pkl') if not loaded: train_num = min(int(init_traindata_num * (incr_rate) ** iter_count), data.shape[1]) if train_num == data.shape[1] and not reached_traindata_num: reached_traindata_num = True LOG.info('Total data is processed') if reached_traindata_num: reached_traindata_count += 1 LOG.info('Number of samples used: ' + str(train_num)) ran = rand.sample(xrange(data.shape[1]), train_num) feat_sign_max_iter = min(1000, sp_opt_max_iter * incr_rate ** iter_count) LOG.info('Feature Sign Search maximum iterations allowed:' + str(feat_sign_max_iter)) try: format_custom_text = progressbar.FormatCustomText( 'Mean Initial Error: %(mean_init_energy).4f,'+ ' Mean Final Error: %(mean).4f ,Valid Samples Ratio: %(valid).2f', dict( mean_init_energy=0, mean=0, valid=0 ), ) pbar = progressbar.ProgressBar(max_value=train_num - 1, redirect_stdout=True, widgets=[progressbar.widgets.Percentage(), progressbar.widgets.Bar(), format_custom_text]) errors=True sum_error = 0 sum_energy = 0 except UnboundLocalError: pbar = None errors = False pass are_sparsecoded = [] if pbar is not None: iterat = pbar(enumerate(ran)) else: iterat = enumerate(ran) for count, sample_count in iterat: fin_error, valid, init_energy = self.feature_sign_search_algorithm( data[:, sample_count], max_iter=feat_sign_max_iter, ret_error=errors,training=True, starting_points=computed[sample_count]) are_sparsecoded.append(True) try: if iter_count > 0 and valid: #do not trust first iteration sparse features, before #having trained the codebooks at least once computed[sample_count] = self.sparse_feat_list[-1] except (TypeError,AttributeError): pass if valid and pbar and errors: sum_error += fin_error mean_error = sum_error/float(sum(are_sparsecoded)) sum_energy += init_energy mean_init_energy = sum_energy/float(sum(are_sparsecoded)) if pbar is not None: format_custom_text.update_mapping(mean_init_energy= mean_init_energy, mean=mean_error, valid=sum(are_sparsecoded) /float(len(are_sparsecoded))) self.initialize(data.shape[0]) self.inp_feat_list = np.transpose(np.array(self.inp_feat_list)) self.sparse_feat_list = np.array(self.sparse_feat_list).T are_sparsecoded = np.array( are_sparsecoded).astype(bool) retry = np.sum(are_sparsecoded) < 1 / 3.0 * (are_sparsecoded).size self.are_sparsecoded_inp = self.inp_feat_list[:, are_sparsecoded] if debug and iter_count==0: LOG.warning('Debug is on, saving debug_sparse.pkl') with open(self.name + ' debug_sparse.pkl','w') as out: pickle.dump((self.codebook_comps, 
self.sparse_feat_list, self.are_sparsecoded_inp), out) prev_error = compute_lineq_error(self.are_sparsecoded_inp, self.codebook_comps, self.sparse_feat_list) if not lar_approx: dictionary = self.conj_grad_dict_compute() curr_error = compute_lineq_error( self.are_sparsecoded_inp, dictionary, self.sparse_feat_list) LOG.info('Reconstruction Error: ' + str(curr_error)) if loaded: mean_init_energy=0 mean_error = 0 if curr_error > prev_error or mean_error>1000 or retry or lar_approx: if (prev_error > 100 or mean_error>1000 or retry or lar_approx): if retry_count == 2 or lar_approx: if iter_count != 0: iter_count = 0 lar_approx = True init_traindata_num = data.shape[1] continue LOG.warning('Training has high final error but' + ' reached maximum retries. No codebook can' + ' be produced with the fast method,'+ ' using Lagrange Dual, as input'+ ' sparsecoded data S is' +' ill-conditioned (too low' + ' rank of the STS).'+ ' Least Angle Regression Method '+ ' will be used') self.codebook_comps = DictionaryLearning( self.sparse_dim, fit_algorithm='lars', code_init=self.inp_feat_list.T).fit( self.are_sparsecoded_inp.T).components_.T curr_error = compute_lineq_error( self.are_sparsecoded_inp, self.codebook_comps, self.sparse_feat_list) LOG.info('Reconstruction Error using LARS: ' + str(curr_error)) if curr_error > 1000: LOG.info('LARS method did not converge,' + ' no codebook is produced.') self.is_trained = False self.codebook_comps = None else: break LOG.warning('Training of codebook ' + self.name + ' completed with no success,'+ ' reinitializing (Retry:' + str(retry_count + 1) + ')') self.flush_variables() self.initialize(data.shape[0]) computed = data.shape[1] * [None] retry_count += 1 iter_count = -1 reached_traindata_count = 0 reached_traindata_num = False elif (np.isclose(prev_error,curr_error,atol=0.1) and reached_traindata_num and reached_traindata_count > min_iterations): break if curr_error < 0.5 and reached_traindata_num: break if (reached_traindata_num and reached_traindata_count > min_iterations and iter_count >= 0): break iter_count += 1 self.codebook_comps = dictionary self.inp_feat_list = None self.sparse_feat_list = None self.is_trained = True @timeit def code(self, data, max_iter=None, errors=False): ''' Sparse codes a single feature Requires that the dictionary is already trained ''' if max_iter is None: max_iter = co.CONST['sparse_fss_max_iter'] self.initialize(data.size) self.feature_sign_search_algorithm(data.ravel(), max_iter=max_iter, single=True, display_error=errors, ret_error=errors) return self.sparse_features def multicode(self, data, max_iter=None, errors=False): ''' Convenience method for sparsecoding multiple features. <data> is assumed to have dimensions [n_features, n_samples] output has dimensions [n_sparse, n_samples] ''' feat_dim = 0 for datum in data: if datum is not None: feat_dim = len(datum) if feat_dim == 0 : raise Exception('Bad Input, full of nans') sparse_features = np.zeros((len(data), self.sparse_dim_rat* feat_dim)) for count in range(len(data)): if data[count] is not None and np.prod(np.isfinite(data[count][:])): sparse_features[count, :] = self.code(data[count][:], max_iter, errors).ravel() else: sparse_features[count, :] = np.nan return sparse_features
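compute_lineq_error is called throughout the class but is not defined in this excerpt. A plausible definition, consistent with how it is used above (compute_lineq_error(x, 0, 0) is logged as the signal energy and compute_lineq_error(X, B, S) as the reconstruction error), is the squared reconstruction error of the linear system; this is an assumption, not the original helper:

import numpy as np

def compute_lineq_error(inp, codebook_comps, sparse_features):
    # squared error of inp ~ codebook_comps * sparse_features; with
    # codebook_comps = sparse_features = 0 this reduces to the signal energy
    return np.sum((inp - np.dot(codebook_comps, sparse_features)) ** 2)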