def tryParameters(params):
    """Run outlier pursuit for one (tol_perc, gamma) pair and summarize the result.

    The function takes a single 4-tuple rather than four separate arguments,
    so it is called with exactly one positional argument.  The tuple is
    unpacked explicitly in the body because tuple parameters in a ``def``
    signature are Python-2-only syntax (removed by PEP 3113).

    Args:
        params: a 4-tuple ``(data_matrix, O, tol_perc, gamma)``:
            data_matrix - the data handed to ``opursuit``.
            O           - observation matrix handed to ``opursuit``.
            tol_perc    - tolerance forwarded as ``opursuit(tol_perc=...)``.
            gamma       - gamma parameter forwarded to ``opursuit``.

    Returns:
        ``[tol_perc, gamma, c_perc, num_pca_dimensions, n_iter]`` where
        ``c_perc`` is the fraction of columns of C whose sum of squares is
        non-zero, ``num_pca_dimensions`` is ``pcs.shape[1]`` of the PCA run on
        the centered L, and ``n_iter`` is the iteration count reported by
        ``opursuit``.  The same list is also printed.
    """
    data_matrix, O, tol_perc, gamma = params

    logMsg("Trying tol=%f, gamma=%f" % (tol_perc, gamma))
    L, C, _term, n_iter = opursuit(data_matrix, O, gamma,
                                   tol_perc=tol_perc, eps_ratio=10)

    # Flag (as 0/1) every column of C that has at least one non-zero entry.
    c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in range(C.shape[1])]
    c_perc = float(sum(c_vals)) / len(c_vals)

    # PCA on the centered (unscaled) L.  The very large second argument is
    # presumably there so the number of components is not capped - TODO confirm
    # against pca().
    centered_L = scale_and_center(L, scale=False)
    pcs, _lowdim = pca(centered_L, 100000)
    num_pca_dimensions = pcs.shape[1]

    result = [tol_perc, gamma, c_perc, num_pca_dimensions, n_iter]
    print(result)
    return result
def tryParameters(params):
    """Run outlier pursuit for one (tol_perc, gamma) pair and summarize the result.

    The function takes a single 4-tuple rather than four separate arguments,
    so it is called with exactly one positional argument.  The tuple is
    unpacked explicitly in the body because tuple parameters in a ``def``
    signature are Python-2-only syntax (removed by PEP 3113).

    Args:
        params: a 4-tuple ``(data_matrix, O, tol_perc, gamma)``:
            data_matrix - the data handed to ``opursuit``.
            O           - observation matrix handed to ``opursuit``.
            tol_perc    - tolerance forwarded as ``opursuit(tol_perc=...)``.
            gamma       - gamma parameter forwarded to ``opursuit``.

    Returns:
        ``[tol_perc, gamma, c_perc, num_pca_dimensions, n_iter]`` where
        ``c_perc`` is the fraction of columns of C whose sum of squares is
        non-zero, ``num_pca_dimensions`` is ``pcs.shape[1]`` of the PCA run on
        the centered L, and ``n_iter`` is the iteration count reported by
        ``opursuit``.  The same list is also printed.
    """
    data_matrix, O, tol_perc, gamma = params

    logMsg("Trying tol=%f, gamma=%f" % (tol_perc, gamma))
    L, C, _term, n_iter = opursuit(data_matrix, O, gamma,
                                   tol_perc=tol_perc, eps_ratio=10)

    # Flag (as 0/1) every column of C that has at least one non-zero entry.
    c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in range(C.shape[1])]
    c_perc = float(sum(c_vals)) / len(c_vals)

    # PCA on the centered (unscaled) L.  The very large second argument is
    # presumably there so the number of components is not capped - TODO confirm
    # against pca().
    centered_L = scale_and_center(L, scale=False)
    pcs, _lowdim = pca(centered_L, 100000)
    num_pca_dimensions = pcs.shape[1]

    result = [tol_perc, gamma, c_perc, num_pca_dimensions, n_iter]
    print(result)
    return result
def compare_eps(vectors):
    """Print outlier-pursuit diagnostics for a range of ``eps_ratio`` values.

    The vectors are stacked column-wise into one data matrix, and ``opursuit``
    is run once per eps_ratio in [2, 5, 10, 20, 50, 100] with gamma = .5 and
    tol_perc = 1e-08.  For every run one line is printed with the termination
    value, iteration count, objective value, number of PCA dimensions of the
    centered L (labelled ``rank``) and the fraction of non-zero columns of C.

    Args:
        vectors: sequence of column vectors accepted by ``column_stack``.

    Returns:
        None; results are only printed.
    """
    data_matrix = column_stack(vectors)
    # Observation matrix - 1 where we have data, 0 where we do not
    O = (data_matrix != 0) * 1
    gamma = .5
    tol_perc = .00000001

    for e in [2, 5, 10, 20, 50, 100]:
        L, C, term, n_iter = opursuit(data_matrix, O, gamma,
                                      tol_perc=tol_perc, eps_ratio=e)
        obj = obj_func(L, C, gamma)

        # Flag (as 0/1) every column of C that has at least one non-zero entry.
        c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in range(C.shape[1])]
        c_perc = float(sum(c_vals)) / len(c_vals)

        centered_L = scale_and_center(L, scale=False)
        pcs, _lowdim = pca(centered_L, 100000)
        num_pca_dimensions = pcs.shape[1]

        # Parenthesised single expression: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("eps=%d term=%f n_iter=%d obj=%f rank=%d c_perc=%f" % (
            e, term, n_iter, obj, num_pca_dimensions, c_perc))
def compare_eps(vectors):
    """Print outlier-pursuit diagnostics for a range of ``eps_ratio`` values.

    The vectors are stacked column-wise into one data matrix, and ``opursuit``
    is run once per eps_ratio in [2, 5, 10, 20, 50, 100] with gamma = .5 and
    tol_perc = 1e-08.  For every run one line is printed with the termination
    value, iteration count, objective value, number of PCA dimensions of the
    centered L (labelled ``rank``) and the fraction of non-zero columns of C.

    Args:
        vectors: sequence of column vectors accepted by ``column_stack``.

    Returns:
        None; results are only printed.
    """
    data_matrix = column_stack(vectors)
    # Observation matrix - 1 where we have data, 0 where we do not
    O = (data_matrix != 0) * 1
    gamma = .5
    tol_perc = .00000001

    for e in [2, 5, 10, 20, 50, 100]:
        L, C, term, n_iter = opursuit(data_matrix, O, gamma,
                                      tol_perc=tol_perc, eps_ratio=e)
        obj = obj_func(L, C, gamma)

        # Flag (as 0/1) every column of C that has at least one non-zero entry.
        c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in range(C.shape[1])]
        c_perc = float(sum(c_vals)) / len(c_vals)

        centered_L = scale_and_center(L, scale=False)
        pcs, _lowdim = pca(centered_L, 100000)
        num_pca_dimensions = pcs.shape[1]

        # Parenthesised single expression: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("eps=%d term=%f n_iter=%d obj=%f rank=%d c_perc=%f" % (
            e, term, n_iter, obj, num_pca_dimensions, c_perc))
# vectors - a list of Numpy column vectors # robust - True if RPCA via OP is desired # k - Number of PCs to use in PCA # gamma - gamma parameter for RPCA def computeMahalanobisDistances((key,vectors), robust=False, k=10, gamma=.5, tol_perc=1e-06): data_matrix = column_stack(vectors) if(robust): if(gamma=="tune"): gamma, tol_perc, num_guesses, hi_num_pcs, L, C = increasing_tolerance_search(vectors) (weekday, hour) = key logMsg("Successfully tuned %s @ %d after %d guesses : gamma=%f, tol=%f"%(weekday, hour, num_guesses, gamma, tol_perc)) else: O = (data_matrix!=0)*1 # Observation matrix - 1 where we have data, 0 where we do not # Use outlier pursuit to get robust low-rank approximation of data L,C,term,n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc) #logMsg("PCA") # Perform PCA on the low-rank approximation, and estimate the statistics centered_L = scale_and_center(L, scale=False) pcs, robust_lowdim_data = pca(centered_L, k) num_pca_dimensions = pcs.shape[1] logMsg("Num eigenvalues : %d" % num_pca_dimensions) centered_corrupt = scale_and_center(L+C, reference_matrix=L, scale=False) stdout.flush() mahals5 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 5) mahals10 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt,10)