from numpy import column_stack, square
from sys import stdout
from random import uniform

# NOTE (assumed imports): only the numpy/standard-library names used below are imported here;
# uniform could equally come from numpy.random. opursuit, multiple_op, obj_func,
# scale_and_center, pca, fast_rank, guess_param, logMsg and ConvergenceException are
# project-local helpers defined elsewhere in this codebase.


def compare_eps(vectors):
    """Sweep the eps_ratio parameter of the outlier-pursuit solver and print diagnostics
    (termination value, iterations, objective, rank of L, outlier fraction) for each value."""
    data_matrix = column_stack(vectors)
    O = (data_matrix != 0) * 1  # Observation matrix - 1 where we have data, 0 where we do not
    gamma = .5
    tol_perc = .00000001
    for e in [2, 5, 10, 20, 50, 100]:
        L, C, term, n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc, eps_ratio=e)
        obj = obj_func(L, C, gamma)
        # A column of C is flagged as an outlier if it contains any nonzero entry
        c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in xrange(C.shape[1])]
        c_perc = float(sum(c_vals)) / len(c_vals)
        centered_L = scale_and_center(L, scale=False)
        pcs, robust_lowdim_data = pca(centered_L, 100000)
        num_pca_dimensions = pcs.shape[1]
        print "eps=%d term=%f n_iter=%d obj=%f rank=%d c_perc=%f" % (
            e, term, n_iter, obj, num_pca_dimensions, c_perc)
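# Hypothetical usage sketch (not from the original code): compare_eps expects an iterable of
# equal-length 1-D numpy vectors, one per observation, which it stacks column-wise into the
# data matrix; zero entries are treated as missing. The synthetic input below is purely
# illustrative, and opursuit(), obj_func(), scale_and_center() and pca() must be supplied by
# this project for it to run.
#
#   from numpy.random import randn
#   vectors = [randn(50) for _ in xrange(200)]  # 200 observations of 50 features each
#   compare_eps(vectors)                        # prints one diagnostic line per eps_ratio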
def tune_gamma_and_tol(vectors, gamma_guess=.5, tol_guess=1e-2,
                       lo_target_c_perc=.04, hi_target_c_perc=.10,
                       lo_target_num_pcs=10, hi_target_num_pcs=15):
    """Search for a (gamma, tol_perc) pair such that the rank of the low-rank component L
    falls within [lo_target_num_pcs, hi_target_num_pcs] and the fraction of outlier columns
    in C falls within [lo_target_c_perc, hi_target_c_perc]. Uses a bisection-style search on
    each parameter, with random restarts when the search gets stuck."""
    BACKTRACK_PROB = .0000001
    MAX_NUM_RESETS = 5
    num_resets = 0

    data_matrix = column_stack(vectors)
    O = (data_matrix != 0) * 1  # Observation matrix - 1 where we have data, 0 where we do not

    # Initially, we don't have any bounds on our search
    lo_gamma = None
    hi_gamma = None
    gamma = gamma_guess
    lo_tol = None
    hi_tol = None
    tol_perc = tol_guess

    num_guesses = 0
    while True:
        num_guesses += 1
        logMsg("BS (%d , %d): Trying gamma=%f, tol=%f" % (num_guesses, hi_target_num_pcs, gamma, tol_perc))
        stdout.flush()
        try:
            #L, C, term, n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc, eps_ratio=30)
            L, C, term, n_iter = multiple_op(data_matrix, O, gamma, tol_perc=tol_perc)

            #centered_L = scale_and_center(L, scale=False)
            #pcs, robust_lowdim_data = pca(centered_L, 100000)
            num_pca_dimensions = fast_rank(L)

            # A column of C is flagged as an outlier if it contains any nonzero entry
            c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in xrange(C.shape[1])]
            c_perc = float(sum(c_vals)) / len(c_vals)
            logMsg(">>>>>> pcs=%d, outliers=%f" % (num_pca_dimensions, c_perc))

            # If we have found acceptable values, stop
            if (lo_target_c_perc <= c_perc <= hi_target_c_perc and
                    lo_target_num_pcs <= num_pca_dimensions <= hi_target_num_pcs):
                break

            # Otherwise, use our target values to figure out whether we need to
            # increase/decrease gamma or tol
            if num_pca_dimensions >= lo_target_num_pcs and c_perc >= lo_target_c_perc:
                logMsg("#rank too high and too many outliers - increase tolerance")
                lo_tol = tol_perc
            elif num_pca_dimensions >= lo_target_num_pcs and c_perc <= lo_target_c_perc:
                logMsg("#rank too high and too few outliers - decrease gamma")
                hi_gamma = gamma
            elif num_pca_dimensions <= lo_target_num_pcs and c_perc >= lo_target_c_perc:
                logMsg("#rank too low and too many outliers - increase gamma")
                lo_gamma = gamma
            elif num_pca_dimensions <= lo_target_num_pcs and c_perc <= lo_target_c_perc:
                logMsg("#rank too low and too few outliers - decrease tolerance")
                hi_tol = tol_perc

            # Take the next bisection step on each parameter
            (gamma, lo_gamma, hi_gamma) = guess_param(gamma, lo_gamma, hi_gamma,
                                                      SEARCH_RATE=2, BACKTRACK_PROB=BACKTRACK_PROB)
            (tol_perc, lo_tol, hi_tol) = guess_param(tol_perc, lo_tol, hi_tol,
                                                     SEARCH_RATE=5, BACKTRACK_PROB=BACKTRACK_PROB,
                                                     hard_upper_bound=1.0)

            logMsg("%s < gamma < %s , %s < tol < %s" % tuple(map(str, [lo_gamma, hi_gamma, lo_tol, hi_tol])))
            stdout.flush()
        except Exception as e:
            # Treat a solver failure as a collapsed tol bracket so the reset logic below fires
            logMsg(str(e))
            lo_tol = .01
            hi_tol = .01

        # If either bracket has collapsed, the search is stuck - reset and restart randomly
        if ((lo_tol is not None and hi_tol is not None and .99 < hi_tol / lo_tol < 1.01) or
                (lo_gamma is not None and hi_gamma is not None and .99 < hi_gamma / lo_gamma < 1.01)):
            print("----------------Got stuck")
            stdout.flush()
            BACKTRACK_PROB = .3
            num_resets += 1
            if num_resets > MAX_NUM_RESETS:
                raise ConvergenceException(num_guesses)
            else:
                hi_gamma = None
                lo_gamma = None
                hi_tol = None
                lo_tol = None
                gamma *= (2 ** uniform(-1, 1))
                tol_perc *= (10 ** uniform(-1, 1))

    print obj_func(L, C, gamma)
    return gamma, tol_perc, num_guesses, L, C
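# Hypothetical usage sketch (not from the original code): tune_gamma_and_tol runs a
# bisection-style search over gamma and tol_perc until the rank of L and the outlier
# fraction of C both land inside their target windows, raising ConvergenceException if the
# search gets stuck more than MAX_NUM_RESETS times. multiple_op(), fast_rank(),
# guess_param() and logMsg() are assumed to be defined elsewhere in this project.
#
#   gamma, tol_perc, num_guesses, L, C = tune_gamma_and_tol(
#       vectors, gamma_guess=.5, tol_guess=1e-2,
#       lo_target_num_pcs=10, hi_target_num_pcs=15)
#   print "converged after %d guesses: gamma=%f, tol=%f" % (num_guesses, gamma, tol_perc)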