Example 1
# Assumes numpy's column_stack, square and sum plus the module-local helpers
# opursuit, obj_func, scale_and_center and pca are already imported.
def compare_eps(vectors):
    data_matrix = column_stack(vectors)
    O = (data_matrix != 0) * 1  # Observation matrix - 1 where we have data, 0 where we do not

    gamma = .5            # weight trading off the low-rank and outlier terms (see obj_func)
    tol_perc = .00000001  # convergence tolerance handed to opursuit

    # Run outlier pursuit once per eps_ratio and report how it affects the
    # objective value, the rank of L and the fraction of outlier columns.
    for e in [2, 5, 10, 20, 50, 100]:
        L, C, term, n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc, eps_ratio=e)
        obj = obj_func(L, C, gamma)
        # A column of C is flagged as an outlier iff it contains any nonzero entry
        c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in range(C.shape[1])]
        c_perc = float(sum(c_vals)) / len(c_vals)
        # The rank of the low-rank part L is the number of principal components found
        centered_L = scale_and_center(L, scale=False)
        pcs, robust_lowdim_data = pca(centered_L, 100000)
        num_pca_dimensions = pcs.shape[1]
        print("eps=%d term=%f n_iter=%d obj=%f rank=%d c_perc=%f"
              % (e, term, n_iter, obj, num_pca_dimensions, c_perc))
Example 2
def tune_gamma_and_tol(vectors, gamma_guess=.5, tol_guess=1e-2,
                       lo_target_c_perc=.04, hi_target_c_perc=.10,
                       lo_target_num_pcs=10, hi_target_num_pcs=15):
    # Search for a (gamma, tol_perc) pair for which the low-rank component has
    # between lo_target_num_pcs and hi_target_num_pcs principal components and
    # the fraction of outlier columns lies between lo_target_c_perc and
    # hi_target_c_perc.
    BACKTRACK_PROB = .0000001  # passed to guess_param; raised if the search gets stuck
    MAX_NUM_RESETS = 5         # give up after this many random restarts
    num_resets = 0

    data_matrix = column_stack(vectors)
    O = (data_matrix != 0) * 1  # Observation matrix - 1 where we have data, 0 where we do not

    #Initially, we don't have any bounds on our search
    lo_gamma = None
    hi_gamma = None
    gamma = gamma_guess
    
    lo_tol = None
    hi_tol = None
    tol_perc = tol_guess
    
    num_guesses = 0
    while True:
        num_guesses += 1
        logMsg("BS (%d , %d): Trying gamma=%f, tol=%f" % (num_guesses, hi_target_num_pcs, gamma, tol_perc))
        stdout.flush()
        
        try:
            #L,C,term,n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc, eps_ratio=30)
            L,C,term,n_iter = multiple_op(data_matrix, O, gamma, tol_perc=tol_perc)
            
            
            # Rank of the low-rank component (computed directly rather than via PCA)
            num_pca_dimensions = fast_rank(L)

            # A column of C is flagged as an outlier iff it contains any nonzero entry
            c_vals = [(sum(square(C[:, i])) != 0) * 1 for i in range(C.shape[1])]
            c_perc = float(sum(c_vals)) / len(c_vals)
            
            logMsg(">>>>>> pcs=%d, outliers=%f" % (num_pca_dimensions, c_perc))
            
            
            # If both the outlier fraction and the rank are inside their target windows, stop
            if (lo_target_c_perc <= c_perc <= hi_target_c_perc
                    and lo_target_num_pcs <= num_pca_dimensions <= hi_target_num_pcs):
                break
            
            # Otherwise, use our target values to figure out if we need to increase/decrease
            # gamma or tol
            
            if num_pca_dimensions >= lo_target_num_pcs and c_perc >= lo_target_c_perc:
                logMsg("#rank too high and too many outliers - increase tolerance")
                lo_tol = tol_perc
            elif num_pca_dimensions >= lo_target_num_pcs and c_perc <= lo_target_c_perc:
                logMsg("#rank too high and too few outliers - decrease gamma")
                hi_gamma = gamma
            elif num_pca_dimensions <= lo_target_num_pcs and c_perc >= lo_target_c_perc:
                logMsg("#rank too low and too many outliers - increase gamma")
                lo_gamma = gamma
            elif num_pca_dimensions <= lo_target_num_pcs and c_perc <= lo_target_c_perc:
                logMsg("#rank too low and too few outliers - decrease tolerance")
                hi_tol = tol_perc
    
            
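            # Propose the next gamma and tol_perc from the current values and their
            # (possibly still open) brackets; guess_param returns the new value
            # together with the updated lo/hi bounds for that parameter.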
            (gamma, lo_gamma, hi_gamma) = guess_param(gamma, lo_gamma, hi_gamma,
                                            SEARCH_RATE=2, BACKTRACK_PROB=BACKTRACK_PROB)
            (tol_perc, lo_tol, hi_tol) = guess_param(tol_perc, lo_tol, hi_tol,
                                            SEARCH_RATE=5, BACKTRACK_PROB=BACKTRACK_PROB,
                                            hard_upper_bound = 1.0)
            
    
            logMsg("%s < gamma < %s  ,  %s < tol < %s" % tuple(map(str, [lo_gamma, hi_gamma, lo_tol,hi_tol])))
            stdout.flush()

        except Exception as e:
            logMsg(str(e))
            # Collapse the tol bracket so that the "stuck" check below forces a reset
            lo_tol = .01
            hi_tol = .01
        
        
        # The search is stuck if either bracket has (nearly) collapsed to a single point
        if ((lo_tol is not None and hi_tol is not None and .99 < hi_tol / lo_tol < 1.01)
                or (lo_gamma is not None and hi_gamma is not None
                    and .99 < hi_gamma / lo_gamma < 1.01)):
            print("----------------Got stuck")
            stdout.flush()
            BACKTRACK_PROB = .3
            num_resets += 1
            
            if num_resets > MAX_NUM_RESETS:
                raise ConvergenceException(num_guesses)
            else:
                # Clear both brackets and randomly perturb gamma and tol_perc
                # so the search restarts from a nearby but different point
                hi_gamma = None
                lo_gamma = None
                hi_tol = None
                lo_tol = None

                gamma *= (2 ** uniform(-1, 1))
                tol_perc *= (10 ** uniform(-1, 1))
        
    
    print(obj_func(L, C, gamma))
    return gamma, tol_perc, num_guesses, L, C
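A sketch of one possible calling pattern, reusing the hypothetical vectors list from the earlier sketch; the starting guesses shown are simply the function's defaults, and ConvergenceException is the error the tuner raises after too many restarts.

try:
    gamma, tol_perc, num_guesses, L, C = tune_gamma_and_tol(vectors,
                                                            gamma_guess=.5,
                                                            tol_guess=1e-2)
    print("accepted gamma=%f tol=%f after %d guesses" % (gamma, tol_perc, num_guesses))
except ConvergenceException:
    print("search kept getting stuck; try different starting guesses or targets")

Because the tuner clears its brackets and perturbs gamma and tol_perc at random whenever it gets stuck, repeated calls on the same data may accept slightly different parameter pairs.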