from numpy import where, column_stack
from sys import stdout
# Project-local helpers (scale_and_center, opursuit, pca, logMsg, etc.) are
# defined elsewhere in this codebase.

def get_zscore_matrix(M, L):
    # Compute z-scores by scaling and centering the records.
    # Scaling and centering is done with respect to the L matrix instead of M,
    # so outliers are discarded.
    z_matrix = scale_and_center(M, reference_matrix=L, scale=True)
    # Entries of M that are 0 are missing data, so their z-scores would be
    # incorrect - replace them with 0 to stay consistent.
    missing_data_ids = where(M == 0)
    z_matrix[missing_data_ids] = 0
    return z_matrix
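# `scale_and_center` is not defined in this section. The sketch below shows one
# plausible implementation, assuming columns are observations and rows are
# features (statistics taken along axis=1); the real helper's behavior may
# differ, so this is for illustration only.
def scale_and_center_sketch(M, reference_matrix=None, scale=True):
    from numpy import mean, std
    # Estimate statistics from the reference matrix (default: M itself)
    if reference_matrix is None:
        reference_matrix = M
    # Subtract each feature's mean, computed from the reference matrix
    centered = M - mean(reference_matrix, axis=1, keepdims=True)
    if scale:
        # Divide by each feature's standard deviation to obtain z-scores
        centered = centered / std(reference_matrix, axis=1, keepdims=True)
    return centered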
data_matrix = column_stack(vectors)
if robust:
    if gamma == "tune":
        # Search for a gamma that yields an acceptable decomposition
        gamma, tol_perc, num_guesses, hi_num_pcs, L, C = increasing_tolerance_search(vectors)
        (weekday, hour) = key
        logMsg("Successfully tuned %s @ %d after %d guesses : gamma=%f, tol=%f" %
               (weekday, hour, num_guesses, gamma, tol_perc))
    else:
        # Observation matrix - 1 where we have data, 0 where we do not
        O = (data_matrix != 0) * 1
        # Use outlier pursuit to get a robust low-rank approximation of the data
        L, C, term, n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc)

    # Perform PCA on the low-rank approximation, and estimate the statistics
    centered_L = scale_and_center(L, scale=False)
    pcs, robust_lowdim_data = pca(centered_L, k)
    num_pca_dimensions = pcs.shape[1]
    logMsg("Num eigenvalues : %d" % num_pca_dimensions)

    # Center the corrupted data (L+C) against the clean low-rank statistics
    centered_corrupt = scale_and_center(L + C, reference_matrix=L, scale=False)
    stdout.flush()

    # Mahalanobis distances in the PCA subspace, using 5/10/20/50 components
    mahals5 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 5)
    mahals10 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 10)
    mahals20 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 20)
    mahals50 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 50)
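# `pca` and `lowdim_mahalanobis_distance` are also defined elsewhere. The
# sketches below show the techniques the calls above rely on, under assumed
# shapes: principal components as the columns of `pcs`, projected data as
# components-by-observations. Both are hypothetical illustrations, not the
# codebase's actual implementations.

def pca_sketch(centered_data, k):
    from numpy.linalg import svd
    # Left singular vectors of the centered data are its principal components
    U, S, Vt = svd(centered_data, full_matrices=False)
    pcs = U[:, :k]                     # top-k components as columns
    lowdim = pcs.T.dot(centered_data)  # data projected into k dimensions
    # Note: the real pca() may instead pick the number of components from the
    # eigenvalue spectrum, since the caller logs pcs.shape[1] after the fact.
    return pcs, lowdim

def lowdim_mahalanobis_distance_sketch(pcs, robust_lowdim_data,
                                       centered_corrupt, num_pcs):
    from numpy import var, sqrt
    # Use only the top num_pcs principal components
    k = min(num_pcs, pcs.shape[1])
    # Per-component variances, estimated from the robust (low-rank) projection
    variances = var(robust_lowdim_data[:k, :], axis=1)
    # Project the corrupted observations into the same subspace
    projected = pcs[:, :k].T.dot(centered_corrupt)
    # With uncorrelated components the covariance is diagonal, so the
    # Mahalanobis distance reduces to a variance-weighted Euclidean norm
    return sqrt(((projected ** 2) / variances[:, None]).sum(axis=0))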