import numpy as np
from wpca import WPCA


def wpca_subspace(elements, embedding_matrix, weight_array, vector_dim,
                  mean_centering, numComponents, debugInfo):
    ferr = open("errors_wpca_representation", "a+")
    flog = open("logs_pca_representation", "a+")

    # One weight per element, tiled across the vector dimensions.
    weight_matrix = np.tile(weight_array.reshape(-1, 1), vector_dim)

    if embedding_matrix.ndim == 1:
        # Only one word in the sentence: do nothing (no PCA); the vector
        # space of the word itself is the subspace.
        ferr.write("[No WPCA]: Only a single element from " + " ".join(elements) +
                   " found in supplied embeddings for the document " +
                   "_".join(debugInfo) + "\n")
        subspace = embedding_matrix
        singularValues = np.array([1.0])
        energyRetained = 1.0
    else:
        flog.write("Original NumComponents: " + str(numComponents) +
                   " NumElements: " + str(embedding_matrix.shape[0]) + "\t")
        numComponents = min(embedding_matrix.shape[0],
                            embedding_matrix.shape[1], numComponents)
        flog.write("New NumComponents: " + str(numComponents) + "\n")

        pca = WPCA(n_components=numComponents,
                   mean_centering=mean_centering)  # WPCA centers the matrix automatically

        try:
            kwds = {'weights': weight_matrix}
            pca.fit(embedding_matrix, **kwds)
            subspace = pca.components_
            if numComponents == 1:
                # Convert matrix to vector when numComponents == 1.
                subspace = subspace.T.reshape(-1)
            energyRetained = np.sum(pca.explained_variance_ratio_)
            if np.any(pca.explained_variance_ < 0):  # Hack
                explained_variance = np.abs(pca.explained_variance_)
                ferr.write("[Numerical Precision Error]: Negative variance " +
                           str(pca.explained_variance_) +
                           " in subspace constructed for " + " ".join(elements) +
                           " in the document: " + "_".join(debugInfo) + "\n")
            else:
                explained_variance = pca.explained_variance_
            # singularValues = np.sqrt(explained_variance * (embedding_matrix.shape[0] - 1))
            singularValues = np.sqrt(explained_variance)
        except (np.linalg.LinAlgError, ZeroDivisionError):
            # SVD occasionally fails to converge; fall back to the
            # word-vector average in that case.
            ferr.write("[WPCA Error]: No subspace constructed for " +
                       " ".join(elements) + " in the document: " +
                       "_".join(debugInfo) + "\n")
            subspace = np.mean(embedding_matrix, axis=0)
            singularValues = np.array([1.0])
            energyRetained = 1.0

    ferr.close()
    flog.close()
    return subspace, singularValues, energyRetained
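# A minimal usage sketch for wpca_subspace on synthetic data. All inputs
# here are hypothetical, and it assumes the WPCA class imported above
# accepts the mean_centering keyword (the stock wpca.WPCA does not
# expose it, so a modified class is presumably in use).
def demo_wpca_subspace():
    rng = np.random.RandomState(0)
    toy_embeddings = rng.randn(5, 50)  # 5 words, 50-dimensional vectors
    toy_weights = np.ones(5)           # uniform word weights
    return wpca_subspace(["a", "toy", "sentence"], toy_embeddings,
                         toy_weights, 50, mean_centering=True,
                         numComponents=2, debugInfo=["doc", "0"])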
from typing import Callable, Iterable, Optional, Union
import numpy as np

# Type aliases assumed by the signature below, matching how Array and Opt
# are used in the surrounding codebase. ClassWPCA, LinearAnalyzer, and the
# normalization helpers are expected to be defined elsewhere in the module.
Array = np.ndarray
Opt = Optional


def get_pca(input_: Array,
            learn_input: Array,
            learn_weight_vec: Opt[Array],
            n_comp_list: Iterable[int],
            err_printer: Opt[Callable[[Array, Array, str], None]] = None,
            normalize_x: bool = True,
            normalize_z: bool = False) -> LinearAnalyzer:
    """The analyzer fitted for the last entry of ``n_comp_list`` is returned."""
    def expl(pca_):
        return np.round(np.sum(pca_.explained_variance_ratio_), 2)

    n_comp_list = list(n_comp_list)

    x = x_normalized = learn_input  # (~6000, ~162)
    weight_vec = learn_weight_vec
    μ_x: Union[Array, int] = 0
    σ_x: Union[Array, int] = 1
    if normalize_x:
        x_normalized, μ_x, σ_x = get_x_normalized_μ_σ(x, weight_vec)
    weight_vec_as_mat = (weights_matrix(weight_vec, x)
                         if (weight_vec is not None) else None)

    for j, i in enumerate(n_comp_list):
        pca = ClassWPCA(i)
        pca.fit(x_normalized, weights=weight_vec_as_mat)
        z: Array = pca.transform(x_normalized)

        inverse_transform_matrix, μ_z, σ_z = get__inverse_transform_matrix__μ_z__σ_z(
            z, weight_vec, normalize_z, x_normalized)

        an = LinearAnalyzer(n=pca.n_components, analyzer=pca, x=input_,
                            μ_x=μ_x, σ_x=σ_x, μ_z=μ_z, σ_z=σ_z,
                            inverse_transform_matrix=inverse_transform_matrix,
                            normalize_x=normalize_x, normalize_z=normalize_z)

        if err_printer is not None:
            pref = f"Expl = {expl(pca)}, PC N = {pca.n_components}, "
            err_printer(input_, an.x_rec, pref)

        if (j + 1) == len(n_comp_list):
            break
    else:
        # The for/else only triggers when the loop body never breaks,
        # i.e. when n_comp_list is empty.
        raise ValueError('Empty n_comp_list')

    return an
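# A hypothetical call sketch for get_pca. It is not standalone: the helpers
# referenced above (get_x_normalized_μ_σ, weights_matrix, ClassWPCA,
# LinearAnalyzer, ...) must come from the surrounding module, and the
# shapes below are placeholders.
def demo_get_pca():
    x = np.random.RandomState(1).randn(600, 16)
    # The analyzer for the last entry (8 PCs) is returned.
    return get_pca(input_=x, learn_input=x, learn_weight_vec=None,
                   n_comp_list=[2, 4, 8])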
import h5py
import numpy as np
from wpca import WPCA


class CleanSpectra(object):
    def __init__(self, min_wavelength=3500, max_wavelength=8300,
                 max_masked_fraction=1.0):
        self.min_wavelength = min_wavelength
        self.max_wavelength = max_wavelength
        self.max_masked_fraction = max_masked_fraction

    def load_data(self, h5file, selection=None):
        if not isinstance(selection, slice):
            selection = slice(selection)
        datafile = h5py.File(h5file, 'r')
        wavelengths = 10 ** datafile['log_wavelengths'][:]
        mask = ((wavelengths >= self.min_wavelength) &
                (wavelengths <= self.max_wavelength))
        self.wavelengths = wavelengths[mask]
        self.spectra = datafile['spectra'][selection, mask]
        self.weights = datafile['ivars'][selection, mask]
        datafile.close()

        # Remove rows with an excessive fraction of masked (zero-weight) pixels.
        good_rows = (self.weights == 0).mean(1) < self.max_masked_fraction
        self.spectra = self.spectra[good_rows]
        self.weights = self.weights[good_rows]
        # Convert inverse variances to weights.
        self.weights **= 0.5
        return self

    def fit_wpca(self, n_components=200, regularization=False):
        self.wpca = WPCA(n_components=n_components,
                         regularization=regularization)
        self.wpca.fit(self.spectra, weights=self.weights)
        return self

    def reconstruct(self, spectra=None, weights=None, p=2):
        if spectra is None:
            spectra = self.spectra
        if weights is None:
            weights = self.weights
        new_spectra = self.wpca.reconstruct(spectra, weights=weights)
        # Blend observed and reconstructed flux by signal-to-noise:
        # high-S/N pixels keep the data, low-S/N pixels use the model.
        SN = abs(spectra * weights) ** (1. / p)
        SN /= SN.max(1, keepdims=True)
        return SN * spectra + (1 - SN) * new_spectra
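# An example pipeline for CleanSpectra, assuming an HDF5 file with
# 'log_wavelengths', 'spectra', and 'ivars' datasets laid out as load_data
# expects; 'spectra.h5' is a placeholder path. load_data and fit_wpca
# return self, so the calls chain.
def demo_clean_spectra(h5path='spectra.h5'):
    cleaner = (CleanSpectra(min_wavelength=4000, max_wavelength=8000)
               .load_data(h5path, selection=1000)  # first 1000 spectra
               .fit_wpca(n_components=50))
    return cleaner.reconstruct(p=2)  # S/N-weighted blend of data and model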
import numpy as np
from numpy.testing import assert_allclose
from wpca import WPCA


def test_copy_data():
    rand = np.random.RandomState(0)
    X = rand.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=100)
    W = rand.rand(*X.shape)
    X_orig = X.copy()

    # With copy_data=True, X should not change.
    pca1 = WPCA(copy_data=True)
    pca1.fit(X, weights=W)
    assert np.all(X == X_orig)

    # With copy_data=False, X should be overwritten.
    pca2 = WPCA(copy_data=False)
    pca2.fit(X, weights=W)
    assert not np.allclose(X, X_orig)

    # All results should match regardless of copying.
    assert_allclose(pca1.mean_, pca2.mean_)
    assert_allclose(pca1.components_, pca2.components_)
    assert_allclose(pca1.explained_variance_, pca2.explained_variance_)
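# A small sketch of why the weights matter: a zero weight marks a value as
# missing, so the WPCA fit ignores it and reconstruct() fills it in from
# the fitted components. The data here are synthetic.
def demo_masked_reconstruction():
    rand = np.random.RandomState(42)
    X = rand.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=100)
    W = np.ones_like(X)
    W[::10, 0] = 0  # mask every tenth x-coordinate
    pca = WPCA(n_components=2)
    pca.fit(X, weights=W)
    X_rec = pca.reconstruct(X, weights=W)
    # Masked entries are replaced by the model's prediction.
    print(X_rec[::10, 0][:3], X[::10, 0][:3])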
import gc
import os
import sys
import time

import numpy as np
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from wpca import WPCA

# logger, get_common_parser, do_pca, plot_2d, and the other helpers used
# below come from the surrounding project.


def main():
    # Requires cma_args.n_comp_to_use and cma_args.pc1_chunk_size.
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    V = final_params - start_params

    '''
    ==========================================================================
    get the pc vectors
    ==========================================================================
    '''
    result = do_pca(cma_args.n_components, cma_args.n_comp_to_use,
                    traj_params_dir_name, intermediate_data_dir,
                    proj=False, origin="mean_param",
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size, reuse=True)
    logger.debug("after pca")

    final_plane = result["first_n_pcs"]

    count_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "total_num_dumped")
    total_num = pd.read_csv(count_file, header=None).values[0]

    all_param_iterator = get_allinone_concat_df(
        dir_name=traj_params_dir_name, use_IPCA=True,
        chunk_size=cma_args.pc1_chunk_size)

    unduped_angles_along_the_way = []
    duped_angles_along_the_way = []
    diff_along = []

    unweighted_pc1_vs_V_angles = []
    duped_pc1_vs_V_angles = []
    pc1_vs_V_diffs = []

    # Streaming (unweighted) PCA baseline over the parameter chunks.
    unweighted_ipca = IncrementalPCA(n_components=cma_args.n_comp_to_use)

    all_matrix_buffer = []
    try:
        for i, chunk in enumerate(all_param_iterator):
            if i >= 2:  # debug cap: only the first two chunks are processed
                break
            chunk = chunk.values
            unweighted_ipca.partial_fit(chunk)
            unweighted_angle = cal_angle_between_nd_planes(
                final_plane,
                unweighted_ipca.components_[:cma_args.n_comp_to_use])
            unweighted_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V, unweighted_ipca.components_[0]))
            unweighted_pc1_vs_V_angles.append(unweighted_pc1_vs_V_angle)

            # TODO: ignore 90 or 180 for now
            if unweighted_angle > 90:
                unweighted_angle = 180 - unweighted_angle
            unduped_angles_along_the_way.append(unweighted_angle)

            # Sanity check: the plane-angle and vector-angle routines agree
            # on the first component.
            np.testing.assert_almost_equal(
                cal_angle_between_nd_planes(
                    unweighted_ipca.components_[:cma_args.n_comp_to_use][0],
                    final_plane[0]),
                cal_angle(
                    unweighted_ipca.components_[:cma_args.n_comp_to_use][0],
                    final_plane[0]))

            all_matrix_buffer.extend(chunk)

            weights = gen_weights(all_matrix_buffer,
                                  Funcs[cma_args.func_index_to_use])
            logger.log(f"currently at {all_param_iterator._currow}")

            # Weighted PCA on everything buffered so far.
            wpca = WPCA(n_components=cma_args.n_comp_to_use)
            tic = time.time()
            wpca.fit(all_matrix_buffer, weights=weights)
            toc = time.time()
            logger.debug(f"WPCA of {len(all_matrix_buffer)} data took "
                         f"{toc - tic} secs")

            duped_angle = cal_angle_between_nd_planes(
                final_plane, wpca.components_[:cma_args.n_comp_to_use])
            duped_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V, wpca.components_[0]))
            duped_pc1_vs_V_angles.append(duped_pc1_vs_V_angle)
            pc1_vs_V_diffs.append(duped_pc1_vs_V_angle
                                  - unweighted_pc1_vs_V_angle)

            # TODO: ignore 90 or 180 for now
            if duped_angle > 90:
                duped_angle = 180 - duped_angle
            duped_angles_along_the_way.append(duped_angle)
            diff_along.append(unweighted_angle - duped_angle)
    finally:
        plot_dir = get_plot_dir(cma_args)
        if not os.path.exists(plot_dir):
            os.makedirs(plot_dir)

        angles_plot_name = (f"WPCA "
                            f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} ")
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(duped_angles_along_the_way)),
                duped_angles_along_the_way,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = (f"Not WPCA exponential 2 "
                            f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} ")
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(unduped_angles_along_the_way)),
                unduped_angles_along_the_way,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = (f"Not WPCA - WPCA diff_along exponential 2, "
                            f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} ")
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(diff_along)), diff_along,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = (f"WPCA PC1 VS V "
                            f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} ")
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(duped_pc1_vs_V_angles)),
                duped_pc1_vs_V_angles,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = (f"Not WPCA PC1 VS V "
                            f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} ")
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(unweighted_pc1_vs_V_angles)),
                unweighted_pc1_vs_V_angles,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = (f"Not WPCA - WPCA diff PC1 VS V "
                            f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} ")
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(pc1_vs_V_diffs)), pc1_vs_V_diffs,
                "num of chunks", "angle with diff in degrees", False)

        del all_matrix_buffer
        gc.collect()
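# Standard entry point, assuming this script is run directly.
if __name__ == "__main__":
    main()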