コード例 #1
0
ファイル: utils.py プロジェクト: helioxgroup/eigenthemes
def wpca_subspace(elements, embedding_matrix, weight_array, vector_dim,
                  mean_centering, numComponents, debugInfo):
    """Build a weighted-PCA subspace for a set of embedded elements.

    Diagnostics are appended to two files in the current working directory:
    ``errors_wpca_representation`` (degenerate inputs and failures) and
    ``logs_pca_representation`` (component-count bookkeeping).

    Args:
        elements: Iterable of strings naming the embedded items; only used
            in diagnostic messages.
        embedding_matrix: NumPy array of embeddings. A 1-D array is treated
            as a single embedding and returned unchanged as the subspace.
            Otherwise ``shape[0]`` is taken as the number of elements.
        weight_array: NumPy array of per-element weights; it is reshaped to
            a column and tiled ``vector_dim`` times to form the weight
            matrix passed to the estimator.
        vector_dim: Number of times the weight column is repeated
            (presumably the embedding dimensionality -- confirm with caller).
        mean_centering: Forwarded to ``WPCA(mean_centering=...)``.
        numComponents: Requested number of components; clipped to both
            dimensions of ``embedding_matrix``.
        debugInfo: Iterable of strings joined with ``_`` to identify the
            document in diagnostic messages.

    Returns:
        Tuple ``(subspace, singularValues, energyRetained)``:

        * on a 1-D input: the input itself, ``[1.0]`` and ``1.0``;
        * on success: ``pca.components_`` (flattened to a vector when a
          single component is kept), the square root of the absolute
          explained variances, and the summed explained-variance ratio;
        * on ``LinAlgError`` / ``ZeroDivisionError``: the column-wise mean
          of ``embedding_matrix``, ``[1.0]`` and ``1.0``.
    """
    # Context managers guarantee both handles are released even when an
    # unexpected exception propagates out of the fit.
    with open("errors_wpca_representation", "a+") as ferr, \
            open("logs_pca_representation", "a+") as flog:
        weight_matrix = np.tile(weight_array.reshape(-1, 1), vector_dim)
        element_text = " ".join(elements)
        document_id = "_".join(debugInfo)

        if embedding_matrix.ndim == 1:
            # Only one element was found: no PCA, the vector itself spans
            # the subspace.
            ferr.write("[No WPCA]: Only a single element from " +
                       element_text +
                       " found in supplied embeddings for the document: " +
                       document_id + "\n")
            subspace = embedding_matrix
            singularValues = np.array([1.0])
            energyRetained = 1.0
        else:
            flog.write("Original NumComponents: " + str(numComponents) +
                       " NumElements: " + str(embedding_matrix.shape[0]) +
                       "\t")
            # Cannot extract more components than rows or columns.
            numComponents = min(embedding_matrix.shape[0],
                                embedding_matrix.shape[1], numComponents)
            flog.write("New NumComponents: " + str(numComponents) + "\n")

            pca = WPCA(n_components=numComponents,
                       mean_centering=mean_centering)
            try:
                pca.fit(embedding_matrix, weights=weight_matrix)
                subspace = pca.components_
                if numComponents == 1:
                    # Collapse the single-row matrix into a plain vector.
                    subspace = subspace.T.reshape(-1)
                energyRetained = np.sum(pca.explained_variance_ratio_)

                if np.any(pca.explained_variance_ < 0):
                    # HACK: negative variances are logged as a numerical
                    # precision problem and replaced by absolute values.
                    ferr.write(
                        "[Numerical Precision Error]: Negative variance " +
                        str(pca.explained_variance_) +
                        " in subspace constructed for " +
                        element_text + " in the document: " +
                        document_id + "\n")
                # np.abs is a no-op when all variances are non-negative.
                singularValues = np.sqrt(np.abs(pca.explained_variance_))
            except (np.linalg.LinAlgError, ZeroDivisionError):
                # The decomposition failed; fall back to the plain average
                # of the embeddings.
                ferr.write("[WPCA Error]: No subspace constructed for " +
                           element_text + " in the document: " +
                           document_id + "\n")
                subspace = np.mean(embedding_matrix, axis=0)
                singularValues = np.array([1.0])
                energyRetained = 1.0
    return subspace, singularValues, energyRetained
def get_pca(input_: Array,
            learn_input: Array,
            learn_weight_vec: Opt[Array],
            n_comp_list: Iterable[int],
            err_printer: Callable[[Array, Array, str], None] = None,
            normalize_x: bool = True,
            normalize_z: bool = False) -> LinearAnalyzer:
    """Fit a weighted PCA per requested component count; return the last.

    A ``ClassWPCA`` model is fitted on ``learn_input`` (optionally
    normalised) for every entry of ``n_comp_list`` and wrapped in a
    ``LinearAnalyzer`` built around ``input_``. When ``err_printer`` is
    given it is called once per fitted model with ``input_``, the
    analyzer's ``x_rec`` and a prefix describing the explained variance
    ratio and component count. Only the analyzer for the final entry of
    ``n_comp_list`` is returned.

    Raises:
        ValueError: if ``n_comp_list`` is empty.
    """
    component_counts = list(n_comp_list)

    # Original author's note: learn_input is roughly (~6000, ~162).
    fit_data = learn_input
    mean_x: Union[Array, int] = 0
    std_x: Union[Array, int] = 1
    if normalize_x:
        fit_data, mean_x, std_x = get_x_normalized_μ_σ(learn_input,
                                                       learn_weight_vec)

    weight_matrix = None
    if learn_weight_vec is not None:
        weight_matrix = weights_matrix(learn_weight_vec, learn_input)

    # Checked only after the pre-processing above so that an empty list
    # fails at the same point as it would when the loop never runs.
    if not component_counts:
        raise ValueError('Empty n_comp_list')

    for n_comp in component_counts:
        model = ClassWPCA(n_comp)
        model.fit(fit_data, weights=weight_matrix)
        scores: Array = model.transform(fit_data)

        inverse_matrix, mean_z, std_z = get__inverse_transform_matrix__μ_z__σ_z(
            scores, learn_weight_vec, normalize_z, fit_data)

        analyzer = LinearAnalyzer(n=model.n_components,
                                  analyzer=model,
                                  x=input_,
                                  μ_x=mean_x,
                                  σ_x=std_x,
                                  μ_z=mean_z,
                                  σ_z=std_z,
                                  inverse_transform_matrix=inverse_matrix,
                                  normalize_x=normalize_x,
                                  normalize_z=normalize_z)

        if err_printer is not None:
            explained = np.round(np.sum(model.explained_variance_ratio_), 2)
            prefix = f"Expl = {explained}, PC N = {model.n_components}, "
            err_printer(input_, analyzer.x_rec, prefix)

    return analyzer
コード例 #3
0
ファイル: spec_data.py プロジェクト: tk575/spec_data
class CleanSpectra(object):
    """Load spectra from an HDF5 file and clean them with weighted PCA.

    Args:
        min_wavelength: Lower bound (inclusive) of the wavelength window
            kept by ``load_data``.
        max_wavelength: Upper bound (inclusive) of the wavelength window.
        max_masked_fraction: Rows whose fraction of zero weights is not
            strictly below this value are dropped by ``load_data``.
    """

    def __init__(self,
                 min_wavelength=3500,
                 max_wavelength=8300,
                 max_masked_fraction=1.0):
        self.min_wavelength = min_wavelength
        self.max_wavelength = max_wavelength
        self.max_masked_fraction = max_masked_fraction

    def load_data(self, h5file, selection=None):
        """Read wavelengths, spectra and weights from ``h5file``.

        The file must contain the datasets ``log_wavelengths``,
        ``spectra`` and ``ivars``. Sets ``self.wavelengths``,
        ``self.spectra`` and ``self.weights`` (the square root of the
        stored ``ivars``).

        Args:
            h5file: Path (or anything ``h5py.File`` accepts) opened
                read-only.
            selection: Row selection. A ``slice`` is used as is; any other
                value is wrapped as ``slice(selection)``, so ``None``
                selects every row and an integer ``n`` the first ``n``.

        Returns:
            ``self``, to allow call chaining.
        """
        if not isinstance(selection, slice):
            selection = slice(selection)

        # The context manager closes the file even if a dataset is missing
        # or a read fails part-way through.
        with h5py.File(h5file, 'r') as datafile:
            wavelengths = 10**datafile['log_wavelengths'][:]
            mask = ((wavelengths >= self.min_wavelength) &
                    (wavelengths <= self.max_wavelength))
            self.wavelengths = wavelengths[mask]
            self.spectra = datafile['spectra'][selection, mask]
            self.weights = datafile['ivars'][selection, mask]

        # Remove rows with excessive missing data: a zero weight marks a
        # masked value, and the row-wise mean is the masked fraction.
        good_rows = (self.weights == 0).mean(1) < self.max_masked_fraction
        self.spectra = self.spectra[good_rows]
        self.weights = self.weights[good_rows]
        self.weights **= 0.5
        return self

    def fit_wpca(self, n_components=200, regularization=False):
        """Fit ``WPCA`` on the loaded spectra using the loaded weights.

        Stores the fitted estimator on ``self.wpca`` and returns ``self``.
        ``load_data`` must have been called first.
        """
        self.wpca = WPCA(n_components=n_components,
                         regularization=regularization)
        self.wpca.fit(self.spectra, weights=self.weights)
        return self

    def reconstruct(self, spectra=None, weights=None, p=2):
        """Blend the input spectra with their WPCA reconstruction.

        Args:
            spectra: Spectra to clean; defaults to ``self.spectra``.
            weights: Matching weights; defaults to ``self.weights``.
            p: Root applied to ``|spectra * weights|`` when forming the
                blending factor.

        Returns:
            ``SN * spectra + (1 - SN) * reconstruction`` where ``SN`` is
            ``|spectra * weights| ** (1 / p)`` scaled by its row maximum.
        """
        if spectra is None:
            spectra = self.spectra
        if weights is None:
            weights = self.weights

        new_spectra = self.wpca.reconstruct(spectra, weights=weights)
        SN = abs(spectra * weights)**(1. / p)
        # Normalise per row so the blending factor peaks at 1.
        SN /= SN.max(1, keepdims=True)
        return SN * spectra + (1 - SN) * new_spectra
コード例 #4
0
ファイル: spec_data.py プロジェクト: jakevdp/spec_data
class CleanSpectra(object):
    """Load spectra from an HDF5 file and clean them with weighted PCA.

    Args:
        min_wavelength: Lower bound (inclusive) of the wavelength window
            kept by ``load_data``.
        max_wavelength: Upper bound (inclusive) of the wavelength window.
        max_masked_fraction: Rows whose fraction of zero weights is not
            strictly below this value are dropped by ``load_data``.
    """

    def __init__(self, min_wavelength=3500, max_wavelength=8300,
                 max_masked_fraction=1.0):
        self.min_wavelength = min_wavelength
        self.max_wavelength = max_wavelength
        self.max_masked_fraction = max_masked_fraction

    def load_data(self, h5file, selection=None):
        """Read wavelengths, spectra and weights from ``h5file``.

        The file must contain the datasets ``log_wavelengths``,
        ``spectra`` and ``ivars``. Sets ``self.wavelengths``,
        ``self.spectra`` and ``self.weights`` (the square root of the
        stored ``ivars``).

        Args:
            h5file: Path (or anything ``h5py.File`` accepts) opened
                read-only.
            selection: Row selection. A ``slice`` is used as is; any other
                value is wrapped as ``slice(selection)``, so ``None``
                selects every row and an integer ``n`` the first ``n``.

        Returns:
            ``self``, to allow call chaining.
        """
        if not isinstance(selection, slice):
            selection = slice(selection)

        # The context manager closes the file even if a dataset is missing
        # or a read fails part-way through.
        with h5py.File(h5file, 'r') as datafile:
            wavelengths = 10 ** datafile['log_wavelengths'][:]
            mask = ((wavelengths >= self.min_wavelength) &
                    (wavelengths <= self.max_wavelength))
            self.wavelengths = wavelengths[mask]
            self.spectra = datafile['spectra'][selection, mask]
            self.weights = datafile['ivars'][selection, mask]

        # Remove rows with excessive missing data: a zero weight marks a
        # masked value, and the row-wise mean is the masked fraction.
        good_rows = (self.weights == 0).mean(1) < self.max_masked_fraction
        self.spectra = self.spectra[good_rows]
        self.weights = self.weights[good_rows]
        self.weights **= 0.5
        return self

    def fit_wpca(self, n_components=200, regularization=False):
        """Fit ``WPCA`` on the loaded spectra using the loaded weights.

        Stores the fitted estimator on ``self.wpca`` and returns ``self``.
        ``load_data`` must have been called first.
        """
        self.wpca = WPCA(n_components=n_components,
                         regularization=regularization)
        self.wpca.fit(self.spectra, weights=self.weights)
        return self

    def reconstruct(self, spectra=None, weights=None, p=2):
        """Blend the input spectra with their WPCA reconstruction.

        Args:
            spectra: Spectra to clean; defaults to ``self.spectra``.
            weights: Matching weights; defaults to ``self.weights``.
            p: Root applied to ``|spectra * weights|`` when forming the
                blending factor.

        Returns:
            ``SN * spectra + (1 - SN) * reconstruction`` where ``SN`` is
            ``|spectra * weights| ** (1 / p)`` scaled by its row maximum.
        """
        if spectra is None:
            spectra = self.spectra
        if weights is None:
            weights = self.weights

        new_spectra = self.wpca.reconstruct(spectra, weights=weights)
        SN = abs(spectra * weights) ** (1. / p)
        # Normalise per row so the blending factor peaks at 1.
        SN /= SN.max(1, keepdims=True)
        return SN * spectra + (1 - SN) * new_spectra
コード例 #5
0
ファイル: test_wpca_common.py プロジェクト: yutaka329/wpca
def test_copy_data():
    """Check that ``copy_data`` controls whether ``fit`` overwrites X.

    Fitting with ``copy_data=True`` must leave the input untouched, while
    ``copy_data=False`` must modify it in place; both fits must yield the
    same mean, components and explained variance.
    """
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=100)
    W = rng.rand(*X.shape)
    pristine = X.copy()

    # Copying fit: the caller's array is preserved exactly.
    copying = WPCA(copy_data=True)
    copying.fit(X, weights=W)
    assert (X == pristine).all()

    # In-place fit: the caller's array is overwritten.
    in_place = WPCA(copy_data=False)
    in_place.fit(X, weights=W)
    assert not np.allclose(X, pristine)

    # Both strategies must agree on every fitted quantity.
    for fitted_attr in ("mean_", "components_", "explained_variance_"):
        assert_allclose(getattr(copying, fitted_attr),
                        getattr(in_place, fitted_attr))
コード例 #6
0
ファイル: test_wpca_common.py プロジェクト: jakevdp/wpca
def test_copy_data():
    """Check that ``copy_data`` controls whether ``fit`` overwrites X.

    Fitting with ``copy_data=True`` must leave the input untouched, while
    ``copy_data=False`` must modify it in place; both fits must yield the
    same mean, components and explained variance.
    """
    rng = np.random.RandomState(0)
    X = rng.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=100)
    W = rng.rand(*X.shape)
    pristine = X.copy()

    # Copying fit: the caller's array is preserved exactly.
    copying = WPCA(copy_data=True)
    copying.fit(X, weights=W)
    assert (X == pristine).all()

    # In-place fit: the caller's array is overwritten.
    in_place = WPCA(copy_data=False)
    in_place.fit(X, weights=W)
    assert not np.allclose(X, pristine)

    # Both strategies must agree on every fitted quantity.
    for fitted_attr in ("mean_", "components_", "explained_variance_"):
        assert_allclose(getattr(copying, fitted_attr),
                        getattr(in_place, fitted_attr))
def main():
    """Compare unweighted incremental PCA with weighted PCA chunk by chunk.

    The parameter trajectory is streamed in chunks. After each chunk the
    plane from an ``IncrementalPCA`` and the plane from a ``WPCA`` refitted
    on everything seen so far are each compared against the reference plane
    returned by ``do_pca`` and against ``V`` (final minus start
    parameters). Six plots of the collected angle series are written in the
    ``finally`` clause, so they are produced even if the loop fails.

    Requires the parsed arguments to provide ``n_comp_to_use`` and
    ``pc1_chunk_size`` (besides the others read below).
    """
    import sys
    logger.log(sys.argv)
    cma_args, _unknown_args = get_common_parser().parse_known_args()

    run_dir = get_dir_path_for_this_run(cma_args)
    traj_params_dir_name = get_full_params_dir(run_dir)
    intermediate_data_dir = get_intermediate_data_dir(run_dir)
    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    def first_row(tag):
        # First row of the header-less CSV stored for `tag`.
        path = get_full_param_traj_file_path(traj_params_dir_name, tag)
        return pd.read_csv(path, header=None).values[0]

    logger.log("grab final params")
    final_params = first_row("final")

    logger.log("grab start params")
    start_params = first_row("start")

    # Overall displacement of the parameters over the run.
    V = final_params - start_params

    # ------------------------------------------------------------------
    # get the pc vectors
    # ------------------------------------------------------------------
    result = do_pca(cma_args.n_components,
                    cma_args.n_comp_to_use,
                    traj_params_dir_name,
                    intermediate_data_dir,
                    proj=False,
                    origin="mean_param",
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size,
                    reuse=True)
    logger.debug("after pca")
    final_plane = result["first_n_pcs"]

    # Read but not used afterwards; kept so a missing or unreadable count
    # file still fails here.
    _total_num = first_row("total_num_dumped")

    all_param_iterator = get_allinone_concat_df(
        dir_name=traj_params_dir_name,
        use_IPCA=True,
        chunk_size=cma_args.pc1_chunk_size)

    # Plane-vs-reference-plane angle series.
    unduped_angles_along_the_way = []
    duped_angles_along_the_way = []
    diff_along = []

    # First-component-vs-V angle series.
    unweighted_pc1_vs_V_angles = []
    duped_pc1_vs_V_angles = []
    pc1_vs_V_diffs = []

    n_use = cma_args.n_comp_to_use
    unweighted_ipca = IncrementalPCA(n_components=n_use)

    all_matrix_buffer = []

    try:
        for chunk_index, frame in enumerate(all_param_iterator):
            # Only the first two chunks are processed.
            if chunk_index >= 2:
                break
            rows = frame.values

            # --- unweighted, incremental fit ---------------------------
            unweighted_ipca.partial_fit(rows)
            plain_plane = unweighted_ipca.components_[:n_use]
            unweighted_angle = cal_angle_between_nd_planes(
                final_plane, plain_plane)
            unweighted_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V,
                                            unweighted_ipca.components_[0]))
            unweighted_pc1_vs_V_angles.append(unweighted_pc1_vs_V_angle)

            # TODO ignore 90 or 180 for now
            unweighted_angle = (180 - unweighted_angle
                                if unweighted_angle > 90
                                else unweighted_angle)
            unduped_angles_along_the_way.append(unweighted_angle)

            # Sanity check: for single vectors the plane-angle helper must
            # agree with the plain vector-angle helper.
            np.testing.assert_almost_equal(
                cal_angle_between_nd_planes(plain_plane[0], final_plane[0]),
                cal_angle(plain_plane[0], final_plane[0]))

            # --- weighted fit on everything seen so far ------------------
            all_matrix_buffer.extend(rows)
            weights = gen_weights(all_matrix_buffer,
                                  Funcs[cma_args.func_index_to_use])
            logger.log(f"currently at {all_param_iterator._currow}")

            wpca = WPCA(n_components=n_use)
            tic = time.time()
            wpca.fit(all_matrix_buffer, weights=weights)
            toc = time.time()
            logger.debug(
                f"WPCA of {len(all_matrix_buffer)} data took {toc - tic} secs "
            )

            duped_angle = cal_angle_between_nd_planes(
                final_plane, wpca.components_[:n_use])
            duped_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V, wpca.components_[0]))
            duped_pc1_vs_V_angles.append(duped_pc1_vs_V_angle)
            pc1_vs_V_diffs.append(duped_pc1_vs_V_angle -
                                  unweighted_pc1_vs_V_angle)

            # TODO ignore 90 or 180 for now
            duped_angle = (180 - duped_angle
                           if duped_angle > 90 else duped_angle)
            duped_angles_along_the_way.append(duped_angle)
            diff_along.append(unweighted_angle - duped_angle)
    finally:
        plot_dir = get_plot_dir(cma_args)
        if not os.path.exists(plot_dir):
            os.makedirs(plot_dir)

        size_tag = f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
        # (plot-name prefix, series) in the order the plots are emitted.
        plots = (
            ("WPCA", duped_angles_along_the_way),
            ("Not WPCA exponential 2", unduped_angles_along_the_way),
            ("Not WPCA - WPCA diff_along exponential 2,", diff_along),
            ("PC1 VS VWPCA PC1 VS V", duped_pc1_vs_V_angles),
            ("PC1 VS VNot WPCA PC1 VS V", unweighted_pc1_vs_V_angles),
            ("PC1 VS VNot WPCA - WPCA diff PC1 VS V", pc1_vs_V_diffs),
        )
        for prefix, series in plots:
            plot_2d(plot_dir, prefix + size_tag, np.arange(len(series)),
                    series, "num of chunks", "angle with diff in degrees",
                    False)

        # Drop the accumulated rows and reclaim memory right away.
        del all_matrix_buffer
        import gc
        gc.collect()