def wpca_decomposition(data):
    # Weight finite entries 1.0 and NaNs 0.0 so missing values are ignored.
    weights = 0. + np.isfinite(data)
    kwds = {'weights': weights}
    pca = WPCA(n_components=1).fit(data, **kwds)
    eigen_samples = pca.transform(data)[:, 0]
    eigen_genes = pca.components_[0, :]
    return eigen_genes, eigen_samples
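A minimal usage sketch (hypothetical data; assumes `import numpy as np` and `from wpca import WPCA`, and relies on the wpca package ignoring cells whose weight is zero, which is how the example above encodes NaNs):

import numpy as np
from wpca import WPCA

# Hypothetical samples-by-genes matrix with a missing entry.
data = np.random.RandomState(0).rand(20, 5)
data[3, 2] = np.nan

eigen_genes, eigen_samples = wpca_decomposition(data)
print(eigen_genes.shape)    # (5,)  one loading per column
print(eigen_samples.shape)  # (20,) one score per row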
Example #2
def wpca_subspace(elements, embedding_matrix, weight_array, vector_dim,
                  mean_centering, numComponents, debugInfo):
    ferr = open("errors_wpca_representation", "a+")
    flog = open("logs_pca_representation", "a+")
    weight_matrix = np.tile(weight_array.reshape(-1, 1), vector_dim)

    if embedding_matrix.ndim == 1:
        # Only one word in the sentence: skip PCA and use the word's own
        # vector space as the subspace.
        ferr.write("[No WPCA]: Only a single element from " +
                   " ".join(elements) +
                   " found in supplied embeddings for the document" +
                   "_".join(debugInfo) + "\n")
        subspace = embedding_matrix
        singularValues = np.array([1.0])
        energyRetained = 1.0
    else:
        flog.write("Original NumComponents: " + str(numComponents) +
                   " NumElements: " + str(embedding_matrix.shape[0]) + "\t")
        numComponents = min(embedding_matrix.shape[0],
                            embedding_matrix.shape[1], numComponents)
        flog.write("New NumComponents: " + str(numComponents) + "\n")

        pca = WPCA(n_components=numComponents,
                   mean_centering=mean_centering)  # WPCA centers the matrix automatically
        try:
            kwds = {'weights': weight_matrix}
            pca.fit(embedding_matrix, **kwds)
            subspace = pca.components_
            if numComponents == 1:  # convert matrix to vector when numComponents = 1
                subspace = subspace.T.reshape(-1)
            energyRetained = np.sum(pca.explained_variance_ratio_)

            if np.any(pca.explained_variance_ < 0):  # Hack
                explained_variance = np.abs(pca.explained_variance_)
                ferr.write("[Numerical Precision Error]: Negative variance " +
                           str(pca.explained_variance_) +
                           " in subspace constructed for " +
                           " ".join(elements) + " in the document: " +
                           "_".join(debugInfo) + "\n")
            else:
                explained_variance = pca.explained_variance_
            #singularValues = np.sqrt( explained_variance * (embedding_matrix.shape[0] - 1) )
            singularValues = np.sqrt(explained_variance)
        except (np.linalg.LinAlgError, ZeroDivisionError):
            # SVD occasionally fails to converge; fall back to the
            # word-vector average in that case.
            ferr.write("[WPCA Error]: No subspace constructed for " +
                       " ".join(elements) + " in the document: " +
                       "_".join(debugInfo) + "\n")
            subspace = np.mean(embedding_matrix, axis=0)
            singularValues = np.array([1.0])
            energyRetained = 1.0
    ferr.close()
    flog.close()
    return subspace, singularValues, energyRetained
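A hypothetical call sketch (the words, embeddings, and weights below are invented; assumes numpy and WPCA are imported as above):

import numpy as np

words = ["neural", "network", "model"]
emb = np.random.RandomState(1).rand(3, 50)  # one 50-d vector per word
freqs = np.array([3.0, 1.0, 2.0])           # e.g. word counts used as weights

subspace, svals, energy = wpca_subspace(words, emb, freqs, vector_dim=50,
                                        mean_centering=False, numComponents=2,
                                        debugInfo=["doc", "42"])
print(subspace.shape, energy)  # (2, 50) and the fraction of variance retained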
def get_pca(input_: Array,
            learn_input: Array,
            learn_weight_vec: Opt[Array],
            n_comp_list: Iterable[int],
            err_printer: Opt[Callable[[Array, Array, str], None]] = None,
            normalize_x: bool = True,
            normalize_z: bool = False) -> LinearAnalyzer:
    """ The last from ``n_comp_list`` would be returned. """
    def expl(pca_):
        return np.round(np.sum(pca_.explained_variance_ratio_), 2)

    n_comp_list = list(n_comp_list)

    x = x_normalized = learn_input  # (~6000, ~162)
    weight_vec = learn_weight_vec
    μ_x: Union[Array, int] = 0
    σ_x: Union[Array, int] = 1
    if normalize_x:
        x_normalized, μ_x, σ_x = get_x_normalized_μ_σ(x, weight_vec)
    weight_vec_as_mat = (weights_matrix(weight_vec, x)
                         if weight_vec is not None else None)

    for j, i in enumerate(n_comp_list):
        pca = ClassWPCA(i)
        pca.fit(x_normalized, weights=weight_vec_as_mat)
        z: Array = pca.transform(x_normalized)

        inverse_transform_matrix, μ_z, σ_z = get__inverse_transform_matrix__μ_z__σ_z(
            z, weight_vec, normalize_z, x_normalized)

        an = LinearAnalyzer(n=pca.n_components,
                            analyzer=pca,
                            x=input_,
                            μ_x=μ_x,
                            σ_x=σ_x,
                            μ_z=μ_z,
                            σ_z=σ_z,
                            inverse_transform_matrix=inverse_transform_matrix,
                            normalize_x=normalize_x,
                            normalize_z=normalize_z)

        if err_printer is not None:
            pref = f"Expl = {expl(pca)}, PC N = {pca.n_components}, "
            err_printer(input_, an.x_rec, pref)

        if (j + 1) == len(n_comp_list):
            break
    else:
        # The loop body never ran, so the list was empty.
        raise ValueError('Empty n_comp_list')
    return an
Example #4
class CleanSpectra(object):
    def __init__(self,
                 min_wavelength=3500,
                 max_wavelength=8300,
                 max_masked_fraction=1.0):
        self.min_wavelength = min_wavelength
        self.max_wavelength = max_wavelength
        self.max_masked_fraction = max_masked_fraction

    def load_data(self, h5file, selection=None):
        if not isinstance(selection, slice):
            selection = slice(selection)

        datafile = h5py.File(h5file, 'r')
        wavelengths = 10**datafile['log_wavelengths'][:]
        mask = ((wavelengths >= self.min_wavelength) &
                (wavelengths <= self.max_wavelength))
        self.wavelengths = wavelengths[mask]
        self.spectra = datafile['spectra'][selection, mask]
        self.weights = datafile['ivars'][selection, mask]
        datafile.close()

        # remove rows with excessive missing data
        good_rows = (self.weights == 0).mean(1) < self.max_masked_fraction
        self.spectra = self.spectra[good_rows]
        self.weights = self.weights[good_rows]
        self.weights **= 0.5  # convert inverse variances to 1/sigma weights
        return self

    def fit_wpca(self, n_components=200, regularization=False):
        self.wpca = WPCA(n_components=n_components,
                         regularization=regularization)
        self.wpca.fit(self.spectra, weights=self.weights)
        return self

    def reconstruct(self, spectra=None, weights=None, p=2):
        if spectra is None:
            spectra = self.spectra
        if weights is None:
            weights = self.weights

        new_spectra = self.wpca.reconstruct(spectra, weights=weights)
        # Blend by signal-to-noise: high-S/N pixels keep the observed flux,
        # low-S/N pixels lean on the WPCA reconstruction.
        SN = abs(spectra * weights)**(1. / p)
        SN /= SN.max(1, keepdims=True)
        return SN * spectra + (1 - SN) * new_spectra
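A hypothetical end-to-end call (the HDF5 file name is invented; the `log_wavelengths`, `spectra`, and `ivars` dataset names are the ones the class reads above):

cleaner = (CleanSpectra(min_wavelength=3500, max_wavelength=8300,
                        max_masked_fraction=0.5)
           .load_data('sdss_spectra.h5', selection=1000)  # first 1000 spectra
           .fit_wpca(n_components=50))
denoised = cleaner.reconstruct()  # same shape as cleaner.spectra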
Example #5
def component_removal(data, n_comp):
    mean = data.mean(axis=1)
    data = data.sub(mean, axis=0)

    dataT = data.T.values

    # Weight finite entries 1 and NaNs 0 so missing values are ignored.
    weights = 0 + np.isfinite(dataT)
    kwds = {'weights': weights}

    pca = WPCA(n_components=30).fit(dataT, **kwds)  # fit the weighted PCA model

    # Reconstruct from components n_comp onward, i.e. with the first
    # n_comp principal components removed.
    reconstruction = np.dot(
        pca.transform(dataT)[:, n_comp:], pca.components_[n_comp:, :])
    reconst_df = pd.DataFrame(data=reconstruction.T,
                              columns=data.columns,
                              index=data.index)
    reconst_df = reconst_df.add(mean, axis=0)

    return reconst_df
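A minimal sketch of calling it (hypothetical DataFrame; the input must be large enough in both directions for the hard-coded 30 components):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(40, 60))        # 40 features x 60 samples
cleaned = component_removal(df, n_comp=1)  # strip the first principal component
print(cleaned.shape)                       # (40, 60), same shape as the input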
Example #7
def weighted_PCA(df, n_pc=1, standardize=True):
    '''
    Perform weighted PCA (WPCA, with optional sklearn standardization).

    df - Dataframe with expression values
    '''
    x = df.values.T  # transpose: rows become samples, columns features
    if standardize:
        # Standardize the data (center to the mean, scale to unit variance).
        standardizer = StandardScaler()
        x2 = standardizer.fit_transform(x)
    else:
        x2 = x

    # Replace NaNs with 0 so the array is accepted by the PCA function.
    x2 = np.nan_to_num(x2)

    weights = 0 + np.isfinite(x)
    kwds = {'weights': weights}
    n_pcs = min(df.shape[0], n_pc)
    pca = WPCA(n_components=n_pcs).fit(x2, **kwds)  # fit the weighted PCA model
    expl = pca.explained_variance_ratio_
    # Transform the data (apply the dimensionality reduction); x3 holds the
    # principal-component scores.
    x3 = pca.transform(x2, **kwds)
    # Build a dataframe of PCA values with the PC number as the column index.
    out_df = pd.DataFrame(x3.T, index=list(range(1, n_pcs + 1)),
                          columns=df.columns).T

    cont = pd.DataFrame(index=df.index)
    for i in range(n_pcs):
        cont.loc[:, f'PC{i+1} contribution'] = pca.components_[i]**2
    cont.sort_values(by='PC1 contribution', ascending=False, inplace=True)

    while n_pcs < n_pc:
        expl = np.append(expl, float('NaN'))
        n_pcs += 1
        out_df.loc[:, str(n_pcs)] = float('NaN')

    return out_df, expl, cont
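A hypothetical call (gene-by-sample DataFrame invented for illustration; assumes the numpy/pandas/StandardScaler/WPCA imports the function uses):

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(100, 12),
                  index=[f'gene_{i}' for i in range(100)],
                  columns=[f'sample_{j}' for j in range(12)])

pcs, explained, contributions = weighted_PCA(df, n_pc=3)
print(pcs.shape)             # (12, 3): one row per sample, one column per PC
print(explained)             # variance ratio explained by each PC
print(contributions.head())  # squared loadings, sorted by PC1 contribution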
Example #8
def test_copy_data():
    rand = np.random.RandomState(0)
    X = rand.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=100)
    W = rand.rand(*X.shape)
    X_orig = X.copy()

    # with copy_data=True, X should not change
    pca1 = WPCA(copy_data=True)
    pca1.fit(X, weights=W)
    assert np.all(X == X_orig)

    # with copy_data=False, X should be overwritten
    pca2 = WPCA(copy_data=False)
    pca2.fit(X, weights=W)
    assert not np.allclose(X, X_orig)

    # all results should match
    assert_allclose(pca1.mean_, pca2.mean_)
    assert_allclose(pca1.components_, pca2.components_)
    assert_allclose(pca1.explained_variance_, pca2.explained_variance_)
Example #10
  ####
  # Half axes
  axes = [10.0, 1.0]
  # Ellipse rotation [rad]
  angles = [np.pi * 0.1]
  # Origin shift
  orig = [5.0, -3.0]
  x, w = elipsoid(axes=axes, angles=angles, orig=orig, n=400)

  ax[0, 1].plot(x[:, 0], x[:, 1], 'o')

  # PCA
  kwds = {}
  ncomp = 2
  pca = WPCA(n_components=ncomp).fit(x, **kwds)
  Y = WPCA(n_components=ncomp).fit_reconstruct(x, **kwds)
  means_ = pca.mean_
  sigmas_ = np.sqrt(pca.explained_variance_)
  vectors_ = pca.components_[:ncomp]
  print("Components \n", vectors_)
  print("Sigmas", sigmas_)
  print("Means", means_)

  # Not used here
  # ax[1, 1].plot(np.arange(1, ncomp+1), pca.explained_variance_ratio_)

  plotPCA(ax[1, 1], pca, x)

  fig.suptitle("Test PCA, WPCA", fontsize=16)
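The `elipsoid` and `plotPCA` helpers are not shown; a self-contained sketch of the same check (sample a rotated, shifted ellipse and confirm WPCA recovers its axes and center) might look like:

import numpy as np
from wpca import WPCA

t = np.linspace(0, 2 * np.pi, 400)
pts = np.stack([10.0 * np.cos(t), 1.0 * np.sin(t)], axis=1)  # half axes 10, 1
rot = np.pi * 0.1
R = np.array([[np.cos(rot), -np.sin(rot)],
              [np.sin(rot), np.cos(rot)]])
pts = pts @ R.T + np.array([5.0, -3.0])  # rotate, then shift the origin

pca = WPCA(n_components=2).fit(pts)
print(np.sqrt(pca.explained_variance_))  # ~ half axes / sqrt(2) for uniform t
print(pca.mean_)                         # ~ [5.0, -3.0]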
def main():

    # requires  n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    V = final_params - start_params
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''

    result = do_pca(cma_args.n_components,
                    cma_args.n_comp_to_use,
                    traj_params_dir_name,
                    intermediate_data_dir,
                    proj=False,
                    origin="mean_param",
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size,
                    reuse=True)
    logger.debug("after pca")

    final_plane = result["first_n_pcs"]

    count_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "total_num_dumped")
    total_num = pd.read_csv(count_file, header=None).values[0]

    all_param_iterator = get_allinone_concat_df(
        dir_name=traj_params_dir_name,
        use_IPCA=True,
        chunk_size=cma_args.pc1_chunk_size)
    unduped_angles_along_the_way = []
    duped_angles_along_the_way = []
    diff_along = []

    unweighted_pc1_vs_V_angles = []
    duped_pc1_vs_V_angles = []
    pc1_vs_V_diffs = []

    unweighted_ipca = IncrementalPCA(
        n_components=cma_args.n_comp_to_use)  # incremental fit, chunk by chunk

    all_matrix_buffer = []

    try:
        i = -1
        for chunk in all_param_iterator:
            i += 1
            if i >= 2:
                break
            chunk = chunk.values
            unweighted_ipca.partial_fit(chunk)
            unweighted_angle = cal_angle_between_nd_planes(
                final_plane,
                unweighted_ipca.components_[:cma_args.n_comp_to_use])
            unweighted_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V, unweighted_ipca.components_[0]))

            unweighted_pc1_vs_V_angles.append(unweighted_pc1_vs_V_angle)

            #TODO ignore 90 or 180 for now
            if unweighted_angle > 90:
                unweighted_angle = 180 - unweighted_angle
            unduped_angles_along_the_way.append(unweighted_angle)

            np.testing.assert_almost_equal(
                cal_angle_between_nd_planes(
                    unweighted_ipca.components_[:cma_args.n_comp_to_use][0],
                    final_plane[0]),
                cal_angle(
                    unweighted_ipca.components_[:cma_args.n_comp_to_use][0],
                    final_plane[0]))

            all_matrix_buffer.extend(chunk)

            weights = gen_weights(all_matrix_buffer,
                                  Funcs[cma_args.func_index_to_use])
            logger.log(f"currently at {all_param_iterator._currow}")
            # ipca = PCA(n_components=1)
            # ipca.fit(duped_in_so_far)
            wpca = WPCA(n_components=cma_args.n_comp_to_use)
            tic = time.time()
            wpca.fit(all_matrix_buffer, weights=weights)
            toc = time.time()

            logger.debug(
                f"WPCA of {len(all_matrix_buffer)} data points took {toc - tic} secs")
            duped_angle = cal_angle_between_nd_planes(
                final_plane, wpca.components_[:cma_args.n_comp_to_use])

            duped_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V, wpca.components_[0]))
            duped_pc1_vs_V_angles.append(duped_pc1_vs_V_angle)
            pc1_vs_V_diffs.append(duped_pc1_vs_V_angle -
                                  unweighted_pc1_vs_V_angle)

            #TODO ignore 90 or 180 for now
            if duped_angle > 90:
                duped_angle = 180 - duped_angle
            duped_angles_along_the_way.append(duped_angle)
            diff_along.append(unweighted_angle - duped_angle)
    finally:
        plot_dir = get_plot_dir(cma_args)
        if not os.path.exists(plot_dir):
            os.makedirs(plot_dir)

        angles_plot_name = f"WPCA" \
                           f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(duped_angles_along_the_way)),
                duped_angles_along_the_way, "num of chunks",
                "angle with diff in degrees", False)

        angles_plot_name = f"Not WPCA exponential 2" \
                           f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(unduped_angles_along_the_way)),
                unduped_angles_along_the_way, "num of chunks",
                "angle with diff in degrees", False)


        angles_plot_name = f"Not WPCA - WPCA diff_along exponential 2," \
                           f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
        plot_2d(plot_dir, angles_plot_name, np.arange(len(diff_along)),
                diff_along, "num of chunks", "angle with diff in degrees",
                False)

        angles_plot_name = f"PC1 VS VWPCA PC1 VS V" \
                           f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(duped_pc1_vs_V_angles)), duped_pc1_vs_V_angles,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = f"PC1 VS VNot WPCA PC1 VS V" \
                           f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(unweighted_pc1_vs_V_angles)),
                unweighted_pc1_vs_V_angles, "num of chunks",
                "angle with diff in degrees", False)


        angles_plot_name = f"PC1 VS VNot WPCA - WPCA diff PC1 VS V" \
                           f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
        plot_2d(plot_dir, angles_plot_name, np.arange(len(pc1_vs_V_diffs)),
                pc1_vs_V_diffs, "num of chunks", "angle with diff in degrees",
                False)

        del all_matrix_buffer
        import gc
        gc.collect()
Example #12
    def sketch(self, matrix, epochs=5, dim=80, verbose=False):
        """
        Estimate the word embeddings.

        Parameters:
        - scipy.sparse.coo_matrix matrix: co-occurrence matrix
        - int epochs: number of training epochs
        - int dim: sketch dimension
        - bool verbose: print progress messages if True
        """

        shape = matrix.shape

        if len(shape) != 2 or shape[0] != shape[1]:
            raise Exception('Co-occurrence matrix must be square')

        if not sp.isspmatrix_coo(matrix):
            raise Exception('Co-occurrence matrix must be in the COO format')

        use_svd = True

        for epoch in range(epochs):
            shape = matrix.shape
            if use_svd:
                # Randomized range finder: project onto a random sketch,
                # orthonormalize, then SVD the small projected matrix.
                gamma = np.random.random((shape[1], dim))
                Y = matrix.dot(gamma)   # range sketch, (N, dim)
                Q, R = np.linalg.qr(Y)  # (N, dim), A ~ Q @ Q.T @ A
                C = matrix.dot(Q).T     # (dim, N)

                # Truncated SVD
                Uc, sc, Vhc = scipy.linalg.svd(C, full_matrices=False)

                U_matrix = Q.dot(Uc)
                #sketch_matrix = matrix.dot(sketch)
                #sketch_matrix[np.isclose(sketch_matrix, 0)] = -1e8
                #U, s, Vh = scipy.linalg.svd(sketch_matrix, full_matrices=False)

                # Square root singular value
                self.word_vectors = U_matrix.dot(np.sqrt(np.diag(sc)))

                # # Normalized version
                # norms = np.sqrt(np.sum(np.square(U_matrix), axis=1, keepdims=True))
                # U_matrix /= np.maximum(norms, 1e-7)
                # self.word_vectors = U_matrix

            else:
                log_matrix = sp.coo_matrix(matrix)
                log_matrix.data = np.log(log_matrix.data)

                sketch = np.random.random((shape[1], dim))
                compressed = log_matrix.dot(sketch)
                compressed[np.isclose(compressed, 0)] = -1e8
                weights = matrix.dot(sketch)
                weights += np.random.random(weights.shape) * 0.01
                t_start = time.time()
                Y = WPCA(n_components=dim).fit_reconstruct(compressed,
                                                           weights=weights)
                print(f"PCA time: {time.time() - t_start:03f}")
                self.word_vectors = Y
            if not np.isfinite(self.word_vectors).all():
                raise Exception('Non-finite values in word vectors.')
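A self-contained sketch of the randomized truncated SVD used in the `use_svd` branch (a hypothetical dense symmetric matrix stands in for the sparse co-occurrence matrix):

import numpy as np
import scipy.linalg

rng = np.random.RandomState(0)
A = rng.rand(500, 500)
A = A + A.T  # symmetric, like a co-occurrence matrix

dim = 80
gamma = rng.rand(A.shape[1], dim)  # random sketch matrix
Q, _ = np.linalg.qr(A @ gamma)     # orthonormal range basis, A ~ Q @ Q.T @ A
Uc, sc, Vhc = scipy.linalg.svd((A @ Q).T, full_matrices=False)
U = Q @ Uc
vectors = U @ np.sqrt(np.diag(sc))  # square-root-singular-value embedding
print(vectors.shape)                # (500, 80)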
Example #13
    def fit_wpca(self, n_components=200, regularization=False):
        self.wpca = WPCA(n_components=n_components,
                         regularization=regularization)
        self.wpca.fit(self.spectra, weights=self.weights)
        return self
def main():

    # requires  n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''

    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    count_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "total_num_dumped")
    total_num = pd.read_csv(count_file, header=None).values[0]

    V = final_params - start_params

    all_thetas_downsampled = get_allinone_concat_df(
        dir_name=traj_params_dir_name).values[::2]

    unduped_angles_along_the_way = []
    duped_angles_along_the_way = []
    diff_along = []
    num = 2  # TODO: hardcoded!
    undup_ipca = PCA(n_components=1)  # for sparse PCA to speed up

    all_matrix_buffer = []
    for chunk in all_param_iterator:
        chunk = chunk.values
        undup_ipca.partial_fit(chunk)
        unduped_angle = cal_angle(V, undup_ipca.components_[0])

        #TODO ignore 90 or 180 for now
        if unduped_angle > 90:
            unduped_angle = 180 - unduped_angle
        unduped_angles_along_the_way.append(unduped_angle)

        all_matrix_buffer.extend(chunk)

        weights = gen_weights(all_param_iterator._currow, total_num)
        duped_in_so_far = dup_so_far_buffer(all_matrix_buffer, last_percentage,
                                            num)

        logger.log(
            f"currently at {all_param_iterator._currow}, last_percentage: {last_percentage}"
        )
        # ipca = PCA(n_components=1)  # for sparse PCA to speed up
        # ipca.fit(duped_in_so_far)
        ipca = WPCA(n_components=cma_args.n_comp_to_use)
        for i in range(0, len(duped_in_so_far), cma_args.chunk_size):
            logger.log(
                f"partial fitting: i : {i} len(duped_in_so_far): {len(duped_in_so_far)}"
            )
            if i + cma_args.chunk_size > len(duped_in_so_far):
                ipca.partial_fit(duped_in_so_far[i:])
            else:
                ipca.partial_fit(duped_in_so_far[i:i + cma_args.chunk_size])

        duped_angle = cal_angle(V, ipca.components_[0])

        #TODO ignore 90 or 180 for now
        if duped_angle > 90:
            duped_angle = 180 - duped_angle
        duped_angles_along_the_way.append(duped_angle)
        diff_along.append(unduped_angle - duped_angle)

    plot_dir = get_plot_dir(cma_args)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    angles_plot_name = f"duped exponential 2, num dup: {num}" \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
    plot_2d(plot_dir, angles_plot_name,
            np.arange(len(duped_angles_along_the_way)),
            duped_angles_along_the_way, "num of chunks",
            "angle with diff in degrees", False)

    angles_plot_name = f"unduped exponential 2, num dup: {num}" \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
    plot_2d(plot_dir, angles_plot_name,
            np.arange(len(unduped_angles_along_the_way)),
            unduped_angles_along_the_way, "num of chunks",
            "angle with diff in degrees", False)


    angles_plot_name = f"undup - dup diff_along exponential 2, num dup: {num}" \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size} "
    plot_2d(plot_dir, angles_plot_name, np.arange(len(diff_along)), diff_along,
            "num of chunks", "angle with diff in degrees", False)

    del all_matrix_buffer
    import gc
    gc.collect()
Example #15
def getellipse(histo, ratioECut=0.95, factorSigma=2.0):
    """
    ratioECut : max energy ratio to select
    factorSigma : fraction of the gaussian integral
                      1 sigma ~ 63 %
                      2 sigma ~ 95 %
                      3 sigma ~ 99.7 %
    """
    # Warning 0 : Transpose to have i - > X, j -> Y
    # histo = histo.T

    # Total energy cut of the cluster
    ecut = np.sum(histo) * (1. - ratioECut)

    # Find pixel ecut 'pcut'
    ind = np.where( histo > 0. )
    a = histo[ind]
    a = np.sort( a )
    # Find pixel ecut 'pcut'
    s = 0.0
    i = 0
    while s < ecut:
        s += a[i]
        i += 1
    pcut = s
    # print ("getellipse ecut, pcut", ecut, pcut)

    # Remove pixel < pcut
    ind = np.where( histo > pcut )
    # ??? x = np.where( a >= pcut, a, 0 )
    x = np.array(ind, dtype=np.float32)
    # Debug
    # print ( x.T )
    w = np.sqrt( histo[ ind] )
    w = [ w, w ]
    w = np.transpose( w )
    # Debug
    # print (w)
    # Debug lin. regression
    """
    slope, intercept, r_value, p_value, std_err = stats.linregress(x[0],x[1])
    print("slope", slope, intercept)
    xmin = np.min(x[0])
    xmax = np.max(x[0])
    xs = np.array([xmin, xmax])
    ys = xs*slope +intercept
    plt.scatter(x[0], x[1])
    plt.plot(xs, ys)
    plt.xlim(0, 256)
    plt.ylim(0, 256)
    plt.show()
    """
    # PCA
    kwds = {'weights': w}
    ncomp = 2
    # Warning 0 : transpose
    pca = WPCA(n_components=ncomp).fit( np.transpose(x), **kwds)
    # Debug : compute covariance
    """
    print("Shape x, : ", x.shape, w.shape )
    cov = np.cov( x  ) # , aweights=w[:,0] )
    print("cov: ", cov)
    eigVal, eigVec = np.linalg.eig( cov)
    print("eig: ", eigVal)
    print("eig. vect: ", eigVec)
    """
    orig_ =  pca.mean_
    axes_ =  factorSigma * np.sqrt(pca.explained_variance_)
    vectors_ = pca.components_[:ncomp]
    # ellipse rotation
    # Debug
    # print( "sin=", vectors_[0][1], "cos=", vectors_[0][0] )
    angles_ = np.array( [np.arctan2( vectors_[0][1], vectors_[0][0]) ]  )
    """ DEBUG
    print("PCA Components \n", vectors_)
    print("PCA Sigmas, half axes", axes_ )
    print("PCA Means/ origin", orig_ )
    print("PCA Angles, ellipse rotation",  angles_ * 180.0 / np.pi )
    """

    # BBox
    # vectors_[1] is the smallest
    if np.abs(vectors_[0][0]) <= 10e-7:
        xmin = -axes_[0] + orig_[0]
        xmax = +axes_[0] + orig_[0]
        ymin = -axes_[1] + orig_[1]
        ymax = +axes_[1] + orig_[1]
    else:
        tgR =  vectors_[0][1] / vectors_[0][0]
        #  Y
        # ---
        # Derivate dx/dt = 0
        theta = np.array( [np.arctan2( - axes_[1] * tgR, axes_[0] ) ] )
        xmin = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta)[0][0]
        xmax = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta + np.pi)[0][0]
        if xmin > xmax:
            xmin, xmax = xmax, xmin
        # Debug
        #print ("PCA theta dX/dTheta, xmin, xmax = 0", theta* 180.0 / np.pi, xmin, xmax)
        #
        #  Y
        # ---
        # Derivate dy/dt = 0
        theta = np.array([np.arctan2(axes_[1], axes_[0] * tgR)])
        ymin = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta)[0][1]
        ymax = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta + np.pi)[0][1]
        if ymin > ymax:
            ymin, ymax = ymax, ymin
        # Debug
        # print ("PCA theta dY/dTheta, ymin, ymax = 0", theta* 180.0 / np.pi, ymin, ymax)

    # Warning 0 : inverse transpose to have in pixel or  matrix indices
    xmin = max( 0, xmin  )
    ymin = max( 0, ymin  )
    xmax = min( 255, xmax  )
    ymax = min( 255, ymax  )
    bbox = np.array( [xmin, xmax, ymin, ymax], dtype=np.float32 )
    # angles_ = angles_ - np.pi/2
    axes_ = np.array( [ axes_[0], axes_[1] ], dtype=np.float32 )
    orig_ = np.array( [ orig_[0], orig_[1]  ], dtype=np.float32 )
    # print("Angle :", angles_*180/np.pi)
    return bbox, axes_, angles_, orig_
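A hypothetical call (a synthetic 256x256 histogram holding one elongated Gaussian blob; assumes numpy and the `ellipse` helper used above are in scope):

import numpy as np

rng = np.random.RandomState(0)
pts = rng.multivariate_normal([120, 100], [[400, 150], [150, 80]], size=5000)
histo, _, _ = np.histogram2d(pts[:, 0], pts[:, 1],
                             bins=256, range=[[0, 256], [0, 256]])

bbox, axes_, angles_, orig_ = getellipse(histo, ratioECut=0.95, factorSigma=2.0)
print("bbox", bbox)                          # [xmin, xmax, ymin, ymax]
print("angle [deg]", angles_ * 180 / np.pi)  # ellipse rotation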
Example #17
    def benchmark_complete(data, ending_density=.02, step=.01):
        '''
        Input: data array to benchmark on, the ending density at which to stop,
        and the step between imputed density levels

        Output: dataframes of output density and RMSE for each method with
        respect to each input density
        '''
        # In each iteration, randomly remove the minimum value greater than
        # zero, checking the density as we go.
        nonzeroscount = np.count_nonzero(data)
        sizel = data.shape
        totalentr = sizel[0] * sizel[1]
        end = ending_density  # final density to test
        begin = nonzeroscount / totalentr  # beginning density of the given matrix

        # initialize lists to store results
        density_in = []
        RMSE_empca_scores = []
        RMSE_wpca_scores = []
        RMSE_sfi_scores = []
        RMSE_siv_scores = []
        RMSE_sni_scores = []
        RMSE_smi_scores = []
        RMSE_szi_scores = []
        RMSE_wmiC_scores = []
        RMSE_wmiP_scores = []
        Density_empca = []
        Density_wpca = []
        Density_sfi = []
        Density_siv = []
        Density_sni = []
        Density_smi = []
        Density_szi = []
        Density_wmiC = []
        Density_wmiP = []

        # randomly remove values from the known matrix and try to impute them

        for d in reversed(np.arange(end, begin, step)):
            otum = data.T.copy()

            #begin density check
            nonzeroscount = np.count_nonzero(otum)
            sizel = otum.shape
            totalentr = sizel[0] * sizel[1]

            while np.float64(nonzeroscount / totalentr) > d:
                # remove a minimum-frequency OTU, then check the density
                j = np.random.randint(0, len(otum) - 1)
                # make sure the row is not all zero (an all-zero row causes a singular matrix)
                if sum(list(otum[j][:])) < 1:
                    continue
                m = min(i for i in list(otum[j][:]) if i > 0)
                # make sure removing the value will not result in a zero row
                if sum(list(otum[j][:])) == m:
                    continue
                otum[j][list(otum[j][:]).index(m)] = 0
                # check the density to break
                nonzeroscount = float(np.count_nonzero(otum))
                sizel = otum.shape
                totalentr = float(sizel[0]) * float(sizel[1])

            # coerce to float and print the new density
            print("Data table of %f generated" % d)
            otum = otum.T.astype(np.float64)

            # make zero unknown for fancy impute, avoid singular matrix by taking transpose
            otum2 = otum.T.copy()
            otum2 = otum2.astype(np.float64)
            otum2[otum2 == 0] = np.nan  #make unknown nan

            # WPCA and EMPCA

            # build the weight matrix: low weight (1) for missing entries,
            # high weight (1000) for observed entries
            weight = otum.copy()
            for i in range(len(otum2.T)):
                for j in range(len(otum2.T[i])):
                    if np.isnan(otum2.T[i][j]):
                        weight[i][j] = 1
                    else:
                        weight[i][j] = 1000

            print("Running EMPCA")
            EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
            print("Running WPCA")
            WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

            # fancy impute and zeros
            print("Nuclear Norm")
            sni = NuclearNormMinimization(min_value=(np.amin(otum2)),
                                          max_value=(np.amax(otum2))).complete(
                                              otum2.copy())
            print("Running Soft Impute")
            sfi = SoftImpute(shrinkage_value=None,
                             convergence_threshold=0.00001,
                             max_iters=1000,
                             max_rank=min(otum2.shape),
                             n_power_iterations=1,
                             init_fill_method="zero",
                             min_value=(np.amin(otum2)),
                             max_value=(np.amax(otum2)),
                             normalizer=None,
                             verbose=False).complete(otum2.copy())
            print("Running Iterative SVD")
            siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                               convergence_threshold=0.00001,
                               max_iters=1000,
                               gradual_rank_increase=True,
                               svd_algorithm="arpack",
                               init_fill_method="zero",
                               min_value=(np.amin(otum2)),
                               max_value=(np.amax(otum2)),
                               verbose=False).complete(otum2.copy())
            print("Running Matrix Factorization")
            smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                      initializer=np.random.randn,
                                      learning_rate=0.01,
                                      patience=5,
                                      l1_penalty=0.05,
                                      l2_penalty=0.05,
                                      min_improvement=0.01,
                                      max_gradient_norm=5,
                                      optimization_algorithm="adam",
                                      min_value=(np.amin(otum2)),
                                      max_value=(np.amax(otum2)),
                                      verbose=False).complete(otum2.copy())
            print("Imputing by filling with zeros for base comparison")
            szi = base.zeros(otum2.copy())
            print("Weighted Mean Interpolation without phylo-distance")
            wmiC = base.wmi_wrapper(X=otum2.copy())
            print("Weighted Mean Interpolation with phylo-distance")
            phylo = pd.read_csv(
                'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv'
            )
            wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

            # save the results

            #density in (after removed values)
            density_in.append(error.get_density(otum))

            # density imputed
            Density_empca.append(error.get_density(EMPCAi))
            Density_wpca.append(error.get_density(WPCAi))
            Density_sfi.append(error.get_density(sfi))
            Density_siv.append(error.get_density(siv))
            Density_sni.append(error.get_density(sni))
            Density_smi.append(error.get_density(smi))
            Density_szi.append(error.get_density(szi))
            Density_wmiC.append(error.get_density(wmiC))
            Density_wmiP.append(error.get_density(wmiP))

            # RMSE of imputed values
            # mask so RMSE is only computed on the entries that were removed and imputed
            missing_mask = np.isnan(otum2.T)
            RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
            RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
            RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
            RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
            RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
            RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
            RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
            RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
            RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

        RMSEmapping = pd.DataFrame({
            'Density': list(map(int, density_in)),
            'EMPCA': RMSE_empca_scores,
            'Matrix Factorization': RMSE_smi_scores,
            'WPCA': RMSE_wpca_scores,
            'Soft Impute': RMSE_sfi_scores,
            'Iterative SVD': RMSE_siv_scores,
            'Nuclear Norm Minimization': RMSE_sni_scores,
            'Zeros Replace Unknown': RMSE_szi_scores,
            'Weighted-Mean Interpolation Correlation': RMSE_wmiC_scores,
            'Weighted-Mean Interpolation Phylo': RMSE_wmiP_scores
        })
        RMSEmapping.set_index(['Density'], inplace=True)
        Out_density = pd.DataFrame({
            'density': list(map(int, density_in)),
            'EMPCA': Density_empca,
            'Matrix Factorization': Density_smi,
            'WPCA': Density_wpca,
            'Soft Impute': Density_sfi,
            'Iterative SVD': Density_siv,
            'Nuclear Norm Minimization': Density_sni,
            'Zeros Replace Unknown': Density_szi,
            'Weighted-Mean Interpolation Correlation': Density_wmiC,
            'Weighted-Mean Interpolation Phylo': Density_wmiP
        })
        Out_density.set_index(['density'], inplace=True)

        return Out_density, RMSEmapping
def eigensampleFromWPCA(matrix):
    '''Find a representation of each sample using wPCA to exclude NaNs.'''
    weights = 1.0 - np.isnan(matrix.T)  # weight 0 for NaN entries, 1 otherwise
    pc = WPCA(n_components=1).fit_reconstruct(matrix.T, weights=weights)
    return pc.T
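A minimal sketch of calling it (hypothetical matrix with missing entries; assumes numpy and WPCA are imported as above):

import numpy as np
from wpca import WPCA

rng = np.random.RandomState(0)
matrix = rng.rand(30, 8)
matrix[rng.rand(30, 8) < 0.1] = np.nan  # roughly 10% missing entries

rank1 = eigensampleFromWPCA(matrix)
print(rank1.shape)  # (30, 8): rank-1 reconstruction with NaNs filled in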