Example 1
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.log("FA")

        cls.X_lo = cls.X()[:10, :]
        df = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
        cls._spark_lo = TestDimredAPI.spark().createDataFrame(df)

        # fit and transform in a single step
        cls.fa = FactorAnalysis(cls.spark(), 2, cls.features(), max_iter=5)
        cls.trans = cls.fa.fit_transform(cls._spark_lo)
        cls.trans = split_vector(cls.trans.data.select(FEATURES__),
                                 FEATURES__).toPandas().values
        model = cls.fa.model
        cls.W = model.loadings
        cls.ll = model.loglikelihood
        cls.psi = model.error_vcov

        # fit and transform separately to exercise the two-step code path
        cls.fa.fit(cls._spark_lo)
        cls.fittransform_data = cls.fa.transform(cls._spark_lo)
        cls.fittransform_data = split_vector(
            cls.fittransform_data.data.select(FEATURES__),
            FEATURES__).toPandas().values

        # reference fit with sklearn's implementation
        cls.sk_fa = sklearn.decomposition.FactorAnalysis(n_components=2,
                                                         max_iter=5,
                                                         random_state=23)
        cls.sk_fit = cls.sk_fa.fit(cls.X_lo)
        cls.sk_trans = cls.sk_fit.transform(cls.X_lo)
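Every example on this page pipes a Spark vector column through split_vector before collecting to pandas. The helper itself is not shown here; the following is a minimal sketch of what it presumably does, built on pyspark.ml.functions.vector_to_array (Spark >= 3.0). The name split_vector_sketch, the prefix rule (first letter of the column name, giving f_0, f_1, ... for features and p_0, p_1, ... for probabilities, as seen in Examples 5, 11 and 14) and the internals are assumptions for illustration, not the library's actual implementation.

import pyspark.sql.functions as F
from pyspark.ml.functions import vector_to_array
from pyspark.sql import DataFrame


def split_vector_sketch(data: DataFrame, column: str) -> DataFrame:
    # Infer the vector length from the first row.
    n_components = len(data.select(column).first()[0])
    # Scalar-column prefix: first letter of the vector column's name,
    # e.g. "features" -> f_0, f_1, ... and "probability" -> p_0, p_1, ...
    prefix = column.strip("_")[0].lower()
    arr = vector_to_array(F.col(column))
    others = [c for c in data.columns if c != column]
    return data.select(
        *others,
        *[arr[i].alias("{}_{}".format(prefix, i)) for i in range(n_components)])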
Example 2
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.log("ICA")

        cls.X_lo = cls.X()[:10, :]
        cls.X_lo = scale(cls.X_lo)
        df = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
        cls._spark_lo = TestDimredAPI.spark().createDataFrame(df)

        cls.ica = ICA(cls.spark(), 2, cls.features())
        cls.trans = cls.ica.fit_transform(cls._spark_lo)
        cls.trans = split_vector(cls.trans.data.select(FEATURES__), FEATURES__) \
            .toPandas().values
        model = cls.ica.model
        cls.compo = model.loadings
        cls.W = model.unmixing
        cls.K = model.whitening

        cls.ica.fit(cls._spark_lo)
        cls.fittransform_data = cls.ica.transform(cls._spark_lo)
        cls.fittransform_data = split_vector(
            cls.fittransform_data.data.select(FEATURES__),
            FEATURES__).toPandas().values

        cls.sk_ica = sklearn.decomposition.FastICA(n_components=2,
                                                   algorithm="deflation",
                                                   fun="exp",
                                                   max_iter=5,
                                                   random_state=23)
        cls.sk_fit = cls.sk_ica.fit(cls.X_lo)
        # switch whitening off after the fit so that sklearn's transform()
        # skips its mean-centering step and only applies the components
        cls.sk_fit.whiten = False
        cls.sk_trans = cls.sk_fit.transform(cls.X_lo)
Example 3
    def write_clusters(self, outpath, suff="", sort_me=True):
        outpath = outpath + "-components" + str(suff)
        logger.info("Writing components to: {}".format(outpath))
        mkdir(outpath)
        # split both the feature and the responsibility vectors into
        # scalar columns before writing
        data = split_vector(self.data, FEATURES__)
        data = split_vector(data, RESPONSIBILITIES__)
        self._write_clusters(data, outpath, sort_me)
Example 4
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.log("PCA")

        cls.X_lo = cls.X()[:10, :]
        # we need to scale this here, because sklearn does not do
        # the scaling for transformations
        cls.X_lo = scale(cls.X_lo)
        df = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
        cls._spark_lo = TestDimredAPI.spark().createDataFrame(df)

        cls.pca = PCA(cls.spark(), 2, cls.features())
        cls.trans = cls.pca.fit_transform(cls._spark_lo)
        cls.trans_panda = split_vector(cls.trans.data.select(FEATURES__),
                                       FEATURES__).toPandas()
        cls.trans = cls.trans_panda.values
        model = cls.pca.model
        cls.loadings = model.loadings
        cls.sds = model.sds

        cls.pca.fit(cls._spark_lo)
        cls.fittransform_trans = cls.pca.transform(cls._spark_lo)
        cls.fittransform_trans = split_vector(
            cls.fittransform_trans.data.select(FEATURES__),
            FEATURES__).toPandas().values

        cls.sk_pca = sklearn.decomposition.PCA(n_components=2)
        cls.sk_pca_trans = cls.sk_pca.fit(cls.X_lo).transform(cls.X_lo)
        k = 2
Example 5
    def _plot(self, outfile):
        logger.info("Plotting")
        subsamp = as_pandas(split_vector(sample(self.data, 10000), FEATURES__))
        for suf in ["png", "pdf", "svg", "eps"]:
            scatter(outfile + "-scatter_plot." + suf, subsamp, "f_0", "f_1",
                    "Factor 1", "Factor 2")
            for i in map(lambda x: "f_" + str(x),
                         range(min(10, self.n_factors))):
                histogram(outfile + "-histogram_{}.".format(i) + suf,
                          subsamp[i].values, i)
Example 6
    def write(self, outpath):
        """
        Write a transformed data set to tsv.

        :param outpath: the path to where the files are written.
        """

        outpath = outpath + "-predicted"
        data = drop(self.data, FEATURES__, RAW_PREDICTION__)
        data = split_vector(data, PROBABILITY__)
        write_tsv(data, outpath)
Example 7
    def test_kpca_fourier(self):
        X = self.kpca._preprocess_data(self._spark_lo)
        X = fourier_transform(X, self.kpca.model.fourier_coefficients,
                              self.kpca.model.fourier_offset)
        df = self.spark().createDataFrame(X.rows.map(lambda x: (x,)))
        df = split_vector(df, "_1").toPandas().values
        for i in range(5):
            ax1 = sorted(df[:, i])
            ax2 = sorted(self._X_transformed[:, i])
            assert numpy.allclose(numpy.absolute(ax1),
                                  numpy.absolute(ax2),
                                  atol=1e-01)
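Example 7 compares absolute values because the sign of each recovered component is arbitrary: flipping the sign of an eigenvector (or of an ICA source) yields an equally valid solution, so two implementations can agree on everything except sign. A minimal, self-contained illustration of why the assertion is written this way:

import numpy

a = numpy.array([[0.5, -1.2],
                 [1.5, 0.3]])
b = a * numpy.array([1.0, -1.0])  # same result, second component sign-flipped

assert not numpy.allclose(a, b)                              # raw values differ
assert numpy.allclose(numpy.absolute(a), numpy.absolute(b))  # magnitudes agree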
Example 8
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.log("LDA")
        cls.sk_lda = LinearDiscriminantAnalysis(n_components=2, solver="eigen")
        cls.sk_lda_trans = cls.sk_lda.fit(cls.X(), cls.y()).transform(cls.X())

        cls.lda = LDA(cls.spark(), 2, cls.features(), cls.response())
        cls.trans = cls.lda.fit_transform(cls.spark_df())
        model = cls.lda.model
        cls.evec = model.projection

        cls.fit_tran = cls.lda.fit_transform(cls.spark_df())
        cls.fittransform_data = split_vector(
            cls.fit_tran.data.select(FEATURES__), FEATURES__).toPandas().values
Example 9
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.log("KPCA")

        cls.X_lo = cls.X()[:10, :]
        cls.X_lo = scale(cls.X_lo)
        df = pandas.DataFrame(data=cls.X_lo, columns=cls.features())
        cls._spark_lo = TestDimredAPI.spark().createDataFrame(df)

        cls.sbf_feature = sklearn.kernel_approximation.RBFSampler(
            random_state=23, n_components=5)
        cls._X_transformed = cls.sbf_feature.fit_transform(cls.X_lo)
        cls.sk_pca = PCA(n_components=2).fit(cls._X_transformed)

        cls.kpca = KPCA(cls.spark(), 2, cls.features(), 5, 1.)
        cls.trans = cls.kpca.fit_transform(cls._spark_lo)
        cls.trans = split_vector(cls.trans.data.select(FEATURES__),
                                 FEATURES__).toPandas().values
        model = cls.kpca.model
        cls.evals = model.loadings
        cls.sds = model.sds
        cls.w = model.fourier_coefficients
        cls.b = model.fourier_offset

        cls.kpca.fit(cls._spark_lo)
        cls.fittransform_trans = cls.kpca.transform(cls._spark_lo)
        cls.fittransform_trans = split_vector(
            cls.fittransform_trans.data.select(FEATURES__),
            FEATURES__).toPandas().values

        # The sklearn PCA would subtract the mean here. We don't want that to
        # happen, but want to work on the Fourier matrix directly; setting the
        # mean to None does the trick.
        cls.sk_pca.mean_ = None
        cls.sk_pca.components_ = cls.evals
        cls.sk_pca_trans = cls.sk_pca.transform(cls._X_transformed)
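The fourier_coefficients and fourier_offset used in Examples 7 and 9 correspond to the random Fourier feature construction that RBFSampler draws from: project the data onto random Gaussian directions, shift by uniform offsets and take a scaled cosine, which approximates an RBF kernel feature map (Rahimi and Recht, 2007). Below is a sketch of that map in plain numpy; the scaling mirrors sklearn's RBFSampler, while the exact correspondence to the library's fourier_transform is an assumption.

import numpy


def random_fourier_features(X, n_components=5, gamma=1.0, seed=23):
    # z(x) = sqrt(2 / D) * cos(x W + b), W ~ N(0, 2 * gamma), b ~ U(0, 2 * pi)
    rng = numpy.random.RandomState(seed)
    W = rng.normal(scale=numpy.sqrt(2.0 * gamma),
                   size=(X.shape[1], n_components))
    b = rng.uniform(0.0, 2.0 * numpy.pi, size=n_components)
    return numpy.sqrt(2.0 / n_components) * numpy.cos(X @ W + b)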
Example 10
    def fit(self, data, outpath=None):
        n, p = dimension(data)
        data = data.select(FEATURES__)
        tot_var = self.tot_var(split_vector(data, FEATURES__), outpath)
        self.model = self._fit(KMeansFitProfile(), outpath, data, n, p, tot_var)
        return self
Example 11
    def test_transform_forest_binomial(self):
        df = split_vector(self.transform_bin.data, PROBABILITY__)
        df = df.toPandas()
        assert "p_0" in df.columns.values
        assert "p_1" in df.columns.values
Example 12
    def write_tsv(self, outfolder):
        data = split_vector(self.data, FEATURES__)
        write_tsv(data, outfolder)
Example 13
    def write_clusters(self, outpath, suff="", sort_me=True):
        outpath = outpath + "-clusters" + str(suff)
        logger.info("Writing clusters to: %s", outpath)
        mkdir(outpath)
        data = split_vector(self.data, FEATURES__)
        self._write_clusters(data, outpath, sort_me)
Example 14
    def test_transform_glm_binomial(self):
        df = split_vector(self.transform_bin.data, PROBABILITY__)
        df = df.toPandas()
        assert "prediction" in df.columns.values