Exemple #1
0
    def test_np_spark_compat_frame(self):
        """Compare NumPy ufunc results on pandas frames and their `ks` counterparts.

        Every name in ``unary_np_spark_mappings`` and ``binary_np_spark_mappings``
        that is not listed in ``self.blacklist`` is looked up on ``np`` and applied
        to both a pandas DataFrame and the frame built from it via
        ``ks.from_pandas``; the two results must agree under
        ``assert_eq(..., almost=True)``.

        Raises:
            AssertionError: naming the ufunc whose comparison failed, chained to
                the underlying exception.
        """
        failure_template = "Test in '%s' function was failed."

        # Randomly generated input: 0-99 rows, two integer columns in [-100, 100).
        row_count = np.random.randint(100)
        left_pdf = pd.DataFrame(
            np.random.randint(-100, 100, size=(row_count, 2)),
            columns=["a", "b"],
        )
        # Second frame with the same shape, used for operations across two frames.
        right_pdf = pd.DataFrame(
            np.random.randint(-100, 100, size=left_pdf.shape),
            columns=["a", "b"],
        )
        left_kdf = ks.from_pandas(left_pdf)
        right_kdf = ks.from_pandas(right_pdf)

        # Unary ufuncs.
        for ufunc_name in unary_np_spark_mappings:
            ufunc = getattr(np, ufunc_name)
            if ufunc_name in self.blacklist:
                continue
            try:
                self.assert_eq(ufunc(left_pdf), ufunc(left_kdf), almost=True)
            except Exception as err:
                raise AssertionError(failure_template % ufunc_name) from err

        # Binary ufuncs: frame against itself, then frame against a scalar.
        for ufunc_name in binary_np_spark_mappings:
            ufunc = getattr(np, ufunc_name)
            if ufunc_name in self.blacklist:
                continue
            try:
                self.assert_eq(
                    ufunc(left_pdf, left_pdf),
                    ufunc(left_kdf, left_kdf),
                    almost=True,
                )
                self.assert_eq(ufunc(left_pdf, 1), ufunc(left_kdf, 1), almost=True)
            except Exception as err:
                raise AssertionError(failure_template % ufunc_name) from err

        # Binary ufuncs across two different frames. Only the first five mappings
        # are exercised for now, because enabling 'compute.ops_on_diff_frames'
        # increases the run time too much.
        try:
            set_option("compute.ops_on_diff_frames", True)
            for ufunc_name in list(binary_np_spark_mappings)[:5]:
                ufunc = getattr(np, ufunc_name)
                if ufunc_name in self.blacklist:
                    continue
                try:
                    expected = ufunc(left_pdf, right_pdf).sort_index()
                    actual = ufunc(left_kdf, right_kdf).sort_index()
                    self.assert_eq(expected, actual, almost=True)
                except Exception as err:
                    raise AssertionError(failure_template % ufunc_name) from err
        finally:
            # Always restore the option, whether or not a comparison failed.
            reset_option("compute.ops_on_diff_frames")
Exemple #2
0
    def test_np_spark_compat(self):
        """Compare NumPy ufunc results on pandas columns and their Koalas counterparts.

        Every name in ``unary_np_spark_mappings`` and ``binary_np_spark_mappings``
        that is not in the local skip set is looked up on ``np`` and applied to
        columns of a pandas DataFrame and of the frame built from it via
        ``ks.from_pandas``; the two results must agree under
        ``assert_eq(..., almost=True)``.

        Raises:
            AssertionError: naming the ufunc whose comparison failed, chained to
                the underlying exception.
        """
        failure_template = "Test in '%s' function was failed."

        # Randomly generated input: 0-99 rows, two integer columns in [-100, 100).
        row_count = np.random.randint(100)
        pandas_frame = pd.DataFrame(
            np.random.randint(-100, 100, size=(row_count, 2)),
            columns=['a', 'b'],
        )
        koalas_frame = ks.from_pandas(pandas_frame)

        skipped = {
            # Not currently supported by Koalas.
            "conj",
            "conjugate",
            "isnat",
            "matmul",
            "frexp",

            # Values are close enough, but the comparison still failed.
            "arccos",
            "exp",
            "expm1",
            "log",  # flaky
            "log10",  # flaky
            "log1p",  # flaky
            "modf",
            "floor_divide",  # flaky

            # Results seem inconsistent in a different version of, I (Hyukjin) suspect,
            # PyArrow. From PyArrow 0.15 it seems to return the correct results via
            # PySpark, so this can probably be enabled once Koalas switches to
            # PyArrow 0.15 completely.
            "left_shift",
        }

        # Unary ufuncs on a single column.
        for ufunc_name in unary_np_spark_mappings:
            ufunc = getattr(np, ufunc_name)
            if ufunc_name in skipped:
                continue
            try:
                self.assert_eq(ufunc(pandas_frame.a), ufunc(koalas_frame.a), almost=True)
            except Exception as err:
                raise AssertionError(failure_template % ufunc_name) from err

        # Binary ufuncs: column against column, then column against a scalar.
        for ufunc_name in binary_np_spark_mappings:
            ufunc = getattr(np, ufunc_name)
            if ufunc_name in skipped:
                continue
            try:
                self.assert_eq(
                    ufunc(pandas_frame.a, pandas_frame.b),
                    ufunc(koalas_frame.a, koalas_frame.b),
                    almost=True,
                )
                self.assert_eq(
                    ufunc(pandas_frame.a, 1),
                    ufunc(koalas_frame.a, 1),
                    almost=True,
                )
            except Exception as err:
                raise AssertionError(failure_template % ufunc_name) from err