def test_np_spark_compat_frame(self):
    """Applying NumPy ufuncs to a Koalas DataFrame should match pandas.

    Covers unary ufuncs, binary ufuncs (frame-vs-frame and frame-vs-scalar),
    and a small sample of binary ufuncs across two different frames with the
    'compute.ops_on_diff_frames' option enabled.
    """
    # Use randomly generated DataFrames; the second mirrors the first's shape.
    pdf = pd.DataFrame(
        np.random.randint(-100, 100, size=(np.random.randint(100), 2)),
        columns=["a", "b"],
    )
    pdf2 = pd.DataFrame(
        np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))),
        columns=["a", "b"],
    )
    kdf = ks.from_pandas(pdf)
    kdf2 = ks.from_pandas(pdf2)

    for name in unary_np_spark_mappings:
        if name in self.blacklist:
            continue
        ufunc = getattr(np, name)
        try:
            # unary ufunc
            self.assert_eq(ufunc(pdf), ufunc(kdf), almost=True)
        except Exception as e:
            raise AssertionError("Test in '%s' function was failed." % name) from e

    for name in binary_np_spark_mappings:
        if name in self.blacklist:
            continue
        ufunc = getattr(np, name)
        try:
            # binary ufunc
            self.assert_eq(ufunc(pdf, pdf), ufunc(kdf, kdf), almost=True)
            self.assert_eq(ufunc(pdf, 1), ufunc(kdf, 1), almost=True)
        except Exception as e:
            raise AssertionError("Test in '%s' function was failed." % name) from e

    # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time.
    try:
        set_option("compute.ops_on_diff_frames", True)
        for name, _ in list(binary_np_spark_mappings.items())[:5]:
            if name in self.blacklist:
                continue
            ufunc = getattr(np, name)
            try:
                # binary ufunc across two different frames; sort to compare deterministically
                self.assert_eq(
                    ufunc(pdf, pdf2).sort_index(),
                    ufunc(kdf, kdf2).sort_index(),
                    almost=True,
                )
            except Exception as e:
                raise AssertionError(
                    "Test in '%s' function was failed." % name) from e
    finally:
        # Always restore the option, even if an assertion above failed.
        reset_option("compute.ops_on_diff_frames")
def test_np_spark_compat(self):
    """Applying NumPy ufuncs to a Koalas Series should match pandas.

    Uses a locally defined blacklist of ufuncs that are unsupported or flaky.
    """
    # Use a randomly generated DataFrame and compare Series-level results.
    pdf = pd.DataFrame(
        np.random.randint(-100, 100, size=(np.random.randint(100), 2)),
        columns=['a', 'b'])
    kdf = ks.from_pandas(pdf)

    blacklist = [
        # Koalas does not currently support
        "conj",
        "conjugate",
        "isnat",
        "matmul",
        "frexp",

        # Values are close enough but tests failed.
        "arccos",
        "exp",
        "expm1",
        "log",  # flaky
        "log10",  # flaky
        "log1p",  # flaky
        "modf",
        "floor_divide",  # flaky

        # Results seem inconsistent in a different version of, I (Hyukjin) suspect, PyArrow.
        # From PyArrow 0.15, seems it returns the correct results via PySpark. Probably we
        # can enable it later when Koalas switches to PyArrow 0.15 completely.
        "left_shift",
    ]

    for name in unary_np_spark_mappings:
        if name in blacklist:
            continue
        ufunc = getattr(np, name)
        try:
            # unary ufunc
            self.assert_eq(ufunc(pdf.a), ufunc(kdf.a), almost=True)
        except Exception as e:
            raise AssertionError("Test in '%s' function was failed." % name) from e

    for name in binary_np_spark_mappings:
        if name in blacklist:
            continue
        ufunc = getattr(np, name)
        try:
            # binary ufunc: Series-vs-Series and Series-vs-scalar
            self.assert_eq(
                ufunc(pdf.a, pdf.b), ufunc(kdf.a, kdf.b), almost=True)
            self.assert_eq(
                ufunc(pdf.a, 1), ufunc(kdf.a, 1), almost=True)
        except Exception as e:
            raise AssertionError("Test in '%s' function was failed." % name) from e