def test_rolling1(self):
    # size 3 without unroll
    def test_impl(n):
        df = pd.DataFrame({'A': np.arange(n), 'B': np.random.ranf(n)})
        Ac = df.A.rolling(3).sum()
        return Ac.sum()
    hpat_func = hpat.jit(test_impl)
    n = 121
    self.assertEqual(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
    # size 7 with unroll
    def test_impl_2(n):
        df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.random.ranf(n)})
        Ac = df.A.rolling(7).sum()
        return Ac.sum()
    hpat_func = hpat.jit(test_impl_2)
    n = 121
    self.assertEqual(hpat_func(n), test_impl_2(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_series_fusion2(self):
    # make sure getting data var avoids incorrect single def assumption
    def test_impl(A, B):
        S = B + 2
        if A[0] == 0:
            S = A + 1
        return S + B
    n = 11
    A = pd.Series(np.arange(n))
    B = pd.Series(np.arange(n)**2)
    hpat_func = hpat.jit(test_impl)
    pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B))
    self.assertEqual(count_parfor_OneDs(), 3)
def test_concat_series(self):
    def test_impl(n):
        df1 = pd.DataFrame({'key1': np.arange(n), 'A': np.arange(n) + 1.0})
        df2 = pd.DataFrame({'key2': n - np.arange(n), 'A': n + np.arange(n) + 1.0})
        A3 = pd.concat([df1.A, df2.A])
        return A3.sum()
    hpat_func = hpat.jit(test_impl)
    n = 11
    self.assertEqual(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
    n = 11111
    self.assertEqual(hpat_func(n), test_impl(n))
def test_cumsum(self):
    def test_impl(n):
        df = pd.DataFrame({'A': np.ones(n), 'B': np.random.ranf(n)})
        Ac = df.A.cumsum()
        return Ac.sum()
    hpat_func = hpat.jit(test_impl)
    n = 11
    self.assertEqual(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_array_OneDs(), 2)
    self.assertEqual(count_parfor_REPs(), 0)
    self.assertEqual(count_parfor_OneDs(), 2)
    self.assertTrue(dist_IR_contains('dist_cumsum'))
def test_join1(self):
    def test_impl(n):
        df1 = pd.DataFrame({'key1': np.arange(n) + 3, 'A': np.arange(n) + 1.0})
        df2 = pd.DataFrame({'key2': 2 * np.arange(n) + 1, 'B': n + np.arange(n) + 1.0})
        df3 = pd.merge(df1, df2, left_on='key1', right_on='key2')
        return df3.B.sum()
    hpat_func = hpat.jit(test_impl)
    n = 11
    self.assertEqual(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
    n = 11111
    self.assertEqual(hpat_func(n), test_impl(n))
def test_shift1(self):
    def test_impl(n):
        df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.random.ranf(n)})
        Ac = df.A.shift(1)
        return Ac.sum()
    hpat_func = hpat.jit(test_impl)
    n = 11
    self.assertEqual(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_df_describe(self):
    def test_impl(n):
        df = pd.DataFrame({'A': np.arange(0, n, 1, np.float32),
                           'B': np.arange(n)})
        #df.A[0:1] = np.nan
        return df.describe()
    hpat_func = hpat.jit(test_impl)
    n = 1001
    hpat_func(n)
    # XXX: test actual output
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_str_split_parallel(self):
    def test_impl(df):
        B = df.A.str.split(',')
        return B
    n = 5
    start, end = get_start_end(n)
    A = ['AB,CC', 'C,ABB,D', 'CAD', 'CA,D', 'AA,,D']
    df = pd.DataFrame({'A': A[start:end]})
    hpat_func = hpat.jit(distributed={'df', 'B'})(test_impl)
    pd.testing.assert_series_equal(
        hpat_func(df), test_impl(df), check_names=False)
    self.assertEqual(count_array_REPs(), 3)
    self.assertEqual(count_parfor_REPs(), 0)
def test_df_input_dist1(self):
    def test_impl(df):
        return df.B.sum()
    A = [3, 4, 5, 6, 1]
    B = [5, 6, 2, 1, 3]
    n = 5
    start, end = get_start_end(n)
    df = pd.DataFrame({'A': A, 'B': B})
    df_h = pd.DataFrame({'A': A[start:end], 'B': B[start:end]})
    hpat_func = hpat.jit(distributed={'df'})(test_impl)
    np.testing.assert_almost_equal(hpat_func(df_h), test_impl(df))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_str_replace_regex_parallel(self):
    def test_impl(df):
        B = df.A.str.replace('AB*', 'EE', regex=True)
        return B
    n = 5
    A = ['ABCC', 'CABBD', 'CCD', 'CCDAABB', 'ED']
    start, end = get_start_end(n)
    df = pd.DataFrame({'A': A[start:end]})
    hpat_func = hpat.jit(distributed={'df', 'B'})(test_impl)
    pd.testing.assert_series_equal(
        hpat_func(df), test_impl(df), check_names=False)
    self.assertEqual(count_array_REPs(), 3)
    self.assertEqual(count_parfor_REPs(), 0)
def test_fixed_parallel_apply1(self):
    def test_impl(n, w, center):
        df = pd.DataFrame({'B': np.arange(n)})
        R = df.rolling(w, center=center).apply(lambda a: a.sum())
        return R.B.sum()
    hpat_func = hpat.jit(test_impl)
    sizes = (1, 2, 10, 11, 121, 1000)
    wins = (2, 4, 5, 10, 11)
    centers = (False, True)
    for args in itertools.product(sizes, wins, centers):
        self.assertEqual(hpat_func(*args), test_impl(*args),
                         "rolling fixed window with {}".format(args))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_set_column1(self):
    # set existing column
    def test_impl(n):
        df = pd.DataFrame({'A': np.ones(n, np.int64), 'B': np.random.ranf(n)})
        df['A'] = np.arange(n)
        return df.A.sum()
    hpat_func = hpat.jit(test_impl)
    n = 11
    self.assertEqual(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
    self.assertEqual(count_parfor_OneDs(), 1)
def test_write_csv_parallel1(self):
    def test_impl(n, fname):
        df = pd.DataFrame({'A': np.arange(n)})
        df.to_csv(fname)
    hpat_func = hpat.jit(test_impl)
    n = 111
    hp_fname = 'test_write_csv1_hpat_par.csv'
    pd_fname = 'test_write_csv1_pd_par.csv'
    hpat_func(n, hp_fname)
    test_impl(n, pd_fname)
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
    # TODO: delete files
    if get_rank() == 0:
        pd.testing.assert_frame_equal(
            pd.read_csv(hp_fname), pd.read_csv(pd_fname))
def test_column_distribution(self):
    # make sure all column calls are distributed
    def test_impl(n):
        df = pd.DataFrame({'A': np.ones(n), 'B': np.random.ranf(n)})
        df.A.fillna(5.0, inplace=True)
        DF = df.A.fillna(5.0)
        s = DF.sum()
        m = df.A.mean()
        v = df.A.var()
        t = df.A.std()
        Ac = df.A.cumsum()
        return Ac.sum() + s + m + v + t
    hpat_func = hpat.jit(test_impl)
    n = 11
    self.assertEqual(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
    self.assertTrue(dist_IR_contains('dist_cumsum'))
def test_variable_apply_parallel1(self):
    wins = ('1s', '2s', '3s', '4s')
    # XXX: Pandas returns time = [np.nan] for size==1 for some reason
    sizes = (2, 10, 11, 121, 1000)
    for w in wins:
        func_text = "def test_impl(n):\n"
        func_text += "  df = pd.DataFrame({'B': np.arange(n), 'time': "
        func_text += "    pd.DatetimeIndex(np.arange(n) * 1000000000)})\n"
        func_text += "  res = df.rolling('{}', on='time').apply(lambda a: a.sum())\n".format(w)
        func_text += "  return res.B.sum()\n"
        loc_vars = {}
        exec(func_text, {'pd': pd, 'np': np}, loc_vars)
        test_impl = loc_vars['test_impl']
        hpat_func = hpat.jit(test_impl)
        for n in sizes:
            np.testing.assert_almost_equal(hpat_func(n), test_impl(n))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_reduce(self):
    dtypes = ['float32', 'float64', 'int32', 'int64']
    funcs = ['sum', 'prod', 'min', 'max', 'argmin', 'argmax']
    for (dtype, func) in itertools.product(dtypes, funcs):
        # loc allreduce doesn't support int64
        if dtype == 'int64' and func in ['argmin', 'argmax']:
            continue
        func_text = """def f(n):
    A = np.ones(n, dtype=np.{})
    return A.{}()
""".format(dtype, func)
        loc_vars = {}
        exec(func_text, {'np': np}, loc_vars)
        test_impl = loc_vars['f']
        hpat_func = hpat.jit(test_impl)
        n = 128
        np.testing.assert_almost_equal(hpat_func(n), test_impl(n))
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0)
def test_reduce2(self):
    import sys
    dtypes = ['float32', 'float64', 'int32', 'int64']
    funcs = ['sum', 'prod', 'min', 'max', 'argmin', 'argmax']
    for (dtype, func) in itertools.product(dtypes, funcs):
        # loc allreduce doesn't support int64 on windows
        if (sys.platform.startswith('win')
                and dtype == 'int64'
                and func in ['argmin', 'argmax']):
            continue
        func_text = """def f(n):
    A = np.arange(0, n, 1, np.{})
    return A.{}()
""".format(dtype, func)
        loc_vars = {}
        exec(func_text, {'np': np}, loc_vars)
        test_impl = loc_vars['f']
        hpat_func = hpat.jit(test_impl)
        n = 21  # XXX arange() on float32 has overflow issues on large n
        np.testing.assert_almost_equal(hpat_func(n), test_impl(n))
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0)
def test_join_datetime_parallel1(self):
    def test_impl(df1, df2):
        df3 = pd.merge(df1, df2, on='time')
        return (df3.A.sum(), df3.time.max(), df3.B.sum())
    hpat_func = hpat.jit(distributed=['df1', 'df2'])(test_impl)
    df1 = pd.DataFrame({'time': pd.DatetimeIndex(
        ['2017-01-03', '2017-01-06', '2017-02-21']), 'B': [4, 5, 6]})
    df2 = pd.DataFrame({'time': pd.DatetimeIndex(
        ['2017-01-01', '2017-01-06', '2017-01-03']), 'A': [7, 8, 9]})
    start1, end1 = get_start_end(len(df1))
    start2, end2 = get_start_end(len(df2))
    self.assertEqual(
        hpat_func(df1.iloc[start1:end1], df2.iloc[start2:end2]),
        test_impl(df1, df2))
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)