Beispiel #1
0
    def test_agg_multikey_parallel(self):
        '''Compare a multi-key groupby sum on distributed inputs with pandas.

        The jitted function is called with the ``[start:end]`` slice of each
        column (bounds come from ``get_start_end``), the plain Python
        function with the full columns; both scalar results must be equal.
        '''
        def test_impl(in_A, in_B, in_C):
            df = pd.DataFrame({'A': in_A, 'B': in_B, 'C': in_C})
            A = df.groupby(['A', 'C'])['B'].sum()
            return A.sum()

        # all three input arrays are declared distributed
        input_dists = {
            'in_A:input': 'distributed',
            'in_B:input': 'distributed',
            'in_C:input': 'distributed'
        }
        hpat_func = hpat.jit(locals=input_dists)(test_impl)

        df = pd.DataFrame({
            'A': [2, 1, 1, 1, 2, 2, 1],
            'B': [-8, 2, 3, 1, 5, 6, 7],
            'C': [3, 5, 6, 5, 4, 4, 3]
        })
        # full columns feed the reference, their slices feed the jitted code
        full_cols = (df.A.values, df.B.values, df.C.values)
        start, end = get_start_end(len(df))
        chunk_cols = tuple(col[start:end] for col in full_cols)

        h_res = hpat_func(*chunk_cols)
        p_res = test_impl(*full_cols)
        self.assertEqual(h_res, p_res)
Beispiel #2
0
    def test_reduce_filter1(self):
        '''Check reductions applied to a filtered, distributed array.

        For every dtype/reduction pair, a function ``f(A)`` that keeps the
        elements greater than 5 and then calls the reduction is generated
        with ``exec``. The jitted version runs on the ``[start:end]`` slice
        of the data and must agree with the plain version run on the whole
        array to 3 decimals. Both REP counters are asserted to be 0
        afterwards.
        '''
        import sys
        # the platform cannot change mid-test, so evaluate the check once
        on_windows = sys.platform.startswith('win')
        dtypes = ['float32', 'float64', 'int32', 'int64']
        funcs = ['sum', 'prod', 'min', 'max', 'argmin', 'argmax']
        for (dtype, func) in itertools.product(dtypes, funcs):
            # loc allreduce doesn't support int64 on windows
            unsupported = (on_windows
                           and dtype == 'int64'
                           and func in ['argmin', 'argmax'])
            if unsupported:
                continue
            source = """def f(A):
                A = A[A>5]
                return A.{}()
            """.format(func)
            namespace = {}
            exec(source, {'np': np}, namespace)
            test_impl = namespace['f']

            hpat_func = hpat.jit(locals={'A:input': 'distributed'})(test_impl)
            n = 21
            start, end = get_start_end(n)
            # reseed so every combination sees the same random data
            np.random.seed(0)
            A = np.random.randint(0, 10, n).astype(dtype)
            actual = hpat_func(A[start:end])
            expected = test_impl(A)
            np.testing.assert_almost_equal(
                actual, expected, decimal=3,
                err_msg="{} on {}".format(func, dtype))
            self.assertEqual(count_array_REPs(), 0)
            self.assertEqual(count_parfor_REPs(), 0)
Beispiel #3
0
    def test_series_head_index_parallel1(self):
        '''Check head(3) on a distributed Series that has a string index.'''
        def test_impl(S):
            return S.head(3)

        hpat_func = hpat.jit(distributed={'S'})(test_impl)

        values = [6, 9, 2, 3, 6, 4, 5]
        labels = ['a', 'ab', 'abc', 'c', 'f', 'hh', '']
        S = pd.Series(values, labels)
        start, end = get_start_end(len(S))
        # jitted code gets the [start:end] slice, the reference the whole Series
        actual = hpat_func(S[start:end])
        expected = test_impl(S)
        pd.testing.assert_series_equal(actual, expected)
        self.assertTrue(count_array_OneDs() > 0)
Beispiel #4
0
    def test_series_dropna_str_parallel1(self):
        '''Check dropna on a distributed string Series.

        The result is reduced to a scalar (the number of elements equal to
        'gg') so the jitted and plain results can be compared directly.
        '''
        def test_impl(A):
            B = A.dropna()
            return (B == 'gg').sum()

        hpat_func = hpat.jit(distributed=['A'])(test_impl)

        S1 = pd.Series(['aa', 'b', None, 'ccc', 'dd', 'gg'])
        start, end = get_start_end(len(S1))
        # TODO: gatherv
        actual = hpat_func(S1[start:end])
        expected = test_impl(S1)
        self.assertEqual(actual, expected)
Beispiel #5
0
    def test_series_head_index_parallel1(self):
        '''Check head(3) on a distributed Series that has an integer index.'''
        def test_impl(S):
            return S.head(3)

        hpat_func = hpat.jit(distributed={'S'})(test_impl)

        values = [6, 9, 2, 3, 6, 4, 5]
        labels = [8, 1, 6, 0, 9, 1, 3]
        S = pd.Series(values, labels)
        start, end = get_start_end(len(S))
        # jitted code gets the [start:end] slice, the reference the whole Series
        actual = hpat_func(S[start:end])
        expected = test_impl(S)
        pd.testing.assert_series_equal(actual, expected)
        self.assertTrue(count_array_OneDs() > 0)
Beispiel #6
0
    def test_series_dist_input1(self):
        '''Check max() of a Series passed in as a distributed argument.

        Afterwards both REP counters are asserted to be 0.
        '''
        def test_impl(S):
            return S.max()

        hpat_func = hpat.jit(distributed={'S'})(test_impl)

        n = 111
        start, end = get_start_end(n)
        S = pd.Series(np.arange(n))
        actual = hpat_func(S[start:end])
        expected = test_impl(S)
        self.assertEqual(actual, expected)
        for rep_count in (count_array_REPs(), count_parfor_REPs()):
            self.assertEqual(rep_count, 0)
Beispiel #7
0
    def test_series_tuple_input_dist1(self):
        '''Check a tuple argument that is declared distributed.

        The tuple mixes two Series with a plain integer; only the Series
        are sliced for the jitted call, and the max of the first Series
        must match the result on the unsliced tuple.
        '''
        def test_impl(s_tup):
            return s_tup[0].max()

        hpat_func = hpat.jit(locals={'s_tup:input': 'distributed'})(test_impl)

        n = 111
        start, end = get_start_end(n)
        S = pd.Series(np.arange(n))
        S2 = pd.Series(np.arange(n)+1.0)
        s_tup = (S, 1, S2)
        h_s_tup = (S[start:end], 1, S2[start:end])
        actual = hpat_func(h_s_tup)
        expected = test_impl(s_tup)
        self.assertEqual(actual, expected)
Beispiel #8
0
    def test_h5_filter(self):
        '''Check a boolean-mask read from an HDF5 dataset with distributed output.

        ``test_impl`` reads dataset 'test' from "h5_test_filter.h5", keeping
        the first-axis entries whose position in ``range(11)`` is a multiple
        of 3. The jitted result is compared with the ``[start:end]`` slice
        of the plain result.
        '''
        def test_impl():
            f = h5py.File("h5_test_filter.h5", "r")
            b = np.arange(11) % 3 == 0
            X = f['test'][b, :, :, :]
            f.close()
            return X

        hpat_func = hpat.jit(locals={'X:return': 'distributed'})(test_impl)

        n = 4  # len(test_impl())
        start, end = get_start_end(n)
        actual = hpat_func()
        expected = test_impl()[start:end]
        np.testing.assert_allclose(actual, expected)
Beispiel #9
0
    def test_series_dist_input3(self):
        '''Verify distribution of a Series with string index.

        The Series holds ``0..n-1`` with labels 'abc0', 'abc1', ...; max()
        of the jitted function on the ``[start:end]`` slice must equal max()
        of the plain function on the whole Series. Afterwards both REP
        counters are asserted to be 0.
        '''
        def test_impl(S):
            return S.max()
        hpat_func = hpat.jit(distributed={'S'})(test_impl)

        n = 111
        # loop variable is named `i` so the builtin `id` is not shadowed
        S = pd.Series(np.arange(n), ['abc{}'.format(i) for i in range(n)])
        start, end = get_start_end(n)
        self.assertEqual(hpat_func(S[start:end]), test_impl(S))
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0)
Beispiel #10
0
    def test_join_datetime_parallel1(self):
        '''Check a merge on a datetime column with both frames distributed.

        The merged frame is reduced to a tuple (sum of A, max of time,
        sum of B) so the jitted and plain results can be compared with
        assertEqual. Afterwards both REP counters are asserted to be 0.
        '''
        def test_impl(df1, df2):
            df3 = pd.merge(df1, df2, on='time')
            return (df3.A.sum(), df3.time.max(), df3.B.sum())

        hpat_func = hpat.jit(distributed=['df1', 'df2'])(test_impl)

        times1 = pd.DatetimeIndex(['2017-01-03', '2017-01-06', '2017-02-21'])
        times2 = pd.DatetimeIndex(['2017-01-01', '2017-01-06', '2017-01-03'])
        df1 = pd.DataFrame({'time': times1, 'B': [4, 5, 6]})
        df2 = pd.DataFrame({'time': times2, 'A': [7, 8, 9]})

        # each frame is sliced with bounds computed from its own length
        start1, end1 = get_start_end(len(df1))
        start2, end2 = get_start_end(len(df2))
        actual = hpat_func(df1.iloc[start1:end1], df2.iloc[start2:end2])
        expected = test_impl(df1, df2)
        self.assertEqual(actual, expected)
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0)
Beispiel #11
0
    def test_series_head_parallel1(self):
        '''Verifies head method for distributed Series with string data and no index'''
        def test_impl(S):
            return S.head(7)

        hpat_func = hpat.jit(distributed={'S'})(test_impl)

        base = ['a', 'ab', 'abc', 'c', 'f', 'hh', '']
        # Several lengths are exercised: the size of head's output is fixed,
        # and the implementation depends on how the amount of data per
        # processor relates to that output size.
        for repeat in range(1, 5):
            S = pd.Series(base * repeat)
            start, end = get_start_end(len(S))
            actual = hpat_func(S[start:end])
            expected = test_impl(S)
            pd.testing.assert_series_equal(actual, expected)
            self.assertTrue(count_array_OneDs() > 0)
Beispiel #12
0
    def test_df_input_dist1(self):
        '''Check the sum of a column of a DataFrame passed in as distributed.

        The jitted function receives a frame built from the ``[start:end]``
        slice of each column, the plain function the full frame; the two
        sums must be almost equal. Afterwards both REP counters are
        asserted to be 0.
        '''
        def test_impl(df):
            return df.B.sum()

        A = [3, 4, 5, 6, 1]
        B = [5, 6, 2, 1, 3]
        # length comes from the data; the former `n = 121` was overwritten
        # before it was ever read
        n = len(A)
        start, end = get_start_end(n)
        df = pd.DataFrame({'A': A, 'B': B})
        df_h = pd.DataFrame({'A': A[start:end], 'B': B[start:end]})
        hpat_func = hpat.jit(distributed={'df'})(test_impl)
        np.testing.assert_almost_equal(hpat_func(df_h), test_impl(df))
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0)
Beispiel #13
0
    def test_str_split_parallel(self):
        '''Check str.split(',') with distributed input frame and output.

        Both the argument ``df`` and the returned ``B`` are declared
        distributed, so the jitted and plain functions are run on the same
        ``[start:end]`` slice of the data and compared (names ignored).
        '''
        def test_impl(df):
            B = df.A.str.split(',')
            return B

        hpat_func = hpat.jit(distributed={'df', 'B'})(test_impl)

        A = ['AB,CC', 'C,ABB,D', 'CAD', 'CA,D', 'AA,,D']
        start, end = get_start_end(len(A))
        df = pd.DataFrame({'A': A[start:end]})
        actual = hpat_func(df)
        expected = test_impl(df)
        pd.testing.assert_series_equal(actual, expected, check_names=False)
        self.assertEqual(count_array_REPs(), 3)
        self.assertEqual(count_parfor_REPs(), 0)
Beispiel #14
0
    def test_str_replace_regex_parallel(self):
        '''Check regex str.replace with distributed input frame and output.

        Both the argument ``df`` and the returned ``B`` are declared
        distributed, so the jitted and plain functions are run on the same
        ``[start:end]`` slice of the data and compared (names ignored).
        '''
        def test_impl(df):
            B = df.A.str.replace('AB*', 'EE', regex=True)
            return B

        hpat_func = hpat.jit(distributed={'df', 'B'})(test_impl)

        A = ['ABCC', 'CABBD', 'CCD', 'CCDAABB', 'ED']
        start, end = get_start_end(len(A))
        df = pd.DataFrame({'A': A[start:end]})
        actual = hpat_func(df)
        expected = test_impl(df)
        pd.testing.assert_series_equal(actual, expected, check_names=False)
        self.assertEqual(count_array_REPs(), 3)
        self.assertEqual(count_parfor_REPs(), 0)
Beispiel #15
0
    def test_join_left_parallel1(self):
        """Check a two-key merge where only the first frame is distributed.

        The columns of ``df1`` are declared distributed and passed as
        ``[start:end]`` slices; the columns of ``df2`` are passed whole.
        The merged frame is reduced to ``C.sum() + D.sum()`` and compared
        with the plain pandas result, and ``count_array_OneDs()`` must
        report 3.

        NOTE(review): the merge is called without ``how``, so pandas'
        default join type is used despite "left" in the test name - confirm
        this is intended.
        """
        def test_impl(A1, B1, C1, A2, B2, D2):
            df1 = pd.DataFrame({'A': A1, 'B': B1, 'C': C1})
            df2 = pd.DataFrame({'A': A2, 'B': B2, 'D': D2})
            df3 = df1.merge(df2, on=('A', 'B'))
            return df3.C.sum() + df3.D.sum()

        input_dists = {
            'A1:input': 'distributed',
            'B1:input': 'distributed',
            'C1:input': 'distributed',
        }
        hpat_func = hpat.jit(locals=input_dists)(test_impl)

        df1 = pd.DataFrame({
            'A': [3, 1, 1, 3, 4],
            'B': [1, 2, 3, 2, 3],
            'C': [7, 8, 9, 4, 5]
        })
        df2 = pd.DataFrame({
            'A': [2, 1, 4, 4, 3],
            'B': [1, 3, 2, 3, 2],
            'D': [1, 2, 3, 4, 8]
        })

        left_cols = (df1.A.values, df1.B.values, df1.C.values)
        right_cols = (df2.A.values, df2.B.values, df2.D.values)
        start, end = get_start_end(len(df1))
        # only the left-hand columns are sliced for the jitted call
        left_chunks = tuple(col[start:end] for col in left_cols)

        h_res = hpat_func(*left_chunks, *right_cols)
        p_res = test_impl(*left_cols, *right_cols)
        self.assertEqual(h_res, p_res)
        self.assertEqual(count_array_OneDs(), 3)
Beispiel #16
0
    def test_var_dist1(self):
        def test_impl(A, B):
            df = pd.DataFrame({'A': A, 'B': B})
            df2 = df.groupby('A', as_index=False)['B'].sum()
            # TODO: fix handling of df setitem to force match of array dists
            # probably with a new node that is appended to the end of basic block
            # df2['C'] = np.full(len(df2.B), 3, np.int8)
            # TODO: full_like for Series
            df2['C'] = np.full_like(df2.B.values, 3, np.int8)
            return df2

        A = np.array([1, 1, 2, 3])
        B = np.array([3, 4, 5, 6])
        hpat_func = hpat.jit(
            locals={
                'A:input': 'distributed',
                'B:input': 'distributed',
                'df2:return': 'distributed'
            })(test_impl)
        start, end = get_start_end(len(A))
        df2 = hpat_func(A[start:end], B[start:end])