def test_groupby_multiple_columns_sum(self):
    """Group a frame by 'col1' and 'col3' and sum the remaining column.

    Everything is sorted before it is compared, so the test does not
    depend on the order in which the groups are produced.
    """
    columns = {
        'col1': np.array([1, 1, 2, 3, 3], dtype=np.int32),
        'col2': np.array([3, 4, 5, 5, 6], dtype=np.int64),
        'col3': np.array([5., 6., 7., 7., 7.], dtype=np.float32),
    }
    source_index = pdw.MultiIndex(
        [np.array([1, 2, 3], dtype=np.int32),
         np.array([5., 6., 7.], dtype=np.float32)],
        [np.array([0, 0, 1, 2, 2], dtype=np.int64),
         np.array([0, 1, 2, 2, 2], dtype=np.int64)],
        ['i32', 'f32'])
    df = pdw.DataFrame(columns, source_index)

    result = df.groupby(['col1', 'col3']).sum()

    # One row per distinct (col1, col3) pair; the grouping columns become
    # the names of the resulting index.
    expected_columns = {'col2': np.array([3, 4, 5, 11], dtype=np.int64)}
    expected_index = pdw.MultiIndex(
        [np.array([1, 2, 3], dtype=np.int32),
         np.array([5., 6., 7.], dtype=np.float32)],
        [np.array([0, 0, 1, 2], dtype=np.int64),
         np.array([0, 1, 2, 2], dtype=np.int64)],
        ['col1', 'col3'])
    expected_result = pdw.DataFrame(expected_columns, expected_index)

    # TODO: test equal 1d index method (both rangeindex and index should work)
    # The index entries may come back in a different order, so only the
    # sorted contents are compared.
    levels_result = [
        np.sort(lvl.evaluate()) for lvl in result.index.levels
    ]
    labels_result = [
        np.sort(lbl.evaluate()) for lbl in result.index.labels
    ]
    levels_expected = [
        np.sort(lvl) for lvl in expected_result.index.levels
    ]
    labels_expected = [
        np.sort(lbl) for lbl in expected_result.index.labels
    ]

    np.testing.assert_array_equal(result.index.names,
                                  expected_result.index.names)

    # Two grouping columns -> two index levels to check.
    for position in (0, 1):
        np.testing.assert_array_equal(levels_result[position],
                                      levels_expected[position])
        np.testing.assert_array_equal(labels_result[position],
                                      labels_expected[position])

    # Same reasoning for the summed column: compare sorted values only.
    wanted_sums = np.sort(expected_result['col2'].evaluate().data)
    actual_sums = np.sort(result['col2'].evaluate().data)
    np.testing.assert_array_equal(wanted_sums, actual_sums)
def test_join_multiindex(self):
    """Merge two frames whose MultiIndexes share the names i1, i2 and i3.

    The expected frame holds the four index entries that occur in both
    inputs, together with the matching 'col1' and 'col2' values.
    """
    left_index = pdw.MultiIndex.from_product(
        [np.array([1, 2]), np.array([3, 4]), np.array([5, 6])],
        ['i1', 'i2', 'i3'])
    df1 = pdw.DataFrame({'col1': np.arange(8)}, left_index)

    right_index = pdw.MultiIndex.from_product(
        [np.array([1, 2, 3]), np.array([3, 5]), np.array([5, 6])],
        ['i1', 'i2', 'i3'])
    df2 = pdw.DataFrame({'col2': np.arange(12)}, right_index)

    result = df1.merge(df2)

    expected_columns = {
        'col1': np.array([0, 1, 4, 5]),
        'col2': np.array([0, 1, 4, 5]),
    }
    expected_index = pdw.MultiIndex(
        [np.array([1, 2]), np.array([3, 4]), np.array([5, 6])],
        [np.array([0, 0, 1, 1]),
         np.array([0, 0, 0, 0]),
         np.array([0, 1, 0, 1])],
        ['i1', 'i2', 'i3'])
    expected_result = pdw.DataFrame(expected_columns, expected_index)

    test_equal_multiindex(expected_result.index, result.index)
    for column in ('col1', 'col2'):
        test_equal_series(expected_result[column], result[column])
def test_from_arrays(self):
    """Build a MultiIndex with ``from_arrays`` and compare it to one
    constructed explicitly from levels, labels and names."""
    names = ['a', 'b']
    arrays = [np.array([1, 1, 2, 2]), np.array([3, 4, 3, 4])]

    result = pdw.MultiIndex.from_arrays(arrays, names)

    expected_levels = [np.array([1, 2]), np.array([3, 4])]
    expected_labels = [np.array([0, 0, 1, 1]), np.array([0, 1, 0, 1])]
    expected_result = pdw.MultiIndex(expected_levels, expected_labels,
                                     ['a', 'b'])

    test_equal_multiindex(expected_result, result)
def test_from_product_raw(self):
    """Build a MultiIndex with ``from_product`` from plain NumPy arrays and
    compare it to one constructed explicitly."""
    names = ['a', 'b']
    levels = [np.array([1, 2]), np.array([3, 4])]

    result = pdw.MultiIndex.from_product(levels, names)

    expected_levels = [np.array([1, 2]), np.array([3, 4])]
    expected_labels = [np.array([0, 0, 1, 1]), np.array([0, 1, 0, 1])]
    expected_result = pdw.MultiIndex(expected_levels, expected_labels,
                                     ['a', 'b'])

    test_equal_multiindex(expected_result, result)
def test_getitem_series(self):
    """Select rows of ``self.df`` with the boolean condition ``col1 < 3``.

    ``self.df`` is set up outside this method; the expected frame below
    presumably mirrors the rows of that fixture which satisfy the
    condition -- verify against the fixture.
    """
    expected_index = pdw.MultiIndex(
        [np.array([1, 2]), np.array([3, 4])],
        [np.array([0, 0]), np.array([0, 1])],
        ['a', 'b'])
    expected_result = pdw.DataFrame(
        {'col1': np.array([1, 2]), 'col2': np.array([5., 6.])},
        expected_index)

    mask = self.df['col1'] < 3
    result = self.df[mask]

    # Column contents first, then the filtered index.
    for column in ('col1', 'col2'):
        np.testing.assert_array_equal(
            evaluate_if_necessary(expected_result[column]),
            evaluate_if_necessary(result[column]))

    test_equal_multiindex(expected_result.index, result.index)
def test_from_product(self):
    """Build a MultiIndex with ``from_product`` from LazyResult levels and
    compare it to one constructed explicitly from LazyResult data."""

    def lazy_long(values):
        # Wrap a list of ints the same way for every level/label array.
        return LazyResult(np.array(values), WeldLong(), 1)

    names = ['a', 'b']
    levels = [lazy_long([1, 2]), lazy_long([3, 4])]

    result = pdw.MultiIndex.from_product(levels, names)

    expected_levels = [lazy_long([1, 2]), lazy_long([3, 4])]
    expected_labels = [lazy_long([0, 0, 1, 1]), lazy_long([0, 1, 0, 1])]
    expected_result = pdw.MultiIndex(expected_levels, expected_labels,
                                     ['a', 'b'])

    test_equal_multiindex(expected_result, result)
def test_getitem_filter(self):
    """Filter a product MultiIndex with a lazy boolean mask.

    The index is the product of the levels [1, 2] and [3, 4]; the mask
    keeps the first and third entries, so the expected labels are
    [0, 1] for the first level and [0, 0] for the second.
    """
    levels = [
        LazyResult(np.array([1, 2]), WeldLong(), 1),
        LazyResult(np.array([3, 4]), WeldLong(), 1)
    ]
    names = ['a', 'b']

    # np.bool_ is used instead of the np.bool alias: the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, while np.bool_ gives
    # the same boolean dtype on every NumPy version.
    to_filter = LazyResult(
        np.array([True, False, True, False], dtype=np.bool_),
        numpy_to_weld_type(np.dtype(np.bool_)),
        1)

    result = pdw.MultiIndex.from_product(levels, names)[to_filter]

    expected_result = pdw.MultiIndex(
        [
            LazyResult(np.array([1, 2]), WeldLong(), 1),
            LazyResult(np.array([3, 4]), WeldLong(), 1)
        ],
        [
            LazyResult(np.array([0, 1]), WeldLong(), 1),
            LazyResult(np.array([0, 0]), WeldLong(), 1)
        ],
        ['a', 'b'])

    test_equal_multiindex(expected_result, result)
def test_groupby_single_column_sum(self):
    """Group a frame by 'col1' and sum the two remaining columns.

    Index and column values are sorted before they are compared, so the
    test does not depend on the order in which the groups are produced.
    """
    columns = {
        'col1': np.array([1, 1, 2, 3, 3], dtype=np.int32),
        'col2': np.array([3, 4, 5, 5, 6], dtype=np.int64),
        'col3': np.array([5., 6., 7., 7., 7.], dtype=np.float32),
    }
    source_index = pdw.MultiIndex(
        [np.array([1, 2, 3], dtype=np.int32),
         np.array([5., 6., 7.], dtype=np.float32)],
        [np.array([0, 0, 1, 2, 2], dtype=np.int64),
         np.array([0, 1, 2, 2, 2], dtype=np.int64)],
        ['i32', 'f32'])
    df = pdw.DataFrame(columns, source_index)

    result = df.groupby('col1').sum()

    # One row per distinct 'col1' value, indexed by that value.
    expected_columns = {
        'col2': np.array([7, 5, 11], dtype=np.int64),
        'col3': np.array([11., 7., 14.], dtype=np.float32),
    }
    expected_index = pdw.Index(np.array([1, 2, 3], dtype=np.int32),
                               np.dtype('int32'),
                               'col1')
    expected_result = pdw.DataFrame(expected_columns, expected_index)

    # TODO: test equal 1d index method (both rangeindex and index should work)
    wanted_index = np.sort(evaluate_if_necessary(expected_result.index))
    actual_index = np.sort(evaluate_if_necessary(result.index))
    np.testing.assert_array_equal(wanted_index, actual_index)

    # Group order is not relied upon: compare the sorted sums per column.
    for column in ('col2', 'col3'):
        np.testing.assert_array_equal(
            np.sort(expected_result[column].evaluate().data),
            np.sort(result[column].evaluate().data))