Esempi in Python per DataFrame, esempi in Python per pandas_weld.DataFrame

Esempio n. 1

0

Mostra file

    def test_join_multiindex(self):
        """Merge two frames that are both indexed by a three-level MultiIndex.

        The expected frame holds only the index entries that occur in both
        inputs, with the ``col1`` and ``col2`` values found at those entries.
        """
        left_levels = [np.array([1, 2]), np.array([3, 4]), np.array([5, 6])]
        left = pdw.DataFrame(
            {'col1': np.arange(8)},
            pdw.MultiIndex.from_product(left_levels, ['i1', 'i2', 'i3']))

        right_levels = [
            np.array([1, 2, 3]),
            np.array([3, 5]),
            np.array([5, 6]),
        ]
        right = pdw.DataFrame(
            {'col2': np.arange(12)},
            pdw.MultiIndex.from_product(right_levels, ['i1', 'i2', 'i3']))

        result = left.merge(right)

        # On level 'i2' only the value 3 is shared, hence four expected rows.
        expected_levels = [np.array([1, 2]), np.array([3, 4]), np.array([5, 6])]
        expected_labels = [
            np.array([0, 0, 1, 1]),
            np.array([0, 0, 0, 0]),
            np.array([0, 1, 0, 1]),
        ]
        expected_result = pdw.DataFrame(
            {
                'col1': np.array([0, 1, 4, 5]),
                'col2': np.array([0, 1, 4, 5]),
            },
            pdw.MultiIndex(expected_levels, expected_labels,
                           ['i1', 'i2', 'i3']))

        test_equal_multiindex(expected_result.index, result.index)
        for column in ('col1', 'col2'):
            test_equal_series(expected_result[column], result[column])

Esempio n. 2

0

Mostra file

    def test_groupby_multiple_columns_sum(self):
        """Group by two columns and sum the remaining column.

        Row order of the result is not assumed: index levels, index labels
        and the summed values are each sorted before being compared.
        """
        df = pdw.DataFrame(
            {
                'col1': np.array([1, 1, 2, 3, 3], dtype=np.int32),
                'col2': np.array([3, 4, 5, 5, 6], dtype=np.int64),
                'col3': np.array([5., 6., 7., 7., 7.], dtype=np.float32)
            },
            pdw.MultiIndex([
                np.array([1, 2, 3], dtype=np.int32),
                np.array([5., 6., 7.], dtype=np.float32)
            ], [
                np.array([0, 0, 1, 2, 2], dtype=np.int64),
                np.array([0, 1, 2, 2, 2], dtype=np.int64)
            ], ['i32', 'f32']))

        result = df.groupby(['col1', 'col3']).sum()

        expected_result = pdw.DataFrame(
            {'col2': np.array([3, 4, 5, 11], dtype=np.int64)},
            pdw.MultiIndex([
                np.array([1, 2, 3], dtype=np.int32),
                np.array([5., 6., 7.], dtype=np.float32)
            ], [
                np.array([0, 0, 1, 2], dtype=np.int64),
                np.array([0, 1, 2, 2], dtype=np.int64)
            ], ['col1', 'col3']))

        # TODO: test equal 1d index method (both rangeindex and index should work)
        # assume correct index values but in different order; just check the values
        levels_result = [
            np.sort(level.evaluate()) for level in result.index.levels
        ]
        labels_result = [
            np.sort(label.evaluate()) for label in result.index.labels
        ]
        levels_expected = [
            np.sort(level) for level in expected_result.index.levels
        ]
        labels_expected = [
            np.sort(label) for label in expected_result.index.labels
        ]
        np.testing.assert_array_equal(result.index.names,
                                      expected_result.index.names)
        # Check the counts explicitly so that an unexpected extra (or missing)
        # level/label in the result is reported instead of silently ignored.
        np.testing.assert_equal(len(levels_result), len(levels_expected))
        np.testing.assert_equal(len(labels_result), len(labels_expected))
        for actual, wanted in zip(levels_result, levels_expected):
            np.testing.assert_array_equal(actual, wanted)
        for actual, wanted in zip(labels_result, labels_expected):
            np.testing.assert_array_equal(actual, wanted)
        # assume correct values but in different order; just check the values
        np.testing.assert_array_equal(
            np.sort(expected_result['col2'].evaluate().data),
            np.sort(result['col2'].evaluate().data))

Esempio n. 3

0

Mostra file

 def setUp(self):
     """Create the shared fixture ``self.df``.

     The frame has an integer column ``col1`` and a float column ``col2``;
     its MultiIndex is built with ``from_product`` from the level values
     [1, 2] and [3, 4], named 'a' and 'b'.
     """
     level_values = [np.array([1, 2]), np.array([3, 4])]
     columns = {
         'col1': np.array([1, 2, 3, 4]),
         'col2': np.array([5., 6., 7., 8.]),
     }
     self.df = pdw.DataFrame(
         columns, pdw.MultiIndex.from_product(level_values, ['a', 'b']))

Esempio n. 4

0

Mostra file

    def test_join_1d_index(self):
        """Merge two frames that each carry a one-dimensional int64 Index.

        The expected frame holds only the index values found in both inputs
        (3 and 5) together with the column values stored at those positions.
        """
        int64 = np.dtype(np.int64)

        left_index = pdw.Index(np.array([1, 3, 4, 5, 6]), int64)
        left = pdw.DataFrame({'col1': np.array([1, 2, 3, 4, 5])}, left_index)

        right_index = pdw.Index(np.array([2, 3, 5]), int64)
        right = pdw.DataFrame({'col2': np.array([1, 2, 3])}, right_index)

        result = left.merge(right)

        expected_columns = {
            'col1': np.array([2, 4]),
            'col2': np.array([2, 3]),
        }
        expected_result = pdw.DataFrame(
            expected_columns, pdw.Index(np.array([3, 5]), int64))

        np.testing.assert_array_equal(
            evaluate_if_necessary(expected_result.index),
            evaluate_if_necessary(result.index))
        for column in ('col1', 'col2'):
            test_equal_series(expected_result[column], result[column])

Esempio n. 5

0

Mostra file

    def test_drop_list(self):
        """Drop a list of columns and expect a frame with no columns left.

        The index of the result must be unchanged by the drop.
        """
        data = {}
        index = pdw.MultiIndex.from_product(
            [np.array([1, 2]), np.array([3, 4])], ['a', 'b'])
        expected_result = pdw.DataFrame(data, index)

        result = self.df.drop(['col1', 'col2'])

        # keys() returns a view (not a list) on Python 3, which
        # assertListEqual rejects; sorting yields real lists and also makes
        # the comparison independent of key order.
        self.assertListEqual(sorted(expected_result.data.keys()),
                             sorted(result.data.keys()))

        test_equal_multiindex(expected_result.index, result.index)

Esempio n. 6

0

Mostra file

    def test_element_wise_operation(self):
        """Multiply a frame by the scalar 2 and compare against doubled columns.

        The index of the product must equal the index of the input frame.
        """
        expected_columns = {
            'col1': np.array([2, 4, 6, 8]),
            'col2': np.array([10, 12, 14, 16]),
        }
        expected_result = pdw.DataFrame(
            expected_columns,
            pdw.MultiIndex.from_product(
                [np.array([1, 2]), np.array([3, 4])], ['a', 'b']))

        source_columns = {
            'col1': np.array([1, 2, 3, 4]),
            'col2': np.array([5, 6, 7, 8]),
        }
        source_index = pdw.MultiIndex.from_product(
            [np.array([1, 2]), np.array([3, 4])], ['a', 'b'])
        result = pdw.DataFrame(source_columns, source_index) * 2

        for column in ('col1', 'col2'):
            np.testing.assert_array_equal(
                evaluate_if_necessary(expected_result[column]),
                evaluate_if_necessary(result[column]))

        test_equal_multiindex(expected_result.index, result.index)

Esempio n. 7

0

Mostra file

    def test_drop_str(self):
        """Drop a single column given by name and expect only ``col2`` to remain.

        Both the remaining column's values and the index must be unchanged.
        """
        data = {'col2': np.array([5., 6., 7., 8.])}
        index = pdw.MultiIndex.from_product(
            [np.array([1, 2]), np.array([3, 4])], ['a', 'b'])
        expected_result = pdw.DataFrame(data, index)

        result = self.df.drop('col1')

        # keys() returns a view (not a list) on Python 3, which
        # assertListEqual rejects; sorting yields real lists and also makes
        # the comparison independent of key order.
        self.assertListEqual(sorted(expected_result.data.keys()),
                             sorted(result.data.keys()))
        np.testing.assert_array_equal(
            evaluate_if_necessary(expected_result['col2']),
            evaluate_if_necessary(result['col2']))

        test_equal_multiindex(expected_result.index, result.index)

Esempio n. 8

0

Mostra file

    def test_groupby_single_column_sum(self):
        """Group by ``col1`` and sum the two remaining columns.

        Row order of the result is not assumed: the index and each summed
        column are sorted before being compared with the expected values.
        """
        source_index = pdw.MultiIndex(
            [
                np.array([1, 2, 3], dtype=np.int32),
                np.array([5., 6., 7.], dtype=np.float32),
            ],
            [
                np.array([0, 0, 1, 2, 2], dtype=np.int64),
                np.array([0, 1, 2, 2, 2], dtype=np.int64),
            ],
            ['i32', 'f32'])
        source_columns = {
            'col1': np.array([1, 1, 2, 3, 3], dtype=np.int32),
            'col2': np.array([3, 4, 5, 5, 6], dtype=np.int64),
            'col3': np.array([5., 6., 7., 7., 7.], dtype=np.float32),
        }
        df = pdw.DataFrame(source_columns, source_index)

        result = df.groupby('col1').sum()

        expected_index = pdw.Index(
            np.array([1, 2, 3], dtype=np.int32), np.dtype('int32'), 'col1')
        expected_result = pdw.DataFrame(
            {
                'col2': np.array([7, 5, 11], dtype=np.int64),
                'col3': np.array([11., 7., 14.], dtype=np.float32),
            },
            expected_index)

        # TODO: test equal 1d index method (both rangeindex and index should work)
        np.testing.assert_array_equal(
            np.sort(evaluate_if_necessary(expected_result.index)),
            np.sort(evaluate_if_necessary(result.index)))
        # Values are assumed correct but possibly in a different order, so
        # only the sorted values are compared.
        for column in ('col2', 'col3'):
            np.testing.assert_array_equal(
                np.sort(expected_result[column].evaluate().data),
                np.sort(result[column].evaluate().data))

Esempio n. 9

0

Mostra file

    def test_agg(self):
        """Aggregate the fixture frame with 'min' and 'max'.

        The expected frame is indexed by the aggregation names and holds the
        per-column results as float64.
        """
        str_dtype = np.dtype('str')
        aggregation_names = np.array(['min', 'max'], dtype=np.dtype('str'))
        expected_result = pdw.DataFrame(
            {
                'col1': np.array([1, 4], dtype=np.float64),
                'col2': np.array([5, 8], dtype=np.float64),
            },
            pdw.Index(aggregation_names, str_dtype))

        result = self.df.agg(['min', 'max'])

        np.testing.assert_array_equal(
            evaluate_if_necessary(expected_result.index),
            evaluate_if_necessary(result.index))
        for column in ('col1', 'col2'):
            test_equal_series(expected_result[column], result[column])

Esempio n. 10

0

Mostra file

    def test_getitem_series(self):
        """Index the fixture frame with the comparison ``self.df['col1'] < 3``.

        The expected frame holds two rows; its MultiIndex is given explicitly
        through levels and labels.
        """
        expected_index = pdw.MultiIndex(
            [np.array([1, 2]), np.array([3, 4])],
            [np.array([0, 0]), np.array([0, 1])],
            ['a', 'b'])
        expected_result = pdw.DataFrame(
            {'col1': np.array([1, 2]), 'col2': np.array([5., 6.])},
            expected_index)

        mask = self.df['col1'] < 3
        result = self.df[mask]

        for column in ('col1', 'col2'):
            np.testing.assert_array_equal(
                evaluate_if_necessary(expected_result[column]),
                evaluate_if_necessary(result[column]))

        test_equal_multiindex(expected_result.index, result.index)

Esempio n. 11

0

Mostra file

    def test_reset_index(self):
        """Reset the index of the fixture frame.

        The expected frame has the index levels 'a' and 'b' as regular
        columns next to 'col1' and 'col2', and a ``RangeIndex(0, 4, 1)``.
        """
        result = self.df.reset_index()

        expected_columns = {
            'col1': np.array([1, 2, 3, 4]),
            'col2': np.array([5., 6., 7., 8.]),
            'a': np.array([1, 1, 2, 2]),
            'b': np.array([3, 4, 3, 4]),
        }
        expected_result = pdw.DataFrame(expected_columns,
                                        pdw.RangeIndex(0, 4, 1))

        np.testing.assert_array_equal(
            evaluate_if_necessary(expected_result.index),
            evaluate_if_necessary(result.index))
        for column in ('col1', 'col2', 'a', 'b'):
            test_equal_series(expected_result[column], result[column])

Esempio n. 12

0

Mostra file

File: test_parsers.py Progetto: radujica/data-analysis-pipelines

    def test_read_netcdf4(self):
        """Read the file at ``ParserTests.PATH_EXT`` and compare with fixed data.

        Checks the column names, the evaluated 'tg' and 'tg_ext' columns and
        the (longitude, latitude, time) MultiIndex of the parsed frame.
        """
        data = {
            'tg':
            np.array([
                -99.99, 10., 10.099999, -99.99, -99.99, 10.2, -99.99, -99.99,
                -99.99, 10.3, 10.4, 10.5, 10.599999, 10.7, 10.8, 10.9, -99.99,
                -99.99, -99.99, -99.99, 11., 11., 11., 11., -99.99, -99.99,
                -99.99, -99.99, 12., 13.
            ],
                     dtype=np.float32),
            'tg_ext':
            np.array([
                -9999, 1000., 1010., -9999, -9999, 1020., -9999, -9999, -9999,
                1030., 10401., 10502., 10603., 10704., 10805., 10906., -9999,
                -9999, -9999, -9999, 11001., 11002., 11003., 11004., -9999,
                -9999, -9999, -9999, 12005., 13006.
            ],
                     dtype=np.float32)
        }
        index = pdw.MultiIndex.from_product([
            np.array([25.5, 26.], dtype=np.float32),
            np.array([10., 11., 12.], dtype=np.float32),
            # ISO-formatted dates for 1950-01-01 through 1950-01-05
            np.array([str(date(1950, 1, day)) for day in range(1, 6)])
        ], ['longitude', 'latitude', 'time'])
        expected_result = pdw.DataFrame(data, index)

        result = pdw.read_netcdf4(ParserTests.PATH_EXT)

        # keys() returns a view (not a list) on Python 3, which
        # assertListEqual rejects; sorting yields real lists and also makes
        # the comparison independent of key order.
        self.assertListEqual(sorted(expected_result.data.keys()),
                             sorted(result.data.keys()))
        np.testing.assert_array_equal(
            expected_result.data['tg'],
            result.data['tg'].evaluate(verbose=False))
        np.testing.assert_array_equal(
            expected_result.data['tg_ext'],
            result.data['tg_ext'].evaluate(verbose=False))

        test_equal_multiindex(expected_result.index, result.index)

Esempio n. 13

0

Mostra file

    def test_getitem_list(self):
        """Select both columns of the fixture frame through a list of names.

        The selection is expected to carry the same column values and the
        same MultiIndex as a frame built directly from those values.
        """
        expected_index = pdw.MultiIndex.from_product(
            [np.array([1, 2]), np.array([3, 4])], ['a', 'b'])
        expected_result = pdw.DataFrame(
            {
                'col1': np.array([1, 2, 3, 4]),
                'col2': np.array([5., 6., 7., 8.]),
            },
            expected_index)

        wanted_columns = ['col1', 'col2']
        result = self.df[wanted_columns]

        for column in ('col1', 'col2'):
            np.testing.assert_array_equal(
                evaluate_if_necessary(expected_result[column]),
                evaluate_if_necessary(result[column]))

        test_equal_multiindex(expected_result.index, result.index)

Esempio n. 14

0

Mostra file

    def test_describe(self):
        """Describe the fixture frame with min, max, mean and std.

        The expected frame is indexed by the aggregation names; both the
        result and the individual columns are evaluated before comparison.
        """
        # reversed because of dict and not OrderedDict
        # NOTE: both columns have the same spread, so their std must match;
        # assuming the fixture columns are [1, 2, 3, 4] and [5., 6., 7., 8.],
        # the sample std is sqrt(5/3) ~= 1.29099 for each of them.
        expected_result = pdw.DataFrame(
            {
                'col1': np.array([1, 4, 2.5, 1.29099], np.float64),
                'col2': np.array([5, 8, 6.5, 1.29099], np.float64)
            },
            # np.str was only an alias of the builtin str and has been removed
            # from NumPy, so the builtin is used directly.
            pdw.Index(np.array(['min', 'max', 'mean', 'std'], dtype=str),
                      np.dtype(str), "Index"))

        result = self.df.describe(['min', 'max', 'mean', 'std']).evaluate()

        np.testing.assert_array_equal(
            evaluate_if_necessary(expected_result.index),
            evaluate_if_necessary(result.index))
        test_equal_series(expected_result['col1'].evaluate(),
                          result['col1'].evaluate())
        test_equal_series(expected_result['col2'].evaluate(),
                          result['col2'].evaluate())