コード例 #1
0
    def test_pivot_table(self):
        index = ['A', 'B']
        columns = 'C'
        table = pivot_table(self.data,
                            values='D',
                            index=index,
                            columns=columns)

        table2 = self.data.pivot_table(values='D',
                                       index=index,
                                       columns=columns)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', index=index)

        if len(index) > 1:
            self.assertEqual(table.index.names, tuple(index))
        else:
            self.assertEqual(table.index.name, index[0])

        if len(columns) > 1:
            self.assertEqual(table.columns.names, columns)
        else:
            self.assertEqual(table.columns.name, columns[0])

        expected = self.data.groupby(index + [columns])['D'].agg(
            np.mean).unstack()
        tm.assert_frame_equal(table, expected)
コード例 #2
0
ファイル: test_pivot.py プロジェクト: wabu/pandas
    def test_pivot_datetime_tz(self):
        dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
                  '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00',
                  '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'dt1': dates1, 'dt2': dates2,
                        'value1': range(6), 'value2': [1, 2] * 3})
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1')
        exp_col1 = Index(['value1', 'value1'])
        exp_col2 = Index(['a', 'b'], name='label')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]],
                             index=exp_idx, columns=exp_col)
        result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1'])
        tm.assert_frame_equal(result, expected)


        exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean'])
        exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
        exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4,
                                    tz='Asia/Tokyo', name='dt2')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1],
                              [2, 5, 1, 2, 2, 5, 1, 2]]), index=exp_idx, columns=exp_col)

        result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'],
                             aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
コード例 #3
0
    def test_pivot_table(self):
        index = ['A', 'B']
        columns = 'C'
        table = pivot_table(self.data, values='D',
                            index=index, columns=columns)

        table2 = self.data.pivot_table(
            values='D', index=index, columns=columns)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', index=index)

        if len(index) > 1:
            self.assertEqual(table.index.names, tuple(index))
        else:
            self.assertEqual(table.index.name, index[0])

        if len(columns) > 1:
            self.assertEqual(table.columns.names, columns)
        else:
            self.assertEqual(table.columns.name, columns[0])

        expected = self.data.groupby(
            index + [columns])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)
コード例 #4
0
    def test_pivot_datetime_tz(self):
        dates1 = [
            '2011-07-19 07:00:00', '2011-07-19 08:00:00',
            '2011-07-19 09:00:00', '2011-07-19 07:00:00',
            '2011-07-19 08:00:00', '2011-07-19 09:00:00'
        ]
        dates2 = [
            '2013-01-01 15:00:00', '2013-01-01 15:00:00',
            '2013-01-01 15:00:00', '2013-02-01 15:00:00',
            '2013-02-01 15:00:00', '2013-02-01 15:00:00'
        ]
        df = DataFrame({
            'label': ['a', 'a', 'a', 'b', 'b', 'b'],
            'dt1': dates1,
            'dt2': dates2,
            'value1': np.arange(6, dtype='int64'),
            'value2': [1, 2] * 3
        })
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        exp_idx = pd.DatetimeIndex([
            '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'
        ],
                                   tz='US/Pacific',
                                   name='dt1')
        exp_col1 = Index(['value1', 'value1'])
        exp_col2 = Index(['a', 'b'], name='label')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]],
                             index=exp_idx,
                             columns=exp_col)
        result = pivot_table(df,
                             index=['dt1'],
                             columns=['label'],
                             values=['value1'])
        tm.assert_frame_equal(result, expected)

        exp_col1 = Index(
            ['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean'])
        exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
        exp_col3 = pd.DatetimeIndex(
            ['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4,
            tz='Asia/Tokyo',
            name='dt2')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(np.array(
            [[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1],
             [2, 5, 1, 2, 2, 5, 1, 2]],
            dtype='int64'),
                             index=exp_idx,
                             columns=exp_col)

        result = pivot_table(df,
                             index=['dt1'],
                             columns=['dt2'],
                             values=['value1', 'value2'],
                             aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
コード例 #5
0
    def test_pivot_dtaccessor(self):
        # GH 8103
        dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                  '2011-07-19 09:00:00',
                  '2011-07-19 07:00:00', '2011-07-19 08:00:00',
                  '2011-07-19 09:00:00']
        dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00',
                  '2013-01-01 15:00:00',
                  '2013-02-01 15:00:00', '2013-02-01 15:00:00',
                  '2013-02-01 15:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'dt1': dates1, 'dt2': dates2,
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d))

        result = pivot_table(df, index='label', columns=df['dt1'].dt.hour,
                             values='value1')

        exp_idx = Index(['a', 'b'], name='label')
        expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]},
                             index=exp_idx,
                             columns=Index([7, 8, 9], name='dt1'))
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=df['dt2'].dt.month,
                             columns=df['dt1'].dt.hour,
                             values='value1')

        expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]},
                             index=Index([1, 2], name='dt2'),
                             columns=Index([7, 8, 9], name='dt1'))
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=df['dt2'].dt.year.values,
                             columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
                             values='value1')

        exp_col = MultiIndex.from_arrays(
            [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=['dt1', 'dt2'])
        expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype='int64'),
                             index=[2013], columns=exp_col)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=np.array(['X', 'X', 'X',
                                                 'X', 'Y', 'Y']),
                             columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
                             values='value1')
        expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan],
                                       [np.nan, np.nan, np.nan,
                                        4, np.nan, 5]]),
                             index=['X', 'Y'], columns=exp_col)
        tm.assert_frame_equal(result, expected)
コード例 #6
0
 def test_pivot_multi_values(self):
     result = pivot_table(self.data,
                          values=['D', 'E'],
                          index='A',
                          columns=['B', 'C'],
                          fill_value=0)
     expected = pivot_table(self.data.drop(['F'], axis=1),
                            index='A',
                            columns=['B', 'C'],
                            fill_value=0)
     tm.assert_frame_equal(result, expected)
コード例 #7
0
ファイル: test_pivot.py プロジェクト: wabu/pandas
    def test_pivot_table_warnings(self):
        index = ['A', 'B']
        columns = 'C'
        with tm.assert_produces_warning(FutureWarning):
            table = pivot_table(self.data, values='D', rows=index,
                                cols=columns)

        with tm.assert_produces_warning(False):
            table2 = pivot_table(self.data, values='D', index=index,
                                 columns=columns)

        tm.assert_frame_equal(table, table2)
コード例 #8
0
ファイル: test_pivot.py プロジェクト: ChristopherShort/pandas
    def test_pivot_datetime_tz(self):
        dates1 = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        dates2 = [
            "2013-01-01 15:00:00",
            "2013-01-01 15:00:00",
            "2013-01-01 15:00:00",
            "2013-02-01 15:00:00",
            "2013-02-01 15:00:00",
            "2013-02-01 15:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "dt1": dates1,
                "dt2": dates2,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific"))
        df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo"))

        exp_idx = pd.DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], tz="US/Pacific", name="dt1"
        )
        exp_col1 = Index(["value1", "value1"])
        exp_col2 = Index(["a", "b"], name="label")
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col)
        result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"])
        tm.assert_frame_equal(result, expected)

        exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"])
        exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2)
        exp_col3 = pd.DatetimeIndex(["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, tz="Asia/Tokyo", name="dt2")
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(
            np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], [2, 5, 1, 2, 2, 5, 1, 2]], dtype="int64"),
            index=exp_idx,
            columns=exp_col,
        )

        result = pivot_table(df, index=["dt1"], columns=["dt2"], values=["value1", "value2"], aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
コード例 #9
0
    def test_pivot_table_warnings(self):
        index = ['A', 'B']
        columns = 'C'
        with tm.assert_produces_warning(FutureWarning):
            table = pivot_table(self.data,
                                values='D',
                                rows=index,
                                cols=columns)

        with tm.assert_produces_warning(False):
            table2 = pivot_table(self.data,
                                 values='D',
                                 index=index,
                                 columns=columns)

        tm.assert_frame_equal(table, table2)
コード例 #10
0
ファイル: test_pivot.py プロジェクト: ChristopherShort/pandas
    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=["mean", "std"], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(
            self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func, margins=True
        )
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=["mean", "std"], axis=1)
        tm.assert_frame_equal(result, expected)
コード例 #11
0
ファイル: ocean.py プロジェクト: 2LL1/orca
    def frame(self, id, cursor=None, **kwargs):
        stacks = self.stack(id, cursor, **kwargs)

        logger.debug('Start Pivot stacks')
        t1 = Timer()
        stacks['value'] = True
        result = pivot_table(stacks, 'value', 'date', 'stock', fill_value=False)
        result.columns = numpy.char.mod('%06d', result.columns)
        logger.debug('Finished pivoting in %s', t1)
        return result
コード例 #12
0
ファイル: test_pivot.py プロジェクト: perrette/pandas
    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)
コード例 #13
0
    def test_pivot_dtypes(self):

        # can convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'int64')

        z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64 = 2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'float64')

        z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64 = 2))
        tm.assert_series_equal(result, expected)
コード例 #14
0
ファイル: test_pivot.py プロジェクト: huang11404/FlaskApp
    def test_pivot_dtaccessor(self):
        # GH 8103
        dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
                  '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00',
                  '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'dt1': dates1, 'dt2': dates2,
                        'value1': np.arange(6,dtype='int64'), 'value2': [1, 2] * 3})
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d))

        result = pivot_table(df, index='label', columns=df['dt1'].dt.hour,
                             values='value1')

        exp_idx = Index(['a', 'b'], name='label')
        expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]},
                             index=exp_idx, columns=Index([7, 8, 9],name='dt1'))
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=df['dt2'].dt.month, columns=df['dt1'].dt.hour,
                             values='value1')

        expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]},
                             index=Index([1, 2],name='dt2'), columns=Index([7, 8, 9],name='dt1'))
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=df['dt2'].dt.year.values,
                             columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
                             values='value1')

        exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3],names=['dt1','dt2'])
        expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]],dtype='int64'),
                             index=[2013], columns=exp_col)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=np.array(['X', 'X', 'X', 'X', 'Y', 'Y']),
                             columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
                             values='value1')
        expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan],
                                       [np.nan, np.nan, np.nan, 4, np.nan, 5]]),
                             index=['X', 'Y'], columns=exp_col)
        tm.assert_frame_equal(result, expected)
コード例 #15
0
ファイル: test_pivot.py プロジェクト: wabu/pandas
    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     index=['A', 'B'], columns='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     index=['A', 'B'], columns='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)
コード例 #16
0
ファイル: test_pivot.py プロジェクト: ChristopherShort/pandas
    def test_pivot_dtypes(self):

        # can convert dtypes
        f = DataFrame({"a": ["cat", "bat", "cat", "bat"], "v": [1, 2, 3, 4], "i": ["a", "b", "a", "b"]})
        self.assertEqual(f.dtypes["v"], "int64")

        z = pivot_table(f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64=2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({"a": ["cat", "bat", "cat", "bat"], "v": [1.5, 2.5, 3.5, 4.5], "i": ["a", "b", "a", "b"]})
        self.assertEqual(f.dtypes["v"], "float64")

        z = pivot_table(f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64=2))
        tm.assert_series_equal(result, expected)
コード例 #17
0
ファイル: test_pivot.py プロジェクト: wabu/pandas
    def test_pivot_dtypes(self):

        # can convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'int64')

        z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64 = 2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'float64')

        z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64 = 2))
        tm.assert_series_equal(result, expected)
コード例 #18
0
ファイル: test_pivot.py プロジェクト: fperez/pandas
    def test_pivot_table(self):
        rows = ['A', 'B']
        cols=  'C'
        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

        # this works
        pivot_table(self.data, values='D', rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, rows)
        else:
            self.assertEqual(table.index.name, rows[0])

        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
        assert_frame_equal(table, expected)
コード例 #19
0
 def test_pivot_table_with_margins_set_margin_name(self):
     # GH 3335
     for margin_name in ['foo', 'one', 666, None, ['a', 'b']]:
         with self.assertRaises(ValueError):
             # multi-index index
             pivot_table(self.data,
                         values='D',
                         index=['A', 'B'],
                         columns=['C'],
                         margins=True,
                         margins_name=margin_name)
         with self.assertRaises(ValueError):
             # multi-index column
             pivot_table(self.data,
                         values='D',
                         index=['C'],
                         columns=['A', 'B'],
                         margins=True,
                         margins_name=margin_name)
         with self.assertRaises(ValueError):
             # non-multi-index index/column
             pivot_table(self.data,
                         values='D',
                         index=['A'],
                         columns=['B'],
                         margins=True,
                         margins_name=margin_name)
コード例 #20
0
    def test_pivot_table(self):
        rows = ['A', 'B']
        cols = 'C'
        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

        table2 = self.data.pivot_table(values='D', rows=rows, cols=cols)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, rows)
        else:
            self.assertEqual(table.index.name, rows[0])

        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)
コード例 #21
0
ファイル: test_pivot.py プロジェクト: rishabh11/pandas
    def test_pivot_table(self):
        rows = ["A", "B"]
        cols = "C"
        table = pivot_table(self.data, values="D", rows=rows, cols=cols)

        table2 = self.data.pivot_table(values="D", rows=rows, cols=cols)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values="D", rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, rows)
        else:
            self.assertEqual(table.index.name, rows[0])

        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])["D"].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)
コード例 #22
0
ファイル: plotdata.py プロジェクト: freider/keiro
def metric_vs_property(
    data,
    metrics=('completion_time',),
    property='agent_parameter',
    aggfunc=np.mean,
):
    table = pivot_table(
        df,
        values=list(metrics),
        rows=[property],
        #cols=[metric],
        aggfunc=np.mean
    )
    return table
コード例 #23
0
    def frame(self, id, cursor=None, **kwargs):
        stacks = self.stack(id, cursor, **kwargs)

        logger.debug('Start Pivot stacks')
        t1 = Timer()
        stacks['value'] = True
        result = pivot_table(stacks,
                             'value',
                             'date',
                             'stock',
                             fill_value=False)
        result.columns = numpy.char.mod('%06d', result.columns)
        logger.debug('Finished pivoting in %s', t1)
        return result
コード例 #24
0
ファイル: cache_tasks.py プロジェクト: marks/polio
    def add_indicator_data_to_rc_df(self,rc_df, i_id):
        '''
        left join the region / campaign dataframe with the stored data for each
        campaign.
        '''
        column_header = ['region_id','campaign_id']
        column_header.append(i_id)

        indicator_df = DataFrame(list(DataPointComputed.objects.filter(
            indicator_id = i_id).values()))

        pivoted_indicator_df = pivot_table(indicator_df, values='value',\
            columns=['indicator_id'],index = ['region_id','campaign_id'])

        cleaned_df = pivoted_indicator_df.reset_index(level=[0,1], inplace=False)

        merged_df = rc_df.merge(cleaned_df,how='left')
        merged_df = merged_df.reset_index(drop=True)

        return merged_df
コード例 #25
0
ファイル: cache_tasks.py プロジェクト: vivihuang/polio
    def add_indicator_data_to_rc_df(self, rc_df, i_id):
        '''
        left join the region / campaign dataframe with the stored data for each
        campaign.
        '''
        column_header = ['region_id', 'campaign_id']
        column_header.append(i_id)

        indicator_df = DataFrame(
            list(DataPointComputed.objects.filter(indicator_id=i_id).values()))

        pivoted_indicator_df = pivot_table(indicator_df, values='value',\
            columns=['indicator_id'],index = ['region_id','campaign_id'])

        cleaned_df = pivoted_indicator_df.reset_index(level=[0, 1],
                                                      inplace=False)

        merged_df = rc_df.merge(cleaned_df, how='left')
        merged_df = merged_df.reset_index(drop=True)

        return merged_df
コード例 #26
0
ファイル: test_pivot.py プロジェクト: ChristopherShort/pandas
 def test_pivot_table_with_margins_set_margin_name(self):
     # GH 3335
     for margin_name in ["foo", "one", 666, None, ["a", "b"]]:
         with self.assertRaises(ValueError):
             # multi-index index
             pivot_table(
                 self.data, values="D", index=["A", "B"], columns=["C"], margins=True, margins_name=margin_name
             )
         with self.assertRaises(ValueError):
             # multi-index column
             pivot_table(
                 self.data, values="D", index=["C"], columns=["A", "B"], margins=True, margins_name=margin_name
             )
         with self.assertRaises(ValueError):
             # non-multi-index index/column
             pivot_table(self.data, values="D", index=["A"], columns=["B"], margins=True, margins_name=margin_name)
コード例 #27
0
 def test_pivot_table_with_margins_set_margin_name(self):
     # GH 3335
     for margin_name in ['foo', 'one', 666, None, ['a', 'b']]:
         with self.assertRaises(ValueError):
             # multi-index index
             pivot_table(self.data, values='D', index=['A', 'B'],
                         columns=['C'], margins=True,
                         margins_name=margin_name)
         with self.assertRaises(ValueError):
             # multi-index column
             pivot_table(self.data, values='D', index=['C'],
                         columns=['A', 'B'], margins=True,
                         margins_name=margin_name)
         with self.assertRaises(ValueError):
             # non-multi-index index/column
             pivot_table(self.data, values='D', index=['A'],
                         columns=['B'], margins=True,
                         margins_name=margin_name)
コード例 #28
0
    def test_pivot_timegrouper(self):
        df = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1),
                datetime(2013, 1, 1),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 10, 1),
                datetime(2013, 10, 2),
                datetime(2013, 12, 2),
                datetime(2013, 12, 2),
            ]
        }).set_index('Date')

        expected = DataFrame(np.array([10, 18, 3],
                                      dtype='int64').reshape(1, 3),
                             index=[datetime(2013, 12, 31)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df,
                             index=Grouper(freq='A'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='A'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        expected = DataFrame(
            np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
            index=[datetime(2013, 1, 1),
                   datetime(2013, 7, 1)],
            columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df,
                             index=Grouper(freq='6MS'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # passing the name
        df = df.reset_index()
        result = pivot_table(df,
                             index=Grouper(freq='6MS', key='Date'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS', key='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            KeyError, lambda: pivot_table(df,
                                          index=Grouper(freq='6MS', key='foo'),
                                          columns='Buyer',
                                          values='Quantity',
                                          aggfunc=np.sum))
        self.assertRaises(
            KeyError,
            lambda: pivot_table(df,
                                index='Buyer',
                                columns=Grouper(freq='6MS', key='foo'),
                                values='Quantity',
                                aggfunc=np.sum))

        # passing the level
        df = df.set_index('Date')
        result = pivot_table(df,
                             index=Grouper(freq='6MS', level='Date'),
                             columns='Buyer',
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index='Buyer',
                             columns=Grouper(freq='6MS', level='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            ValueError,
            lambda: pivot_table(df,
                                index=Grouper(freq='6MS', level='foo'),
                                columns='Buyer',
                                values='Quantity',
                                aggfunc=np.sum))
        self.assertRaises(
            ValueError,
            lambda: pivot_table(df,
                                index='Buyer',
                                columns=Grouper(freq='6MS', level='foo'),
                                values='Quantity',
                                aggfunc=np.sum))

        # double grouper
        df = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 11, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 11, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 12, 5, 14, 0)
            ],
            'PayDay': [
                datetime(2013, 10, 4, 0, 0),
                datetime(2013, 10, 15, 13, 5),
                datetime(2013, 9, 5, 20, 0),
                datetime(2013, 11, 2, 10, 0),
                datetime(2013, 10, 7, 20, 0),
                datetime(2013, 9, 5, 10, 0),
                datetime(2013, 12, 30, 12, 0),
                datetime(2013, 11, 20, 14, 0),
            ]
        })

        result = pivot_table(df,
                             index=Grouper(freq='M', key='Date'),
                             columns=Grouper(freq='M', key='PayDay'),
                             values='Quantity',
                             aggfunc=np.sum)
        expected = DataFrame(np.array([
            np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan,
            np.nan, np.nan, np.nan, 3, np.nan
        ]).reshape(4, 4),
                             index=[
                                 datetime(2013, 9, 30),
                                 datetime(2013, 10, 31),
                                 datetime(2013, 11, 30),
                                 datetime(2013, 12, 31)
                             ],
                             columns=[
                                 datetime(2013, 9, 30),
                                 datetime(2013, 10, 31),
                                 datetime(2013, 11, 30),
                                 datetime(2013, 12, 31)
                             ])
        expected.index.name = 'Date'
        expected.columns.name = 'PayDay'

        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index=Grouper(freq='M', key='PayDay'),
                             columns=Grouper(freq='M', key='Date'),
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        tuples = [
            (datetime(2013, 9, 30), datetime(2013, 10, 31)),
            (datetime(2013, 10, 31), datetime(2013, 9, 30)),
            (datetime(2013, 10, 31), datetime(2013, 11, 30)),
            (datetime(2013, 10, 31), datetime(2013, 12, 31)),
            (datetime(2013, 11, 30), datetime(2013, 10, 31)),
            (datetime(2013, 12, 31), datetime(2013, 11, 30)),
        ]
        idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
        expected = DataFrame(np.array(
            [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan,
             3]).reshape(6, 2),
                             index=idx,
                             columns=['A', 'B'])
        expected.columns.name = 'Branch'

        result = pivot_table(df,
                             index=[
                                 Grouper(freq='M', key='Date'),
                                 Grouper(freq='M', key='PayDay')
                             ],
                             columns=['Branch'],
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df,
                             index=['Branch'],
                             columns=[
                                 Grouper(freq='M', key='Date'),
                                 Grouper(freq='M', key='PayDay')
                             ],
                             values='Quantity',
                             aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)
コード例 #29
0
ファイル: pivot.py プロジェクト: pszostek/cp
def pivot(data_frames_dict, column_tuples, row_tuples, displayed_value_tuples, filters, ret, div_on_top=False, aggfunc=None):
    """ Returns a pivoted data frame

    data_frames_dict: a dictionary with csv paths as keys and Pandas.DataFrame as values
    column_tuples: ordered list of tuples (csv_path, name_of_chosen_column)
    row_tuples: ordered list of tuples (csv_path, name_of_chosen_column)
    """
    from collections import OrderedDict
    if aggfunc is None:
        aggfunc = 'sum'
    output_data_frames = OrderedDict()
    # make list of needed columns
    second_elem = lambda tup: tup[1]
    row_names = [second_elem(row_tup) for row_tup in row_tuples]
    column_names = [second_elem(row_tup) for row_tup in column_tuples]
    # all_columns = row_names + column_names

    fitlered_data_frames = _apply_filters(data_frames_dict, filters)

    # for each displayed dimension create a data frame, then merge them before displaying
    for displayed_value_tuple in displayed_value_tuples:
        csv_path, displayed_column_name = displayed_value_tuple

        df_column_dict = defaultdict(set)
        df_column_dict[csv_path].add(displayed_column_name)
        for csv_path, column_name in column_tuples:
            df_column_dict[csv_path].add(column_name)
        for csv_path, row_name in row_tuples:
            df_column_dict[csv_path].add(row_name)
        for csv_path, merit_name in displayed_value_tuples:
            df_column_dict[csv_path].add(merit_name)

        # if columns from different tables -> join on the columns with the same
        # name
        needed_data_frames = df_column_dict.keys()
        if len(needed_data_frames) > 1:
            data_frame = _merge(fitlered_data_frames, df_column_dict)
        else:
            data_frame = fitlered_data_frames[df_column_dict.keys()[0]].reset_index()

        chosen_columns = reduce(lambda x, y: x.union(y), df_column_dict.values())
       # data_frame.to_csv("./merged.csv")

        try:
            data_frame_with_dropped_columns = _drop(data_frame=data_frame,
                                                    chosen_columns=chosen_columns)
        except Exception, e:
            raise DropException(str(e))

        column_to_hex(data_frame, 'bb')
        column_to_hex(data_frame, 'bb.1')

        from pandas.tools.pivot import pivot_table
        data_frame = pivot_table(data_frame_with_dropped_columns,
                              values=displayed_column_name,
                              rows=row_names,
                              cols=column_names,
                              fill_value=0,
                              aggfunc=aggfunc)
        output_data_frames[displayed_value_tuple] = data_frame
        print(data_frame)
        print("\n\n")
コード例 #30
0
def runtests(var_file, out_file):
    pvalue_lines = []

    input_file = ""
    with open(src_file, "rb") as sfile:
        for line in sfile.readlines():
            input_file = os.path.abspath(line.strip())

    model_variables = []
    with open(var_file, "rb") as vfile:
        for line in vfile.readlines():
            line = line.strip()
            if not line: continue
            model_variables.append(line.split()[0])

    range_variables = [] 
    with open(range_file, "rb") as rfile:
        for line in rfile.readlines():
            line = line.strip()
            if not line: continue
            parts = line.strip().split()
            if 2 < len(parts):
                range_variables.append({"name":parts[0], "type":parts[1], "range":parts[2].split(",")})

    ignore_records = []
    with open(ignore_file, "rb") as rfile:
        for line in rfile.readlines():
            line = line.strip()
            if not line: continue
            ignore_records.append(line)

    all_data = {}
    for var in model_variables:
        all_data[var] = []
    with open(input_file, "rb") as ifile:
        reader = csv.reader(ifile)
        titles = reader.next()
        model_idx = [titles.index(var) for var in model_variables]
        r0 = 0
        r = 0
        for row in reader:
            if row[0] in ignore_records: continue

            r0 += 1 # Starts at 1, because of titles
            all_missing = True
            some_missing = False
            missing_dvar = row[model_idx[0]] == "\\N"
            for i in range(1, len(model_variables)):
                var_idx = model_idx[i]
                if row[var_idx] == "\\N":
                    some_missing = True
                else:
                    all_missing = False

            inside_range = True
            for var in range_variables:
                idx = titles.index(var["name"])
                val = row[idx]
                if val == "\\N": continue
                vtype = var["type"]
                vrang = var["range"]
                test = True
                if vtype == "category":
                    test = val in vrang
                else:
                    test = float(vrang[0]) <= float(val) and float(val) < float(vrang[1])
                inside_range = inside_range and test

            if not all_missing and not missing_dvar and inside_range:
                for i in range(0, len(model_variables)):
                    var = model_variables[i]
                    idx = model_idx[i]
                    try:
                        val = float(row[idx])
                    except ValueError:
                        val = np.NaN
                    all_data[var].append(val)
                r += 1

    data = pd.DataFrame(all_data)

    cat_vars = []
    num_vars = []
    count = 0
    with open(var_file, "rb") as vfile:
        for line in vfile.readlines():
            line = line.strip()
            if not line: continue
            parts = line.split()
            if count == 0: 
                dvar = parts[0]
                count = 1
                continue
            count += 1
            if parts[1] == "category":
               cat_vars.append(parts[0])
            else:
                num_vars.append(parts[0])

    for var in cat_vars:
        print "***********************"
        print var
        dat = data.loc[:,(var, dvar)]
        dat["VALUES"] = pd.Series(np.ones(len(dat[var])), index=dat.index)
        # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html
        counts = pivot_table(dat, values="VALUES", index=[var], columns=[dvar], aggfunc=np.sum, fill_value=0)
        fisher_ratio, fisher_pvalue = fisher_exact(counts)
        print counts
        pvalue_lines.append(var + "\t" + str(fisher_pvalue))
        print fisher_ratio, fisher_pvalue

    for var in num_vars:
        values0 = data[data[dvar] == 0][var]
        values1 = data[data[dvar] == 1][var]

        values0 = values0[~np.isnan(values0)]
        values1 = values1[~np.isnan(values1)]

        print "***********************"
        print var
        print "mean/std for",dvar,"0:",np.mean(values0), "/", np.std(values0)
        print "mean/std for",dvar,"1:",np.mean(values1), "/", np.std(values1)
        ttest_stat, ttest_pvalue = ttest_ind(values0, values1, equal_var=False)
        print "Means are different at p-value",ttest_pvalue
        pvalue_lines.append(var + "\t" + str(ttest_pvalue))

    with open(out_file, "w") as ofile:
        for line in pvalue_lines:
            ofile.write(line + "\n")
コード例 #31
0
ファイル: test_pivot.py プロジェクト: ChristopherShort/pandas
 def test_pivot_multi_values(self):
     result = pivot_table(self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0)
     expected = pivot_table(self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0)
     tm.assert_frame_equal(result, expected)
コード例 #32
0
 def test_pivot_table_multiple(self):
     rows = ['A', 'B']
     cols=  'C'
     table = pivot_table(self.data, rows=rows, cols=cols)
     expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
     tm.assert_frame_equal(table, expected)
コード例 #33
0
ファイル: test_pivot.py プロジェクト: ChristopherShort/pandas
    def test_pivot_dtaccessor(self):
        # GH 8103
        dates1 = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        dates2 = [
            "2013-01-01 15:00:00",
            "2013-01-01 15:00:00",
            "2013-01-01 15:00:00",
            "2013-02-01 15:00:00",
            "2013-02-01 15:00:00",
            "2013-02-01 15:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "dt1": dates1,
                "dt2": dates2,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d))
        df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d))

        result = pivot_table(df, index="label", columns=df["dt1"].dt.hour, values="value1")

        exp_idx = Index(["a", "b"], name="label")
        expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=exp_idx, columns=Index([7, 8, 9], name="dt1"))
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1")

        expected = DataFrame(
            {7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=Index([1, 2], name="dt2"), columns=Index([7, 8, 9], name="dt1")
        )
        tm.assert_frame_equal(result, expected)

        result = pivot_table(
            df, index=df["dt2"].dt.year.values, columns=[df["dt1"].dt.hour, df["dt2"].dt.month], values="value1"
        )

        exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"])
        expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(
            df,
            index=np.array(["X", "X", "X", "X", "Y", "Y"]),
            columns=[df["dt1"].dt.hour, df["dt2"].dt.month],
            values="value1",
        )
        expected = DataFrame(
            np.array([[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]]),
            index=["X", "Y"],
            columns=exp_col,
        )
        tm.assert_frame_equal(result, expected)
コード例 #34
0
ファイル: test_pivot.py プロジェクト: benracine/pandas
 def test_pivot_multi_values(self):
     result = pivot_table(self.data, values=['D', 'E'],
                          rows='A', cols=['B', 'C'])
     expected = pivot_table(self.data.drop(['F'], axis=1),
                            rows='A', cols=['B', 'C'])
     assert_frame_equal(result, expected)
コード例 #35
0
ファイル: test_pivot.py プロジェクト: wabu/pandas
 def test_pivot_table_multiple(self):
     index = ['A', 'B']
     columns = 'C'
     table = pivot_table(self.data, index=index, columns=columns)
     expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
     tm.assert_frame_equal(table, expected)
コード例 #36
0
 def test_pivot_table_multiple(self):
     rows = ['A', 'B']
     cols = 'C'
     table = pivot_table(self.data, rows=rows, cols=cols)
     expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
     tm.assert_frame_equal(table, expected)
コード例 #37
0
ファイル: test_pivot.py プロジェクト: wabu/pandas
 def test_pivot_multi_values(self):
     result = pivot_table(self.data, values=['D', 'E'],
                          index='A', columns=['B', 'C'], fill_value=0)
     expected = pivot_table(self.data.drop(['F'], axis=1),
                            index='A', columns=['B', 'C'], fill_value=0)
     tm.assert_frame_equal(result, expected)
コード例 #38
0
def runtests(var_file, out_file):
    pvalue_lines = []

    input_file = ""
    with open(src_file, "rb") as sfile:
        for line in sfile.readlines():
            input_file = os.path.abspath(line.strip())

    model_variables = []
    with open(var_file, "rb") as vfile:
        for line in vfile.readlines():
            line = line.strip()
            if not line: continue
            model_variables.append(line.split()[0])

    range_variables = []
    with open(range_file, "rb") as rfile:
        for line in rfile.readlines():
            line = line.strip()
            if not line: continue
            parts = line.strip().split()
            if 2 < len(parts):
                range_variables.append({
                    "name": parts[0],
                    "type": parts[1],
                    "range": parts[2].split(",")
                })

    ignore_records = []
    with open(ignore_file, "rb") as rfile:
        for line in rfile.readlines():
            line = line.strip()
            if not line: continue
            ignore_records.append(line)

    all_data = {}
    for var in model_variables:
        all_data[var] = []
    with open(input_file, "rb") as ifile:
        reader = csv.reader(ifile)
        titles = reader.next()
        model_idx = [titles.index(var) for var in model_variables]
        r0 = 0
        r = 0
        for row in reader:
            if row[0] in ignore_records: continue

            r0 += 1  # Starts at 1, because of titles
            all_missing = True
            some_missing = False
            missing_dvar = row[model_idx[0]] == "\\N"
            for i in range(1, len(model_variables)):
                var_idx = model_idx[i]
                if row[var_idx] == "\\N":
                    some_missing = True
                else:
                    all_missing = False

            inside_range = True
            for var in range_variables:
                idx = titles.index(var["name"])
                val = row[idx]
                if val == "\\N": continue
                vtype = var["type"]
                vrang = var["range"]
                test = True
                if vtype == "category":
                    test = val in vrang
                else:
                    test = float(
                        vrang[0]) <= float(val) and float(val) < float(
                            vrang[1])
                inside_range = inside_range and test

            if not all_missing and not missing_dvar and inside_range:
                for i in range(0, len(model_variables)):
                    var = model_variables[i]
                    idx = model_idx[i]
                    try:
                        val = float(row[idx])
                    except ValueError:
                        val = np.NaN
                    all_data[var].append(val)
                r += 1

    data = pd.DataFrame(all_data)

    cat_vars = []
    num_vars = []
    count = 0
    with open(var_file, "rb") as vfile:
        for line in vfile.readlines():
            line = line.strip()
            if not line: continue
            parts = line.split()
            if count == 0:
                dvar = parts[0]
                count = 1
                continue
            count += 1
            if parts[1] == "category":
                cat_vars.append(parts[0])
            else:
                num_vars.append(parts[0])

    for var in cat_vars:
        print "***********************"
        print var
        dat = data.loc[:, (var, dvar)]
        dat["VALUES"] = pd.Series(np.ones(len(dat[var])), index=dat.index)
        # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html
        counts = pivot_table(dat,
                             values="VALUES",
                             index=[var],
                             columns=[dvar],
                             aggfunc=np.sum,
                             fill_value=0)
        fisher_ratio, fisher_pvalue = fisher_exact(counts)
        print counts
        pvalue_lines.append(var + "\t" + str(fisher_pvalue))
        print fisher_ratio, fisher_pvalue

    for var in num_vars:
        values0 = data[data[dvar] == 0][var]
        values1 = data[data[dvar] == 1][var]

        values0 = values0[~np.isnan(values0)]
        values1 = values1[~np.isnan(values1)]

        print "***********************"
        print var
        print "mean/std for", dvar, "0:", np.mean(values0), "/", np.std(
            values0)
        print "mean/std for", dvar, "1:", np.mean(values1), "/", np.std(
            values1)
        ttest_stat, ttest_pvalue = ttest_ind(values0, values1, equal_var=False)
        print "Means are different at p-value", ttest_pvalue
        pvalue_lines.append(var + "\t" + str(ttest_pvalue))

    with open(out_file, "w") as ofile:
        for line in pvalue_lines:
            ofile.write(line + "\n")
コード例 #39
0
ファイル: test_pivot.py プロジェクト: wabu/pandas
    def test_pivot_timegrouper(self):
        df = DataFrame({
            'Branch' : 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1),
                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
                      datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date')

        expected = DataFrame(np.array([10, 18, 3],dtype='int64').reshape(1, 3),
                             index=[datetime.datetime(2013, 12, 31)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result,expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result,expected.T)

        expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
                             index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # passing the name
        df = df.reset_index()
        result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'),
                          columns='Buyer', values='Quantity', aggfunc=np.sum))
        self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer',
                          columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum))

        # passing the level
        df = df.set_index('Date')
        result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'),
                          columns='Buyer', values='Quantity', aggfunc=np.sum))
        self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer',
                          columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum))

        # double grouper
        df = DataFrame({
            'Branch' : 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1,3,5,1,8,1,9,3],
            'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5),
                      datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0),
                      datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0),
                      datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)],
            'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5),
                        datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0),
                        datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0),
                        datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]})

        result = pivot_table(df, index=Grouper(freq='M', key='Date'),
                             columns=Grouper(freq='M', key='PayDay'),
                             values='Quantity', aggfunc=np.sum)
        expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9,
                                       np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4),
                             index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
                                    datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)],
                             columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
                                    datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)])
        expected.index.name = 'Date'
        expected.columns.name = 'PayDay'

        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=Grouper(freq='M', key='PayDay'),
                             columns=Grouper(freq='M', key='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)),
                  (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)),
                  (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),]
        idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
        expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan,
                                       9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2),
                             index=idx, columns=['A', 'B'])
        expected.columns.name = 'Branch'

        result = pivot_table(df, index=[Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')], columns=['Branch'],
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')],
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)
コード例 #40
0
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title(u'업종분포', fontproperties=fontprop)
pos = arange(20)
pos = pos[::-1] # reverse pos list
plt.yticks(pos, [x.decode('utf8') for x in top20.index], fontproperties=fontprop)
plt.barh(pos, top20.values, align='center', color=colors_list, alpha=0.7)
# plt.show()

df_semi = df[df['sector']=='소프트웨어']
print(df_semi.head(10))

#시가총액 합산
from pandas.tools.pivot import pivot_table
ttable = df[['sector', 'marcap']]
piv = pivot_table(ttable, values='marcap', rows=['sector'], aggfunc=np.sum)
sector_marcap = piv.copy()
sector_marcap.sort(ascending=False)
print(sector_marcap[:10])

#업종별 시가총액 차트
import matplotlib.pylab as plt
import matplotlib.font_manager as fm
fontprop = fm.FontProperties(fname="fonts/malgun.ttf")

top20 = sector_marcap[0:20]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title(u'업종별 시가총액', fontproperties=fontprop)
pos = arange(20)
pos = pos[::-1] # reverse pos list
コード例 #41
0
ファイル: test_pivot.py プロジェクト: ChristopherShort/pandas
    def test_pivot_timegrouper(self):
        df = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 1, 1),
                    datetime(2013, 1, 1),
                    datetime(2013, 10, 1),
                    datetime(2013, 10, 2),
                    datetime(2013, 10, 1),
                    datetime(2013, 10, 2),
                    datetime(2013, 12, 2),
                    datetime(2013, 12, 2),
                ],
            }
        ).set_index("Date")

        expected = DataFrame(
            np.array([10, 18, 3], dtype="int64").reshape(1, 3),
            index=[datetime(2013, 12, 31)],
            columns="Carl Joe Mark".split(),
        )
        expected.index.name = "Date"
        expected.columns.name = "Buyer"

        result = pivot_table(df, index=Grouper(freq="A"), columns="Buyer", values="Quantity", aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index="Buyer", columns=Grouper(freq="A"), values="Quantity", aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        expected = DataFrame(
            np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
            index=[datetime(2013, 1, 1), datetime(2013, 7, 1)],
            columns="Carl Joe Mark".split(),
        )
        expected.index.name = "Date"
        expected.columns.name = "Buyer"

        result = pivot_table(df, index=Grouper(freq="6MS"), columns="Buyer", values="Quantity", aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index="Buyer", columns=Grouper(freq="6MS"), values="Quantity", aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # passing the name
        df = df.reset_index()
        result = pivot_table(
            df, index=Grouper(freq="6MS", key="Date"), columns="Buyer", values="Quantity", aggfunc=np.sum
        )
        tm.assert_frame_equal(result, expected)

        result = pivot_table(
            df, index="Buyer", columns=Grouper(freq="6MS", key="Date"), values="Quantity", aggfunc=np.sum
        )
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            KeyError,
            lambda: pivot_table(
                df, index=Grouper(freq="6MS", key="foo"), columns="Buyer", values="Quantity", aggfunc=np.sum
            ),
        )
        self.assertRaises(
            KeyError,
            lambda: pivot_table(
                df, index="Buyer", columns=Grouper(freq="6MS", key="foo"), values="Quantity", aggfunc=np.sum
            ),
        )

        # passing the level
        df = df.set_index("Date")
        result = pivot_table(
            df, index=Grouper(freq="6MS", level="Date"), columns="Buyer", values="Quantity", aggfunc=np.sum
        )
        tm.assert_frame_equal(result, expected)

        result = pivot_table(
            df, index="Buyer", columns=Grouper(freq="6MS", level="Date"), values="Quantity", aggfunc=np.sum
        )
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(
            ValueError,
            lambda: pivot_table(
                df, index=Grouper(freq="6MS", level="foo"), columns="Buyer", values="Quantity", aggfunc=np.sum
            ),
        )
        self.assertRaises(
            ValueError,
            lambda: pivot_table(
                df, index="Buyer", columns=Grouper(freq="6MS", level="foo"), values="Quantity", aggfunc=np.sum
            ),
        )

        # double grouper
        df = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 11, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 11, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 2, 12, 0),
                    datetime(2013, 12, 5, 14, 0),
                ],
                "PayDay": [
                    datetime(2013, 10, 4, 0, 0),
                    datetime(2013, 10, 15, 13, 5),
                    datetime(2013, 9, 5, 20, 0),
                    datetime(2013, 11, 2, 10, 0),
                    datetime(2013, 10, 7, 20, 0),
                    datetime(2013, 9, 5, 10, 0),
                    datetime(2013, 12, 30, 12, 0),
                    datetime(2013, 11, 20, 14, 0),
                ],
            }
        )

        result = pivot_table(
            df,
            index=Grouper(freq="M", key="Date"),
            columns=Grouper(freq="M", key="PayDay"),
            values="Quantity",
            aggfunc=np.sum,
        )
        expected = DataFrame(
            np.array(
                [np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]
            ).reshape(4, 4),
            index=[datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31)],
            columns=[datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31)],
        )
        expected.index.name = "Date"
        expected.columns.name = "PayDay"

        tm.assert_frame_equal(result, expected)

        result = pivot_table(
            df,
            index=Grouper(freq="M", key="PayDay"),
            columns=Grouper(freq="M", key="Date"),
            values="Quantity",
            aggfunc=np.sum,
        )
        tm.assert_frame_equal(result, expected.T)

        tuples = [
            (datetime(2013, 9, 30), datetime(2013, 10, 31)),
            (datetime(2013, 10, 31), datetime(2013, 9, 30)),
            (datetime(2013, 10, 31), datetime(2013, 11, 30)),
            (datetime(2013, 10, 31), datetime(2013, 12, 31)),
            (datetime(2013, 11, 30), datetime(2013, 10, 31)),
            (datetime(2013, 12, 31), datetime(2013, 11, 30)),
        ]
        idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"])
        expected = DataFrame(
            np.array([3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2),
            index=idx,
            columns=["A", "B"],
        )
        expected.columns.name = "Branch"

        result = pivot_table(
            df,
            index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")],
            columns=["Branch"],
            values="Quantity",
            aggfunc=np.sum,
        )
        tm.assert_frame_equal(result, expected)

        result = pivot_table(
            df,
            index=["Branch"],
            columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")],
            values="Quantity",
            aggfunc=np.sum,
        )
        tm.assert_frame_equal(result, expected.T)
コード例 #42
0
 def test_pivot_table_multiple(self):
     index = ['A', 'B']
     columns = 'C'
     table = pivot_table(self.data, index=index, columns=columns)
     expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
     tm.assert_frame_equal(table, expected)