Code Example #1
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
        df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df["A"] = df["A"].astype(np.int16)
        df["B"] = df["B"].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected["A"] = expected["A"].astype(np.int16)
        expected["B"] = expected["B"].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.float64)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)
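The test above exercises `DataFrame.unstack(fill_value=...)`. As a quick illustration of the same behaviour outside the test harness, a minimal sketch (assuming a recent pandas; not part of the quoted file):

import pandas as pd

# Two-level index in which ("y", "a") and ("z", "b") do not occur.
idx = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])
s = pd.Series([1, 2, 3, 4], dtype="int32", index=idx)

print(s.unstack())               # holes become NaN and the dtype is upcast to float
print(s.unstack(fill_value=-1))  # holes become -1 and the integer dtype is preserved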
Code Example #2
File: test_reshape.py  Project: dmjvictory/pandas
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.float64)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)
Code Example #3
File: reshape.py  Project: brianholland/pandas
class Unstack(object):

    params = ['int', 'category']

    def setup(self, dtype):
        m = 100
        n = 1000

        levels = np.arange(m)
        index = MultiIndex.from_product([levels] * 2)
        columns = np.arange(n)
        if dtype == 'int':
            values = np.arange(m * m * n).reshape(m * m, n)
        else:
            # the category branch is ~20x slower than int. So we
            # cut down the size a bit. Now it's only ~3x slower.
            n = 50
            columns = columns[:n]
            indices = np.random.randint(0, 52, size=(m * m, n))
            values = np.take(list(string.ascii_letters), indices)
            values = [pd.Categorical(v) for v in values.T]

        self.df = DataFrame(values, index, columns)
        self.df2 = self.df.iloc[:-1]

    def time_full_product(self, dtype):
        self.df.unstack()

    def time_without_last_row(self, dtype):
        self.df2.unstack()
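The `Unstack` class above is an asv (airspeed velocity) benchmark: asv instantiates it, calls `setup()` for each entry in `params`, and then times each `time_*` method. A rough way to exercise it by hand, shown only for the 'int' parameter and assuming the benchmark module's own imports (numpy, pandas, `DataFrame`, `MultiIndex`, `string`) are available:

bench = Unstack()
bench.setup('int')
bench.time_full_product('int')      # unstack a frame whose index is a full product
bench.time_without_last_row('int')  # same, with the last row dropped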
Code Example #4
    def test_unstack_non_unique_index_names(self):
        idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"])
        df = DataFrame([1, 2], index=idx)
        with tm.assertRaises(ValueError):
            df.unstack("c1")

        with tm.assertRaises(ValueError):
            df.T.stack("c1")
Code Example #5
File: test_reshape.py  Project: DusanMilunovic/pandas
    def test_unstack_non_unique_index_names(self):
        idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
                                     names=['c1', 'c1'])
        df = DataFrame([1, 2], index=idx)
        with pytest.raises(ValueError):
            df.unstack('c1')

        with pytest.raises(ValueError):
            df.T.stack('c1')
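Both variants of this test assert that pandas refuses to unstack or stack by a level name that occurs more than once. A standalone sketch of the error (assuming a recent pandas):

import pandas as pd

idx = pd.MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"])
df = pd.DataFrame([1, 2], index=idx)
try:
    df.unstack("c1")
except ValueError as err:
    # The level name "c1" is ambiguous, so it cannot be used to select a level.
    print(err)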
Code Example #6
File: test_reshape.py  Project: ivannz/pandas
    def test_stack_unstack(self):
        stacked = self.frame.stack()
        stacked_df = DataFrame({'foo': stacked, 'bar': stacked})

        unstacked = stacked.unstack()
        unstacked_df = stacked_df.unstack()

        assert_frame_equal(unstacked, self.frame)
        assert_frame_equal(unstacked_df['bar'], self.frame)

        unstacked_cols = stacked.unstack(0)
        unstacked_cols_df = stacked_df.unstack(0)
        assert_frame_equal(unstacked_cols.T, self.frame)
        assert_frame_equal(unstacked_cols_df['bar'].T, self.frame)
Code Example #7
File: reshape.py  Project: brianholland/pandas
class SparseIndex(object):

    def setup(self):
        NUM_ROWS = 1000
        self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS),
                             'B': np.random.randint(50, size=NUM_ROWS),
                             'C': np.random.randint(-10, 10, size=NUM_ROWS),
                             'D': np.random.randint(-10, 10, size=NUM_ROWS),
                             'E': np.random.randint(10, size=NUM_ROWS),
                             'F': np.random.randn(NUM_ROWS)})
        self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E'])

    def time_unstack(self):
        self.df.unstack()
Code Example #8
    def test_stack_unstack(self):
        stacked = self.frame.stack()
        stacked_df = DataFrame({"foo": stacked, "bar": stacked})

        unstacked = stacked.unstack()
        unstacked_df = stacked_df.unstack()

        assert_frame_equal(unstacked, self.frame)
        assert_frame_equal(unstacked_df["bar"], self.frame)

        unstacked_cols = stacked.unstack(0)
        unstacked_cols_df = stacked_df.unstack(0)
        assert_frame_equal(unstacked_cols.T, self.frame)
        assert_frame_equal(unstacked_cols_df["bar"].T, self.frame)
Code Example #9
File: reshape.py  Project: brianholland/pandas
class SimpleReshape(object):

    def setup(self):
        arrays = [np.arange(100).repeat(100),
                  np.roll(np.tile(np.arange(100), 100), 25)]
        index = MultiIndex.from_arrays(arrays)
        self.df = DataFrame(np.random.randn(10000, 4), index=index)
        self.udf = self.df.unstack(1)

    def time_stack(self):
        self.udf.stack()

    def time_unstack(self):
        self.df.unstack(1)
Code Example #10
    def test_boxplot_legacy(self):
        grouped = self.hist_df.groupby(by='gender')
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            axes = _check_plot_works(grouped.boxplot, return_type='axes')
        self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2))

        axes = _check_plot_works(grouped.boxplot, subplots=False,
                                 return_type='axes')
        self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
        tuples = lzip(string.ascii_letters[:10], range(10))
        df = DataFrame(np.random.rand(10, 3),
                       index=MultiIndex.from_tuples(tuples))

        grouped = df.groupby(level=1)
        axes = _check_plot_works(grouped.boxplot, return_type='axes')
        self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3))

        axes = _check_plot_works(grouped.boxplot, subplots=False,
                                 return_type='axes')
        self._check_axes_shape(axes, axes_num=1, layout=(1, 1))

        grouped = df.unstack(level=1).groupby(level=0, axis=1)
        axes = _check_plot_works(grouped.boxplot, return_type='axes')
        self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2))

        axes = _check_plot_works(grouped.boxplot, subplots=False,
                                 return_type='axes')
        self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
Code Example #11
File: test_reshape.py  Project: dmjvictory/pandas
    def test_stack_unstack(self):
        df = self.frame.copy()
        df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)

        stacked = df.stack()
        stacked_df = DataFrame({'foo': stacked, 'bar': stacked})

        unstacked = stacked.unstack()
        unstacked_df = stacked_df.unstack()

        assert_frame_equal(unstacked, df)
        assert_frame_equal(unstacked_df['bar'], df)

        unstacked_cols = stacked.unstack(0)
        unstacked_cols_df = stacked_df.unstack(0)
        assert_frame_equal(unstacked_cols.T, df)
        assert_frame_equal(unstacked_cols_df['bar'].T, df)
Code Example #12
 def test_unstack_bool(self):
     df = DataFrame([False, False], index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), columns=["col"])
     rs = df.unstack()
     xp = DataFrame(
         np.array([[False, np.nan], [np.nan, False]], dtype=object),
         index=["a", "b"],
         columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
     )
     assert_frame_equal(rs, xp)
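The expected frame in this test is object dtype because unstacking introduces missing cells that a bool column cannot hold. A minimal sketch of that upcast (assuming a recent pandas):

import pandas as pd

df = pd.DataFrame([False, False],
                  index=pd.MultiIndex.from_arrays([["a", "b"], ["c", "l"]]),
                  columns=["col"])
# ("a", "l") and ("b", "c") do not exist, so the result mixes booleans and NaN
# and the unstacked columns come back as object dtype.
print(df.unstack())
print(df.unstack().dtypes)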
Code Example #13
 def test_boxplot_legacy3(self):
     tuples = zip(string.ascii_letters[:10], range(10))
     df = DataFrame(np.random.rand(10, 3),
                    index=MultiIndex.from_tuples(tuples))
     grouped = df.unstack(level=1).groupby(level=0, axis=1)
     with tm.assert_produces_warning(UserWarning):
         axes = _check_plot_works(grouped.boxplot, return_type='axes')
     self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2))
     axes = _check_plot_works(grouped.boxplot, subplots=False,
                              return_type='axes')
     self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
Code Example #14
File: test_reshape.py  Project: dmjvictory/pandas
 def test_unstack_bool(self):
     df = DataFrame([False, False],
                    index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
                    columns=['col'])
     rs = df.unstack()
     xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
                             dtype=object),
                    index=['a', 'b'],
                    columns=MultiIndex.from_arrays([['col', 'col'],
                                                    ['c', 'l']]))
     assert_frame_equal(rs, xp)
Code Example #15
File: reshape.py  Project: Michael-E-Rose/pandas
class Unstack(object):

    goal_time = 0.2

    def setup(self):
        m = 100
        n = 1000

        levels = np.arange(m)
        index = MultiIndex.from_product([levels] * 2)
        columns = np.arange(n)
        values = np.arange(m * m * n).reshape(m * m, n)
        self.df = DataFrame(values, index, columns)
        self.df2 = self.df.iloc[:-1]

    def time_full_product(self):
        self.df.unstack()

    def time_without_last_row(self):
        self.df2.unstack()
Code Example #16
 def test_boxplot_legacy3(self):
     tuples = zip(string.ascii_letters[:10], range(10))
     df = DataFrame(np.random.rand(10, 3),
                    index=MultiIndex.from_tuples(tuples))
     grouped = df.unstack(level=1).groupby(level=0, axis=1)
     with tm.assert_produces_warning(UserWarning):
         axes = _check_plot_works(grouped.boxplot, return_type="axes")
     self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2))
     axes = _check_plot_works(grouped.boxplot,
                              subplots=False,
                              return_type="axes")
     self._check_axes_shape(axes, axes_num=1, layout=(1, 1))
Code Example #17
def read_gmt(filepath,
             gene_sets=(),
             drop_description=True,
             save_clean=False,
             collapse=False):
    """
    Read GMT.
    :param filepath: str; filepath to a .gmt file
    :param gene_sets: iterable; list of gene set names to keep
    :param drop_description: bool; drop Description column (2nd column) or not
    :param save_clean: bool; Save as .gmt (cleaned version) or not
    :param collapse: bool; collapse into a list of unique genes or not
    :return: DataFrame or list; (n_gene_sets, size of the largest gene set) or (n_unique genes)
    """

    # Parse
    rows = []
    with open(filepath) as f:
        for line in f.readlines():
            line_split = line.strip().split('\t')
            # Sort genes and add as a GMT gene set (row)
            rows.append(line_split[:2] +
                        sorted([g for g in line_split[2:] if g]))

    # Make a DataFrame
    gmt = DataFrame(rows)

    # Set index
    gmt.set_index(0, inplace=True)
    gmt.index.name = 'Gene Set'
    gmt.sort_index(inplace=True)
    gmt.columns = ['Description'
                   ] + ['Gene {}'.format(i) for i in range(1, gmt.shape[1])]

    if save_clean:  # Save the cleaned version
        gmt.to_csv(filepath, sep='\t', header=False)

    if drop_description or collapse:
        gmt.drop('Description', axis=1, inplace=True)

    # Keep specific gene sets
    if isinstance(gene_sets, str):
        gene_sets = [gene_sets]
    if any(gene_sets):
        gene_sets = sorted(set(gmt.index) & set(gene_sets))
        gmt = gmt.ix[gene_sets, :]
        gmt.dropna(axis=1, how='all', inplace=True)

    if collapse:
        return sorted(set(gmt.unstack().dropna()))
    else:
        return gmt
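A hedged usage sketch for `read_gmt()` above; the file name is a placeholder, not a file that ships with this code:

# One row per gene set; after the Description column is dropped (the default),
# the remaining columns are the genes of each set.
gene_set_table = read_gmt("hallmark.gmt")
print(gene_set_table.shape)

# Collapse instead to a flat, sorted list of unique genes across all sets.
unique_genes = read_gmt("hallmark.gmt", collapse=True)
print(len(unique_genes))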
Code Example #18
 def test_unstack_bool(self):
     df = DataFrame(
         [False, False],
         index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]),
         columns=["col"],
     )
     rs = df.unstack()
     xp = DataFrame(
         np.array([[False, np.nan], [np.nan, False]], dtype=object),
         index=["a", "b"],
         columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
     )
     tm.assert_frame_equal(rs, xp)
Code Example #19
def hierachicalIndexingDataFrame():
    # Allows two or more index levels on an axis, providing a way to work with higher-dimensional data in a lower-dimensional form
    df = DataFrame(np.arange(12).reshape(4,3),
                   index=[['a','a','b','b'],[1,2,1,2]],
                   columns=[['Ohio','Ohio','Colorado'],
                            ['Green', 'Red','Green']])
    print (df)
    print (df['Ohio'])
    print (df.unstack())
    df.index.names = ['key1','key2']
    df.columns.names = ['state','color']
    print (df)
    print (df.swaplevel('key1','key2'))
Code Example #20
    def test_unstack_to_series(self):
        # check reversibility
        data = self.frame.unstack()

        self.assertTrue(isinstance(data, Series))
        undo = data.unstack().T
        assert_frame_equal(undo, self.frame)

        # check NA handling
        data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
        data.index = Index(["a", "b", "c"])
        result = data.unstack()

        midx = MultiIndex(levels=[["x", "y"], ["a", "b", "c"]], labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)

        assert_series_equal(result, expected)

        # check composability of unstack
        old_data = data.copy()
        for _ in range(4):
            data = data.unstack()
        assert_frame_equal(old_data, data)
Code Example #21
File: test_reshape.py  Project: botplex/pandas
    def test_unstack_mixed_type_name_in_multiindex(self, unstack_idx,
                                                   expected_values,
                                                   expected_index,
                                                   expected_columns):
        # GH 19966
        idx = pd.MultiIndex.from_product([["a", "b"], [1, 2], [3, 4]],
                                         names=[("A", "a"), "B", "C"])
        df = DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx)
        result = df.unstack(unstack_idx)

        expected = DataFrame(expected_values,
                             columns=expected_columns,
                             index=expected_index)
        tm.assert_frame_equal(result, expected)
Code Example #22
def slide_8():
    data = DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['Ohio', 'Colorado'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'], name='number'))
    print(data)
    result = data.stack()
    print('***stack()***')
    print(result)
    print('***unstack()***')
    print(result.unstack())

    print('***unstack(0)***')
    print(result.unstack(0))

    print("***unstack('state')***")
    print(result.unstack('state'))

    s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
    s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
    data2 = pd.concat([s1, s2], keys=['one', 'two'])
    print('***unstack***')
    print(data2.unstack())
    print('***unstack->stack***')
    print(data2.unstack().stack())
    print('***unstack->stack(dropna)***')
    print(data2.unstack().stack(dropna=False))

    df = DataFrame({'left': result, 'right': result + 5},
                   columns=pd.Index(['left', 'right'],
                                    name='side'))
    print('df')
    print(df)

    print("unstack('state')")
    print(df.unstack('state'))
    print("unstack('state').stack('side')")
    print(df.unstack('state').stack('side'))
Code Example #23
File: eda.py  Project: talaikis/trading_qlearning2
def pca_results(good_data, pca):
    '''
    Create a DataFrame of the PCA results, including dimension feature weights
    and explained variance, and visualize the PCA results.
    :param good_data: DataFrame; the full log-transformed dataset with 6 columns
    :param pca: sklearn object; an already-fitted PCA decomposition object
    '''
    # Dimension indexing
    dimensions = ['Dimension {}'.format(i)
                  for i in range(1, len(pca.components_) + 1)]

    # PCA components
    components = DataFrame(round(pca.components_, 4),
                           columns=list(good_data.keys()))
    components.index = dimensions

    # PCA explained variance
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = DataFrame(round(ratios, 4),
                                columns=['Explained Variance'])
    variance_ratios.index = dimensions

    # reshape the data to be plotted
    df_aux = components.unstack().reset_index()
    df_aux.columns = ['Feature', 'Dimension', 'Variance']

    # Create a bar plot visualization
    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot the feature weights as a function of the components
    barplot(x='Dimension', y='Variance', hue='Feature', data=df_aux, ax=ax)
    ax.set_ylabel('Feature Weights')
    ax.set_xlabel('')
    ax.set_xticklabels(dimensions, rotation=0)

    # Display the explained variance ratios
    for i, ev in enumerate(pca.explained_variance_ratio_):
        ax.text(i - 0.40,
                ax.get_ylim()[1] + 0.05,
                'Explained Variance\n          %.4f' % (ev))

    # insert a title
    # ax.set_title('PCA Explained Variance Ratio',
    #              fontsize=16, y=1.10)

    # Return a concatenated DataFrame
    return concat([variance_ratios, components], axis=1)
Code Example #24
File: process_data.py  Project: Elina1021/autohome_ne
def process_data(result_dic, dic_ne, open_file=False):
    df = DataFrame(result_dic)
    df = df.unstack().unstack(level=-1).reset_index().rename(
        columns={'index': 'model_id'})
    columns = ['model_id']
    columns += list(dic_ne.values())
    df = df[columns]
    if open_file:
        try:
            filename = 'output.txt'
            df.to_csv(filename, sep='\t', index=False)
            os.startfile(filename)
        except Exception as e:
            print(e)
    return df
Code Example #25
    def test_unstack_to_series(self):
        # check reversibility
        data = self.frame.unstack()

        assert isinstance(data, Series)
        undo = data.unstack().T
        assert_frame_equal(undo, self.frame)

        # check NA handling
        data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
        data.index = Index(['a', 'b', 'c'])
        result = data.unstack()

        midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
                          labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)

        assert_series_equal(result, expected)

        # check composability of unstack
        old_data = data.copy()
        for _ in range(4):
            data = data.unstack()
        assert_frame_equal(old_data, data)
Code Example #26
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
        )

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
        )
        tm.assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df["A"] = df["A"].astype(np.int16)
        df["B"] = df["B"].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected["A"] = expected["A"].astype(np.int16)
        expected["B"] = expected["B"].astype(np.float64)
        tm.assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.float64)
        expected.columns = MultiIndex.from_tuples(
            [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
        )
        tm.assert_frame_equal(result, expected)
Code Example #27
File: test_reshape.py  Project: DusanMilunovic/pandas
    def test_unstack_to_series(self):
        # check reversibility
        data = self.frame.unstack()

        assert isinstance(data, Series)
        undo = data.unstack().T
        assert_frame_equal(undo, self.frame)

        # check NA handling
        data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
        data.index = Index(['a', 'b', 'c'])
        result = data.unstack()

        midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
                          codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)

        assert_series_equal(result, expected)

        # check composability of unstack
        old_data = data.copy()
        for _ in range(4):
            data = data.unstack()
        assert_frame_equal(old_data, data)
Code Example #28
File: test_graphics.py  Project: nfoti/pandas
    def test_boxplot(self):
        df = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"])
        df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
        grouped = df.groupby(by="X")
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)

        tuples = lzip(string.ascii_letters[:10], range(10))
        df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples))
        grouped = df.groupby(level=1)
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)
        grouped = df.unstack(level=1).groupby(level=0, axis=1)
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)
Code Example #29
    def pivot_data_frame(self, data_frame: pd.DataFrame,
                         pivot_dimensions: List[str],
                         transpose: bool) -> Tuple[pd.DataFrame, bool, bool]:
        """
        Pivot and transpose the data frame. Dimensions included in the `pivot_dimensions` arg will be unstacked to
        columns. If `transpose` is True the data frame will be transposed. If there is only one index level in the
        data frame (i.e. one dimension), and that dimension is pivoted, then the data frame will just be transposed.
        If there is a single metric in the data frame and at least one dimension pivoted, the metrics column level
        will be dropped for simplicity.

        :param data_frame:
            The result set data frame
        :param pivot_dimensions:
            A list of index aliases for `data_frame` of levels to shift
        :param transpose:
            A boolean true or false whether to transpose the data frame.
        :return:
            Tuple(The shifted/transposed data frame, is_pivoted, is_transposed)
        """
        is_pivoted = False
        is_transposed = False

        if not self._should_data_frame_be_transformed(
                data_frame, pivot_dimensions, transpose):
            return self.sort_data_frame(data_frame), is_pivoted, is_transposed

        # NOTE: Don't pivot a single dimension data frame. This turns the data frame into a series and pivots the
        # metrics anyway. Instead, transpose the data frame.
        should_transpose_instead_of_pivot = len(pivot_dimensions) == len(
            data_frame.index.names)

        if pivot_dimensions and not should_transpose_instead_of_pivot:
            data_frame = data_frame.unstack(level=pivot_dimensions)
            is_pivoted = True

        if transpose or should_transpose_instead_of_pivot:
            data_frame = data_frame.transpose()
            is_transposed = True

        # If the columns have multiple levels and the first (metrics) level contains a single metric, drop that level
        if isinstance(data_frame.columns, pd.MultiIndex) and 1 == len(
                data_frame.columns.levels[0]):
            data_frame.name = data_frame.columns.levels[0][
                0]  # capture the name of the metrics column
            data_frame.columns = data_frame.columns.droplevel(
                0)  # drop the metrics level

        return self.sort_data_frame(data_frame), is_pivoted, is_transposed
Code Example #30
File: test_graphics.py  Project: r0k3/pandas
    def test_boxplot(self):
        df = DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] )
        df['X'] = Series(['A','A','A','A','A','B','B','B','B','B'])
        grouped = df.groupby(by='X')
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)

        tuples = zip(list(string.ascii_letters[:10]), range(10))
        df = DataFrame(np.random.rand(10, 3),
                       index=MultiIndex.from_tuples(tuples))
        grouped = df.groupby(level=1)
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)
        grouped = df.unstack(level=1).groupby(level=0, axis=1)
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)
Code Example #31
    def test_boxplot(self):
        df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
        df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
        grouped = df.groupby(by='X')
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)

        tuples = list(zip(list(string.ascii_letters[:10]), list(range(10))))
        df = DataFrame(np.random.rand(10, 3),
                       index=MultiIndex.from_tuples(tuples))
        grouped = df.groupby(level=1)
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)
        grouped = df.unstack(level=1).groupby(level=0, axis=1)
        _check_plot_works(grouped.boxplot)
        _check_plot_works(grouped.boxplot, subplots=False)
Code Example #32
File: test_reshape.py  Project: botplex/pandas
    def test_unstack_swaplevel_sortlevel(self, level):
        # GH 20994
        mi = pd.MultiIndex.from_product([[0], ["d", "c"]],
                                        names=["bar", "baz"])
        df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"])
        df.columns.name = "foo"

        expected = DataFrame(
            [[3, 1, 2, 0]],
            columns=pd.MultiIndex.from_tuples([("c", "A"), ("c", "B"),
                                               ("d", "A"), ("d", "B")],
                                              names=["baz", "foo"]),
        )
        expected.index.name = "bar"

        result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
        tm.assert_frame_equal(result, expected)
Code Example #33
    def test_unstack_fill(self):

        # GH #9746: fill_value keyword argument for Series
        # and DataFrame unstack

        # From a series
        data = Series([1, 2, 4, 5], dtype=np.int16)
        data.index = MultiIndex.from_tuples(
            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
        )

        result = data.unstack(fill_value=-1)
        expected = DataFrame(
            {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16
        )
        tm.assert_frame_equal(result, expected)

        # From a series with incorrect data type for fill_value
        result = data.unstack(fill_value=0.5)
        expected = DataFrame(
            {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float
        )
        tm.assert_frame_equal(result, expected)

        # GH #13971: fill_value when unstacking multiple levels:
        df = DataFrame(
            {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]}
        ).set_index(["x", "y", "z"])
        unstacked = df.unstack(["x", "y"], fill_value=0)
        key = ("w", "b", "j")
        expected = unstacked[key]
        result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
        tm.assert_series_equal(result, expected)

        stacked = unstacked.stack(["x", "y"])
        stacked.index = stacked.index.reorder_levels(df.index.names)
        # Workaround for GH #17886 (unnecessarily casts to float):
        stacked = stacked.astype(np.int64)
        result = stacked.loc[df.index]
        tm.assert_frame_equal(result, df)

        # From a series
        s = df["w"]
        result = s.unstack(["x", "y"], fill_value=0)
        expected = unstacked["w"]
        tm.assert_frame_equal(result, expected)
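The GH 13971 part of this test unstacks several index levels at once. A standalone sketch of that call (assuming a recent pandas):

import pandas as pd

df = pd.DataFrame({"x": ["a", "a", "b"], "y": ["j", "k", "j"],
                   "z": [0, 1, 2], "w": [0, 1, 2]}).set_index(["x", "y", "z"])
# Both "x" and "y" move into the columns; combinations that never occur
# (e.g. x="b", y="k") are filled with 0 instead of NaN.
print(df.unstack(["x", "y"], fill_value=0))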
Code Example #34
File: test_reshape.py  Project: botplex/pandas
    def test_unstack_mixed_extension_types(self, level):
        index = pd.MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 1)],
                                          names=["a", "b"])
        df = DataFrame(
            {
                "A": pd.core.arrays.integer_array([0, 1, None]),
                "B": pd.Categorical(["a", "a", "b"]),
            },
            index=index,
        )

        result = df.unstack(level=level)
        expected = df.astype(object).unstack(level=level)

        expected_dtypes = Series([df.A.dtype] * 2 + [df.B.dtype] * 2,
                                 index=result.columns)
        tm.assert_series_equal(result.dtypes, expected_dtypes)
        tm.assert_frame_equal(result.astype(object), expected)
Code Example #35
File: test_reshape.py  Project: dmjvictory/pandas
    def test_unstack_fill(self):

        # GH #9746: fill_value keyword argument for Series
        # and DataFrame unstack

        # From a series
        data = Series([1, 2, 4, 5], dtype=np.int16)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack(fill_value=-1)
        expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
                             index=['x', 'y', 'z'], dtype=np.int16)
        assert_frame_equal(result, expected)

        # From a series with incorrect data type for fill_value
        result = data.unstack(fill_value=0.5)
        expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
                             index=['x', 'y', 'z'], dtype=np.float64)
        assert_frame_equal(result, expected)

        # GH #13971: fill_value when unstacking multiple levels:
        df = DataFrame({'x': ['a', 'a', 'b'],
                        'y': ['j', 'k', 'j'],
                        'z': [0, 1, 2],
                        'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
        unstacked = df.unstack(['x', 'y'], fill_value=0)
        key = ('w', 'b', 'j')
        expected = unstacked[key]
        result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
        assert_series_equal(result, expected)

        stacked = unstacked.stack(['x', 'y'])
        stacked.index = stacked.index.reorder_levels(df.index.names)
        # Workaround for GH #17886 (unnecessarily casts to float):
        stacked = stacked.astype(np.int64)
        result = stacked.loc[df.index]
        assert_frame_equal(result, df)

        # From a series
        s = df['w']
        result = s.unstack(['x', 'y'], fill_value=0)
        expected = unstacked['w']
        assert_frame_equal(result, expected)
Code Example #36
    def test_unstack_fill(self):

        # GH #9746: fill_value keyword argument for Series
        # and DataFrame unstack

        # From a series
        data = Series([1, 2, 4, 5], dtype=np.int16)
        data.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = data.unstack(fill_value=-1)
        expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
                             index=['x', 'y', 'z'], dtype=np.int16)
        assert_frame_equal(result, expected)

        # From a series with incorrect data type for fill_value
        result = data.unstack(fill_value=0.5)
        expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
                             index=['x', 'y', 'z'], dtype=np.float64)
        assert_frame_equal(result, expected)

        # GH #13971: fill_value when unstacking multiple levels:
        df = DataFrame({'x': ['a', 'a', 'b'],
                        'y': ['j', 'k', 'j'],
                        'z': [0, 1, 2],
                        'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
        unstacked = df.unstack(['x', 'y'], fill_value=0)
        key = ('w', 'b', 'j')
        expected = unstacked[key]
        result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
        assert_series_equal(result, expected)

        stacked = unstacked.stack(['x', 'y'])
        stacked.index = stacked.index.reorder_levels(df.index.names)
        # Workaround for GH #17886 (unnecessarily casts to float):
        stacked = stacked.astype(np.int64)
        result = stacked.loc[df.index]
        assert_frame_equal(result, df)

        # From a series
        s = df['w']
        result = s.unstack(['x', 'y'], fill_value=0)
        expected = unstacked['w']
        assert_frame_equal(result, expected)
Code Example #37
def test_info_memory_usage_bug_on_multiindex():
    # GH 14308
    # memory usage introspection should not materialize .values

    def memory_usage(f):
        return f.memory_usage(deep=True).sum()

    N = 100
    M = len(uppercase)
    index = MultiIndex.from_product(
        [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"]
    )
    df = DataFrame({"value": np.random.randn(N * M)}, index=index)

    unstacked = df.unstack("id")
    assert df.values.nbytes == unstacked.values.nbytes
    assert memory_usage(df) > memory_usage(unstacked)

    # high upper bound
    assert memory_usage(unstacked) - memory_usage(df) < 2000
Code Example #38
File: test_reshape.py  Project: botplex/pandas
 def test_unstack_long_index(self):
      # GH 32624: error when unstacking with a large number of index levels.
      # The error occurred only when many levels were used.
     df = DataFrame(
         [[1]],
         columns=pd.MultiIndex.from_tuples([[0]], names=["c1"]),
         index=pd.MultiIndex.from_tuples(
             [[0, 0, 1, 0, 0, 0, 1]],
             names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"],
         ),
     )
     result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
     expected = DataFrame(
         [[1]],
         columns=pd.MultiIndex.from_tuples(
             [[0, 0, 1, 0, 0, 0, 1]],
             names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
         ),
         index=pd.Index([0], name="i1"),
     )
     tm.assert_frame_equal(result, expected)
Code Example #39
File: helpers.py  Project: angoladb/gtfs_kit
def unstack_time_series(time_series: pd.DataFrame) -> pd.DataFrame:
    """
    Given a route, stop, or feed time series of the form output by the functions
    :func:`compute_stop_time_series`, :func:`compute_route_time_series`, or
    :func:`compute_feed_time_series`, respectively, unstack it to return a DataFrame
    with the columns:

    - ``"datetime"``
    - the columns ``time_series.columns.names``
    - ``"value"``: value at the datetime and other columns

    """
    col_names = time_series.columns.names
    return (time_series.unstack().pipe(
        pd.DataFrame).reset_index().rename(columns={
            0: "value",
            "level_2": "datetime"
        })
            # Reorder columns
            .filter(["datetime"] + col_names +
                    ["value"]).sort_values(["datetime"] + col_names))
Code Example #40
def test_unstack_with_missing_int_cast_to_float():
    # https://github.com/pandas-dev/pandas/issues/37115
    df = DataFrame(
        {"a": ["A", "A", "B"], "b": ["ca", "cb", "cb"], "v": [10] * 3}
    ).set_index(["a", "b"])

    # add another int column to get 2 blocks
    df["is_"] = 1
    assert len(df._mgr.blocks) == 2

    result = df.unstack("b")
    result[("is_", "ca")] = result[("is_", "ca")].fillna(0)

    expected = DataFrame(
        [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]],
        index=Index(["A", "B"], dtype="object", name="a"),
        columns=MultiIndex.from_tuples(
            [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")], names=[None, "b"],
        ),
    )
    tm.assert_frame_equal(result, expected)
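The test above pins down the dtype behaviour described in GH 37115. A shorter sketch of the underlying cast (assuming a recent pandas):

import pandas as pd

df = pd.DataFrame({"a": ["A", "A", "B"], "b": ["ca", "cb", "cb"],
                   "v": [10, 10, 10]}).set_index(["a", "b"])
# ("B", "ca") is missing, so the integer column is cast to float64 to hold NaN.
print(df.unstack("b").dtypes)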
Code Example #41
File: models.py  Project: Midnighter/pyorganism
    def load_frame(cls, session, experiment):
        """
        Load part of the table into a well-formatted pandas.DataFrame.

        session can be any object with the execute method.
        """
        table = cls.__table__
        stmt = select([table.c.feature, table.c.point, table.c.level]).where(
                table.c.experiment_id == experiment.id)
        query = session.execute(stmt)
        df = DataFrame(iter(query), columns=query.keys())
        df.set_index(["feature", "point"], inplace=True)
        series = df.unstack()
        series.columns = series.columns.droplevel()
        # time points can become unsorted in database, sort them
        series = series.reindex_axis(
                series.columns[ argsort(series.columns.astype(int).values)],
                axis=1, copy=False)
        if experiment.knockouts is not None:
            series.loc[[ko.feature for ko in experiment.knockouts]] =  nan
        return series
Code Example #42
def cluster_columns(colsim: DataFrame,
                    clus: AgglomerativeClustering,
                    pi=None) -> Dict[int, int]:
    """Cluster columns from different tables together within a cluster of tables

    Column similarities within one table are set to 0 to prevent different columns
    within one table from linking.

    Args:
        colsim: Dataframe of column similarities
        clus: Agglomerative clustering method
        pi: Partition information (for debugging)
    
    Returns:
        ``{column index: partition column index}``
    """
    # Don't allow different columns within one table to link
    colsim = colsim[(colsim["ti1"] != colsim["ti2"]) |
                    (colsim["ci1"] == colsim["ci2"])]
    colsim = colsim.set_index(["ci1", "ci2"])[0]
    colsim = colsim[~colsim.index.duplicated()]

    # Make symmetric distance matrix
    d = 1 - colsim.unstack().sort_index(0).sort_index(1).fillna(0)
    d = pd.DataFrame(np.minimum(d, d.T))

    log.debug(f"Clustering {d.shape} column similarities")
    try:
        partcols = clus.fit_predict(d)
    except:
        partcols = range(len(d.index))

    # Sort cluster columns by frequency
    partcol_rank = {  # type: ignore
        pci: r
        for r, (pci, _) in enumerate(Counter(partcols).most_common())
    }
    partcols = [partcol_rank[pci] for pci in partcols]

    return dict(zip(d.index, partcols))
Code Example #43
File: test_repr_info.py  Project: MattRijk/pandas
    def test_info_memory_usage_bug_on_multiindex(self):
        # GH 14308
        # memory usage introspection should not materialize .values

        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product(
            [list(uppercase), pd.date_range("20160101", periods=N)], names=["id", "date"]
        )
        df = DataFrame({"value": np.random.randn(N * M)}, index=index)

        unstacked = df.unstack("id")
        self.assertEqual(df.values.nbytes, unstacked.values.nbytes)
        self.assertTrue(memory_usage(df) > memory_usage(unstacked))

        # high upper bound
        self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000)
Code Example #44
File: utils.py  Project: amv213/pycotech
def to_pico_stream(df: pd.DataFrame) -> pd.DataFrame:
    """Flattens a PicoLog PLW Player data dataframe to a virtual
    data-stream - simulating sequential acquisition of data channel-by-channel.

    For an input dataframe of shape (num_samples, num_channels), the output
    data-stream will have length num_samples x num_channels.

    Args:
        df: PicoLog PLW Player dataframe, where each row has temperature
            measurements across PicoLogger acquisition channels.

    Returns:
        Equivalent flattened data-stream dataframe, where each row has a
        temperature measurement from a single PicoLogger acquisition channel.

        index:      None (enumeration of entries)
        columns:    `channel`, `temp`
    """

    channels_order = df.columns.values

    # Melt the dataframe and rename columns
    df = df.unstack().reset_index()
    df.columns = ['channel', 'Time', 'temp']

    # attach a categorical ordered data type to channel values so that they
    # maintain the acquisition order when sorting
    t = pd.CategoricalDtype(categories=channels_order, ordered=True)
    df['channel'] = pd.Series(df['channel'], dtype=t)

    # Sort values by time and channel order
    df.sort_values(['Time', 'channel'], inplace=True)

    # Reset `Time` index to give each entry virtual time id
    df.set_index('Time', inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df
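A hedged usage sketch for `to_pico_stream()` above; the wide frame below is a stand-in for a PLW Player export with one column per acquisition channel:

import pandas as pd

wide = pd.DataFrame({"CH1": [20.1, 20.2], "CH2": [21.0, 21.1]},
                    index=pd.Index([0, 1], name="Time"))
stream = to_pico_stream(wide)
# Rows now alternate CH1, CH2 for each time step; columns are `channel`, `temp`.
print(stream)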
Code Example #45
File: reshape.py  Project: payman21/cjworkbench
def long_to_wide(table: pd.DataFrame, keycolnames: List[str],
                 varcolname: str) -> pd.DataFrame:
    varcol = table[varcolname]
    if varcol.dtype != object and not hasattr(varcol, 'cat'):
        error = (
            'Column "%s" was auto-converted to Text because column names must '
            'be text.' % varcolname)
        quick_fixes = [{
            'text': 'Convert "%s" to text' % varcolname,
            'action': 'prependModule',
            'args': ['converttotext', {
                'colnames': varcolname
            }],
        }]
        na = varcol.isnull()
        varcol = varcol.astype(str)
        varcol[na] = np.nan
        table[varcolname] = varcol
    else:
        error = None
        quick_fixes = None

    table.set_index(keycolnames + [varcolname], inplace=True, drop=True)
    if np.any(table.index.duplicated()):
        return 'Cannot reshape: some variables are repeated'

    table = table.unstack()
    table.columns = [col[-1] for col in table.columns.values]
    table.reset_index(inplace=True)

    if error is not None:
        return {
            'dataframe': table,
            'error': error,
            'quick_fixes': quick_fixes,
        }
    else:
        return table
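A hedged usage sketch for `long_to_wide()` above, with a hypothetical long table holding one row per (id, variable) pair:

import pandas as pd

long_table = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "variable": ["height", "weight", "height", "weight"],
    "value": [170, 65, 180, 80],
})
wide = long_to_wide(long_table, keycolnames=["id"], varcolname="variable")
print(wide)   # columns: id, height, weight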
Code Example #46
File: test_reshape.py  Project: botplex/pandas
    def test_unstack_tuplename_in_multiindex(self):
        # GH 19966
        idx = pd.MultiIndex.from_product([["a", "b", "c"], [1, 2, 3]],
                                         names=[("A", "a"), ("B", "b")])
        df = DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx)
        result = df.unstack(("A", "a"))

        expected = DataFrame(
            [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]],
            columns=pd.MultiIndex.from_tuples(
                [
                    ("d", "a"),
                    ("d", "b"),
                    ("d", "c"),
                    ("e", "a"),
                    ("e", "b"),
                    ("e", "c"),
                ],
                names=[None, ("A", "a")],
            ),
            index=pd.Index([1, 2, 3], name=("B", "b")),
        )
        tm.assert_frame_equal(result, expected)
Code Example #47
File: test_repr_info.py  Project: pydata/pandas
    def test_info_memory_usage_bug_on_multiindex(self):
        # GH 14308
        # memory usage introspection should not materialize .values

        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product([list(uppercase),
                                            pd.date_range('20160101',
                                                          periods=N)],
                                           names=['id', 'date'])
        df = DataFrame({'value': np.random.randn(N * M)}, index=index)

        unstacked = df.unstack('id')
        assert df.values.nbytes == unstacked.values.nbytes
        assert memory_usage(df) > memory_usage(unstacked)

        # high upper bound
        assert memory_usage(unstacked) - memory_usage(df) < 2000
Code Example #48
File: test_repr_info.py  Project: zycjss/pandas
    def test_info_memory_usage_bug_on_multiindex(self):
        # GH 14308
        # memory usage introspection should not materialize .values

        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product(
            [list(uppercase),
             pd.date_range('20160101', periods=N)],
            names=['id', 'date'])
        df = DataFrame({'value': np.random.randn(N * M)}, index=index)

        unstacked = df.unstack('id')
        self.assertEqual(df.values.nbytes, unstacked.values.nbytes)
        self.assertTrue(memory_usage(df) > memory_usage(unstacked))

        # high upper bound
        self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000)
Code Example #49
def plot_avg_df(df: pd.DataFrame, paper_mode: bool = False, color_group_name=None, horizontal: bool = True):
    grouping = ["Scoring", "Attribute"]
    coloring = ["Attribute"]

    legend_labels = df[coloring].drop_duplicates().iloc[::-1]
    errors = df[["STD"]]
    df = df.set_index(grouping)
    fig_size, font_size, legend_size, grid, title, label_rot, x_al = plot_style(paper_mode,
                                                                                title=f"{color_group_name} Comparison",
                                                                                horizontal=horizontal)
    unstacked_df = df.unstack()
    if horizontal:
        ax = unstacked_df["Mean"].plot.barh(title=title, grid=grid, alpha=0.85, rot=label_rot, figsize=fig_size,
                                            xerr=unstacked_df["STD"])
    else:
        ax = unstacked_df["Mean"].plot(kind="bar", title=title, grid=grid, alpha=0.85, rot=label_rot, figsize=fig_size,
                                       yerr=unstacked_df["STD"])
    axis_modification(ax, x_label="Benchmark", y_label=f"Benchmark Score", x_al=x_al, font_size=font_size,
                      horizontal=horizontal)

    legend(ax=ax, paper_mode=paper_mode, legend_labels=legend_labels, title=color_group_name, legend_size=legend_size,
           horizontal=horizontal)

    plt.show()
Code Example #50
File: reshape.py  Project: ykovarskaya/cjworkbench
def long_to_wide(
    table: pd.DataFrame, keycolnames: List[str], varcolname: str
) -> pd.DataFrame:
    varcol = table[varcolname]
    if varcol.dtype != object and not hasattr(varcol, "cat"):
        error = (
            'Column "%s" was auto-converted to Text because column names must '
            "be text." % varcolname
        )
        quick_fixes = [
            {
                "text": 'Convert "%s" to text' % varcolname,
                "action": "prependModule",
                "args": ["converttotext", {"colnames": varcolname}],
            }
        ]
        na = varcol.isnull()
        varcol = varcol.astype(str)
        varcol[na] = np.nan
        table[varcolname] = varcol
    else:
        error = None
        quick_fixes = None

    table.set_index(keycolnames + [varcolname], inplace=True, drop=True)
    if np.any(table.index.duplicated()):
        return "Cannot reshape: some variables are repeated"

    table = table.unstack()
    table.columns = [col[-1] for col in table.columns.values]
    table.reset_index(inplace=True)

    if error is not None:
        return {"dataframe": table, "error": error, "quick_fixes": quick_fixes}
    else:
        return table
Code Example #51
    def transform_dataframe(self, dataframe):
        """
        Use matplotlib to compute boxplot statistics on e.g. timeseries data.
        """
        grouping = self.get_grouping(dataframe)
        group_field = self.get_group_field()
        header_fields = self.get_header_fields()

        if "series" in grouping:
            # Unstack so each series is a column
            for i in range(len(header_fields) + 1):
                dataframe = dataframe.unstack()

        groups = {
            col: dataframe[col]
            for col in dataframe.columns
        }

        if "year" in grouping:
            interval = "year"
        elif "month" in grouping:
            interval = "month"
        else:
            interval = None

        # Compute stats for each column, potentially grouped by year
        all_stats = []
        for header, series in groups.items():
            if interval:
                series_stats = self.boxplots_for_interval(series, interval)
            else:
                interval = None
                series_stats = [self.compute_boxplot(series)]

            series_infos = []
            for series_stat in series_stats:
                series_info = {}
                if isinstance(header, tuple):
                    value_name = header[0]
                    col_values = header[1:]
                else:
                    value_name = header
                    col_values = []
                col_names = zip(dataframe.columns.names[1:], col_values)
                for col_name, value in col_names:
                    series_info[col_name] = value
                for stat_name, val in series_stat.items():
                    if stat_name == interval:
                        series_info[stat_name] = val
                    else:
                        series_info[value_name + '-' + stat_name] = val
                series_infos.append(series_info)
            all_stats += series_infos

        dataframe = DataFrame(all_stats)
        if 'series' in grouping:
            index = header_fields + [group_field]
            unstack = len(header_fields)
            if interval:
                index = [interval] + index
                unstack += 1
        else:
            index = [interval]
            unstack = 0

        dataframe.set_index(index, inplace=True)
        dataframe.columns.name = ''
        for i in range(unstack):
            dataframe = dataframe.unstack()

        # Remove blank columns
        dataframe = dataframe.dropna(axis=1, how='all')
        return dataframe
Code Example #52
File: test_multilevel.py  Project: ashokez/pandas
class TestMultiLevel(unittest.TestCase):

    def setUp(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        a, b = self.frame[:5], self.frame[5:]

        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)

        result = a['A'].append(b['A'])
        tm.assert_series_equal(result, self.frame['A'])

    def test_reindex_level(self):
        # axis=0
        month_sums = self.ymd.sum(level='month')
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum)

        assert_frame_equal(result, expected)

        # Series
        result = month_sums['A'].reindex(self.ymd.index, level=1)
        expected = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(result, expected)

        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level='month')
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level='month')
            result = op(self.ymd, month_sums, level='month')
            broadcasted = self.ymd.groupby(level='month').transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)

            # Series
            op = getattr(Series, opname)
            result = op(self.ymd['A'], month_sums['A'], level='month')
            broadcasted = self.ymd['A'].groupby(level='month').transform(np.sum)
            expected = op(self.ymd['A'], broadcasted)
            assert_series_equal(result, expected)

        _check_op('sub')
        _check_op('add')
        _check_op('mul')
        _check_op('div')

    def test_pickle(self):
        import cPickle
        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)

        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)

        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T

        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        s = self.ymd['A']

        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        expected = s[42:65]
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s[49:51]
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_setitem(self):
        s = self.ymd['A']

        s[2000, 3] = np.nan
        self.assert_(isnull(s[42:65]).all())
        self.assert_(notnull(s[:42]).all())
        self.assert_(notnull(s[65:]).all())

        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_xs(self):
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]

        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_fancy_2d(self):
        result = self.frame.ix['foo', 'B']
        expected = self.frame.xs('foo')['B']
        assert_series_equal(result, expected)

        ft = self.frame.T
        result = ft.ix['B', 'foo']
        expected = ft.xs('B')['foo']
        assert_series_equal(result, expected)

    def test_get_loc_single_level(self):
        s = Series(np.random.randn(len(self.single_level)),
                   index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        df = self.frame.T

        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)

        result = df['bar']
        result2 = df.ix[:, 'bar']

        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_slice_integers(self):
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2],
                                   [0, 1, 0, 1, 0, 1]])

        frame = DataFrame(np.random.randn(len(index), 4), index=index,
                          columns=['a', 'b', 'c', 'd'])
        res = frame.ix[1:2]
        exp = frame[2:]
        assert_frame_equal(res, exp)

        series = Series(np.random.randn(len(index)), index=index)

        res = series.ix[1:2]
        exp = series[2:]
        assert_series_equal(res, exp)

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)

        frame = DataFrame(np.random.randn(6, 2), index=index)

        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)

        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]

        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        df = self.frame.sortlevel(1).T

        # buglet with int typechecking
        result = df.ix[:, :np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5)

        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEquals(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)

        result = self.ymd.ix[(2000,2):(2000,4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.reset_index()['A'].sortlevel)

        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        tuples = list(cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1]))
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_sortlevel_by_name(self):
        self.frame.index.names = ['first', 'second']
        result = self.frame.sortlevel(level='second')
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        sorted_before = self.frame.sortlevel(1)

        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1))

        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'

        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype('i8')
                assert_frame_equal(result, expected)

        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

        self.frame['D'] = 'foo'
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2],
                                   [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        s = self.frame['A'][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)

        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
                             columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        assert_frame_equal(result, expected)

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        df = DataFrame({'state': ['naive','naive','naive',
                                  'activ','activ','activ'],
                        'exp':['a','b','b','b','a','a'],
                        'barcode':[1,2,3,4,1,3],
                        'v':['hi','hi','bye','bye','bye','peace'],
                        'extra': np.arange(6.)})

        result = df.groupby(['state','exp','barcode','v']).apply(len)
        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked,
                            result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, 'first')
        self.assertEquals(unstacked.columns.names, ['exp', 'second'])

        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        result = self.frame.unstack('second')
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack('second')
        result = unstacked.stack('exp')
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)

        result = self.frame.stack('exp')
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names,
                          expected.columns.names)

        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])

        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)

        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(unstacked, expected)

    def test_groupby_transform(self):
        s = self.frame['A']
        grouper = s.index.get_level_values(0)

        grouped = s.groupby(grouper)

        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_join(self):
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]

        joined = a.join(b, how='outer').reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        self.assert_(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)

        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)

        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        panel = Panel({'ItemA' : self.frame,
                       'ItemB' : self.frame * 2})

        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        result = self.ymd.reorder_levels(['month', 'day', 'year'])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)

        result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)

        result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(result, expected)

        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          [1, 2, 3])

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        x = Series(data=[1,2,3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B",3)]))

        y = Series(data=[4,5,6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B",3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())

        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1],
                                   [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df['foo', 'four'] = 'foo'

        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]

        result = df['foo']
        result2 = df.ix[:, 'foo']
        expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

        df = df.T
        result = df.xs('foo')
        result2 = df.ix['foo']
        expected = df.reindex(df.index[arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)

        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]

        result = s['qux']
        result2 = s.ix['qux']
        expected = s[arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var']

    def test_series_group_min_max(self):
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS,
                                              range(2),
                                              [False, True]):
            grouped = self.series.groupby(level=level)
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna=True
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                    range(2), range(2),
                                                    [False, True]):
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T

            grouped = frame.groupby(level=level, axis=axis)

            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level, axis=axis,
                                           skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(leftside._get_axis(axis).equals(level_index))
            self.assert_(rightside._get_axis(axis).equals(level_index))

            assert_frame_equal(leftside, rightside)

    def test_frame_series_agg_multiple_levels(self):
        result = self.ymd.sum(level=['year', 'month'])
        expected = self.ymd.groupby(level=['year', 'month']).sum()
        assert_frame_equal(result, expected)

        result = self.ymd['A'].sum(level=['year', 'month'])
        expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
        assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        result = self.ymd.groupby(level=[0, 1]).mean()

        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        assert_frame_equal(result, expected)
        self.assertEquals(result.index.names, self.ymd.index.names[:2])

        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        pass

    def test_multilevel_consolidate(self):
        index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                        ('bar', 'one'), ('bar', 'two')])
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df['Totals', ''] = df.sum(1)
        df = df.consolidate()

    def test_ix_preserve_names(self):
        result = self.ymd.ix[2000]
        result2 = self.ymd['A'].ix[2000]
        self.assertEquals(result.index.names, self.ymd.index.names[1:])
        self.assertEquals(result2.index.names, self.ymd.index.names[1:])

        result = self.ymd.ix[2000, 2]
        result2 = self.ymd['A'].ix[2000, 2]
        self.assertEquals(result.index.name, self.ymd.index.names[2])
        self.assertEquals(result2.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        unstacked = self.ymd.unstack('month')
        self.assert_(unstacked['A', 1].dtype == np.float64)
        self.assert_(unstacked['E', 1].dtype == np.object_)
        self.assert_(unstacked['F', 1].dtype == np.float64)

    def test_partial_ix_missing(self):
        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here

        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    def test_to_html(self):
        self.ymd.columns.name = 'foo'
        self.ymd.to_html()
        self.ymd.T.to_html()
コード例 #53
0

# casestovars / varstocases (SPSS-style reshape between wide and long)

d = {'one':[1,1],'two':[2,2]}
i = ['a','b']

# Create dataframe
df = DataFrame(data = d, index = i)
df

# varstocases
df.stack()

# casestovars
df.unstack()



#aggregate
d = {'one':[1,1,1,1,1],'two':[2,2,2,2,2],'letter':['a','a','b','b','c']}

# Create dataframe
df = DataFrame(d)
df

one = df.groupby('letter')

# Apply sum function
one.sum()
letterone = df.groupby(['letter','one']).sum()
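
# A small follow-up sketch (not part of the original example): unstack() moves the
# inner group key ('one') from the rows back into the columns -- the "casestovars"
# direction for the aggregated result.
letterone.unstack()
letterone.unstack(fill_value=0)  # fill_value fills missing combinations (none in this tiny example)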
コード例 #54
0
ファイル: ch07-5.py プロジェクト: wwxFromTju/Python_learn_way
print data

# Use the stack method to pivot the columns into the rows, producing a Series
result = data.stack()
print result

# For a hierarchically indexed Series, unstack rearranges it back into a DataFrame
# By default the innermost level is unstacked
print result.unstack()
# Pass a level number or name to unstack a different level
print result.unstack(0)
print result.unstack('state')

# If not all of the level values are found in each group, unstack introduces missing values
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print data2
print data2.unstack()

# stack filters out missing data by default, so the operation is invertible
print data2.unstack().stack()
print data2.unstack().stack(dropna=False)

# When unstacking a DataFrame, the rotated level becomes the innermost level of the result
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
print df.unstack('state')
print df.unstack('state').stack('side')
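
# A hedged extra (assumes pandas >= 0.18; not part of the original file): the missing
# values that unstack introduces above can also be filled at unstack time.
filled = data2.unstack(fill_value=0)  # zeros where unstack would otherwise put NaN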

コード例 #55
0
result.unstack(0)
result.unstack('state')
s
s1
s1 = Series([0,1,2,3],index=['a','b','c','d'])
s2=Series([4,5,6],index=['c','d','e'])
data2 = pd.concat([s1,s2],keys=['one','two'])
data2
data2.unstack()
data2.unstack().stack()
data2.unstack().stack(dropna=False)
df = DataFrame({'left': result,'right': result+5},columns = pd.Index(['left','right'],name='side'))
df
df.unstack('state')
df.unstack('side')
df.unstack('state').stack('side')
data = DataFrame({'k1':['one'] * 3 + ['two'] * 4, 'k2':[1,1,2,3,3,4,4]})
data
data.duplicated()
data.drop_duplicates()
data['v1'] = range(7)
data.drop_duplicates(['k1'])
data.drop_duplicates(['k1','k2'],keep='last')
data = DataFrame({'food':['bacon','pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami','honey ham','nova lox'],'ounces':[4,3,12,6,7.5,8,3,5,6]})
コード例 #56
0
ファイル: pydata.py プロジェクト: kindlychung/pyQCDH
    """
    :param obj:
    :type obj: pandas.core.frame.DataFrame
    :return:
    :rtype: pandas.core.frame.DataFrame
    """
    return df3.corrwith(obj)
df1.apply(corr_df3)
df1.apply(lambda x: df3.corrwith(x))
df3.apply(lambda x: df1.corrwith(x))

df3.index
df3
df3.ix["b", ].ix[1:, ]
df3.ix["a":"b",].ix[1:, ]
df3.unstack().unstack()

frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame
frame.swaplevel("key1", "key2")
frame
frame.sortlevel(0)
frame
frame.sum(level="key1")
コード例 #57
0
# 9.3 Slicing a DataFrame with a multi-level index
df7 = DataFrame(np.random.randint(0, 150, size=(8, 2)),
                index=pd.MultiIndex.from_product([list("abcd"), ["期中", "期末"]]),
                columns=["Python", "高数"])
df7.loc['a':'c']
df7['a':'c']
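# A hedged extra built on the df7 above: slicing on the inner index level needs
# pd.IndexSlice; from_product indexes are lexsorted, so this works directly.
df7.loc[pd.IndexSlice['a':'c', '期末'], :]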
'''10 Stacking the index (stack)'''
'''stack() tip: whichever level you pass to stack() disappears from the columns and shows up in the rows.
   unstack() does the opposite.'''
# 10.1 Stacking the index (stack) --- move a column index level into the row index
df5.stack(level=0)  # the first column level disappears and shows up in the rows
df5.stack(level=1)  # the second column level disappears and shows up in the rows
df5.stack(level=2)  # the third column level disappears and shows up in the rows

# 10.2 Un-stacking --- the nth row level disappears and shows up in the columns
df4.unstack(level=0)  # the first row level disappears and shows up in the columns
df4.unstack(level=1)  # the second row level disappears and shows up in the columns
df4.unstack(level=2)  # the third row level disappears and shows up in the columns

# 10.3 unstack --- move a row index level into the column index
df7.unstack()  # move the innermost row level into the columns
df7.unstack(level=0)  # move the first row level into the columns
df7.unstack(level=1)  # move the second row level into the columns
df7.unstack(level=-1)  # move the innermost row level into the columns

# Aggregation
'''Tip: unlike unstack(), for aggregation the axis you pass is the axis that gets computed over.'''
df3.mean()
df3.mean(axis=0)  # mean of each column
df3.mean(axis=1)  # mean of each row
df3.mean(axis='index')  # mean of each column
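
# df4/df5 are not defined in this snippet; a minimal stand-in for df5 (my assumption,
# not from the original) is a frame with a three-level column MultiIndex, reusing the
# pd/np/DataFrame names used above, so the stack(level=...) lines could be run:
cols = pd.MultiIndex.from_product([["期中", "期末"], ["Python", "高数"], ["A", "B"]])
df5 = DataFrame(np.random.randint(0, 150, size=(4, 8)), index=list("abcd"), columns=cols)
df5.stack(level=0)  # the first column level moves into the rows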
コード例 #58
0
test_auc = [
    lr_auc_test, svc_auc_test, rf_auc_test, knn_auc_test, gaussian_auc_test
]
models = DataFrame({
    'Training Accuracy': train_acc,
    'Testing Accuracy': test_acc,
    "Cross-Validation Accuracy": cross_val_acc,
    'Training AUC': train_auc,
    'Testing AUC': test_auc
})
models.index = [
    'Logistic Regression', 'Support Vector Machines ', 'Random Forests',
    'K-Nearest Neighbors', 'Gaussian Naive Bayes'
]
models

models1 = DataFrame({'Accuracy': models.unstack()}).reset_index()
# plot accuracies
plt.figure(figsize=(8, 7))
fig_models = sns.barplot(x='level_0',
                         y='Accuracy',
                         hue='level_1',
                         data=models1)
fig_models.set(xlabel='Accuracy Metric', ylabel='Accuracy')
fig_models.set_title('The Accuracy of All Models Over Five Metrics')
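
# A hedged alternative (assuming the same `models` frame as above): name the unstacked
# levels explicitly instead of relying on the default level_0/level_1 columns.
models_long = (models.unstack()
                     .rename_axis(['Metric', 'Model'])
                     .reset_index(name='Accuracy'))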

x = zip(X_train.columns, np.transpose(logreg.coef_))
x1 = pd.DataFrame(list(x))
x1.head()

# get Correlation Coefficient for each feature using Logistic Regression
logreg_df = pd.DataFrame(list(zip(X_train.columns,
コード例 #59
0
#encoding:utf-8
from pandas import Series,DataFrame
a=[['刘玄德','男','语文',98.],['刘玄德','男','体育',60.],['关云长','男','数学',60.],['张飞','女','语文',100.],['关云长','男','语文',100.]]
af=DataFrame(a,columns=['name','sex','course','score'])
af=af.sort(['name'])
print af
af.set_index(['name','sex','course'],inplace=True)
print af
t1=af.unstack(level=2)
print t1
t2=t1.mean(axis=1,skipna=True)
t1['平均分']=t2  # add an 'average score' column
t1 = t1.fillna(0)  # fillna returns a new frame, so keep the result