Example #1
    def test_pivot_table_dropna(self):
        """With dropna=False, all 12 customer/product pairs must appear on the pivoted axis."""
        records = {
            'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
            'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
            'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
            'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
            'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
        }
        df = DataFrame(records)

        pair_keys = ('customer', 'product')
        # Same pivot twice: once with the pair on the columns, once on the index.
        pv_col = df.pivot_table('quantity', 'month', list(pair_keys),
                                dropna=False)
        pv_ind = df.pivot_table('quantity', list(pair_keys), 'month',
                                dropna=False)

        # Full cartesian product A..C x a..d, in lexicographic order.
        every_pair = [(u(customer), u(product))
                      for customer in ('A', 'B', 'C')
                      for product in ('a', 'b', 'c', 'd')]
        expected = MultiIndex.from_tuples(every_pair)

        assert_equal(pv_col.columns.values, expected.values)
        assert_equal(pv_ind.index.values, expected.values)
Example #2
    def test_to_html_regression_GH6098(self):
        """Smoke test: the HTML repr of a pivot table with non-ASCII labels must not raise."""
        row_key = u('clé1')
        col_key = u('clé2')
        frame = DataFrame({
            row_key: [u('a'), u('a'), u('b'), u('b'), u('a')],
            col_key: [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')],
            'données1': np.random.randn(5),
            'données2': np.random.randn(5)})

        pivoted = frame.pivot_table(index=[row_key], columns=[col_key])

        # it works
        pivoted._repr_html_()
Example #3
    def test_pivot_table_nocols(self):
        df = DataFrame({"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]})
        rs = df.pivot_table(columns="cols", aggfunc=np.sum)
        xp = df.pivot_table(index="cols", aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"})
        xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T
        tm.assert_frame_equal(rs, xp)
Example #4
def test_to_html_regression_GH6098():
    """Smoke test: the HTML repr of a pivot table with non-ASCII labels must not raise."""
    row_key = 'clé1'
    col_key = 'clé2'
    frame = DataFrame({
        row_key: ['a', 'a', 'b', 'b', 'a'],
        col_key: ['1er', '2ème', '1er', '2ème', '1er'],
        'données1': np.random.randn(5),
        'données2': np.random.randn(5)})

    pivoted = frame.pivot_table(index=[row_key], columns=[col_key])

    # it works
    pivoted._repr_html_()
Example #5
    def test_pivot_table_nocols(self):
        df = DataFrame({'rows': ['a', 'b', 'c'],
                        'cols': ['x', 'y', 'z'],
                        'values': [1,2,3]})
        rs = df.pivot_table(columns='cols', aggfunc=np.sum)
        xp = df.pivot_table(index='cols', aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'})
        xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T
        tm.assert_frame_equal(rs, xp)
Example #6
def get_heatmap_data(ans: pd.DataFrame):
    """Sorts the values for the heatmap to make it more intuitive and readable
    Expects a dataframe `ans` containing processed (clean) barcode read counts data
    Expects a dataframe `flag`"""
    # NOTE(review): `flag` is not a parameter; it is built from `ans` further down.
    # Returns the tuple (hmap, data, x, y).
    # generate heatmap matrix of (logged) read counts per sample per paired read
    # NOTE(review): the middle of the pivot_table call below is missing from this
    # snippet (a positional list follows a keyword argument), so it does not parse as shown.
    hmap = (ans.pivot_table(index=["sample"],
                                [np.inf, -np.inf], np.nan).fillna(0))
    # drop unknown-unknown reads only
    hmap = hmap.drop(columns='unknown-unknown')
    # sort values to make plot more intuitive
    # 'max_idx' is the column position of each row's largest value; rows are ordered
    # by it and the helper column is dropped again.
    hmap['max_idx'] = hmap.apply(
        lambda x: hmap.columns.tolist().index(x.idxmax()), axis=1)
    hmap = hmap.sort_values('max_idx').drop(columns=['max_idx'])
    # grab read counts
    counts = hmap.values
    # prepare data for identifying potential contaminants
    # one row per sample, aggregating the forward and reverse barcode columns with
    # get_unique_barcodes (defined outside this snippet)
    flag = (ans.groupby('sample').agg(
        uniq_forward_bcodes=('forward_barcode', get_unique_barcodes),
        uniq_reverse_bcodes=('reverse_barcode', get_unique_barcodes)))

    # create boolean column that identifies potential contamination
    flag['contamination'] = flag.apply(is_contaminant, axis=1)
    # merge with original data to include the contamination flags
    # the flag is mapped to 1.0 (True) / 0.0 (otherwise)
    contaminants_flag = (hmap.join(flag, how='inner')['contamination'].apply(
        lambda x: np.where(x == True, 1.0, 0.0)))
    # add contaminant flag column to the read counts
    # NOTE(review): contaminants_flag looks like a pandas Series here; [:, np.newaxis]
    # indexing on a Series is rejected by recent pandas versions — confirm.
    data = np.hstack((counts, contaminants_flag[:, np.newaxis]))
    # list of all sample IDs
    x = hmap.index.values
    # list of all paired reads and an extra flag column for contamination
    # NOTE(review): y is taken from hmap.columns only; no extra flag label is appended here.
    y = hmap.columns.tolist()
    return hmap, data, x, y
Example #7
# Pivot `df` using ("Fraktion/Gruppe", "date", "title") as the row index.
# NOTE(review): the pivot_table call is cut off after the index argument; the remaining
# arguments and the closing parenthesis are not visible in this snippet.
def pivot_party_votes_df(df: pd.DataFrame):
    return df.pivot_table(
        index=["Fraktion/Gruppe", "date", "title"],
# Draw a seaborn bar plot of the number of rows of `df` per distinct value of `column`.
def barplot(df: pd.DataFrame, column: str, show: bool, save_location: str):
    plt.figure(figsize=(20, 20))
    # aggfunc="size" counts rows per value of `column`; the counts are sorted ascending
    df = df.pivot_table(index=[column], aggfunc="size").sort_values()
    sns.barplot(x=df.index, y=df.values)
    # NOTE(review): the body of this branch, and any use of save_location, is not
    # visible in this snippet.
    if show:
    # Pivot long-format data and pass it on to self.upload_data.
    # NOTE(review): this snippet is incomplete — the docstring delimiters, the tail of the
    # pivot_table call and the arguments of self.upload_data(...) are missing, and `self`
    # is used in the body although it is not in the visible parameter list.
    def upload_data_long_format_as_single_data_set(
        data: pd.DataFrame,
        name: str,
        cross_section_column_name: str,
        date_column_name: str,
        replace_missing_values: bool = True,
        forward_fill_missing_values: bool = False,
    ) -> IndividualDataset:
        Uploads long format data into Horizon. The data frame should have a date column, with a numeric index.

        :param data: The dataset in a pandas data frame. Must have a valid date column.
        :param name: Name of the data set to be uploaded
        :param cross_section_column_name: The identifier column that groups the records
        :param date_column_name: The column name of the date index.
        :param forward_fill_missing_values: Forward-fill missing values
        :param replace_missing_values: Replace missing values
        :return: A summary of the uploaded data set.
        :param encode_categorical_data: Categorically encode data that is non-numeric
        :param max_categories: Maximum number of categories per series.

        # NOTE(review): encode_categorical_data and max_categories are described above but
        # are not parameters of the visible signature.
        # the values of cross_section_column_name become the pivoted columns
        df = data.pivot_table(columns=cross_section_column_name,
        # flatten each multi-part column label into a single "a/b" string
        df.columns = ["/".join(column) for column in df.columns]

        return self.upload_data(
Example #10
    # Pivot the THC result data with ("session", "topic") as the row index and return it.
    # NOTE(review): lines are missing from this snippet (docstring delimiters, the `try:`,
    # the closing of the pivot_table call and the calls that wrap the message strings), so
    # the exact control flow cannot be confirmed from here.
    def __pivot_table(self, data: pd.DataFrame) -> pd.DataFrame:
        Pivot the thc result by session/topic in rows and team in columns.
            pivot_table = data.pivot_table(
                index=["session", "topic"],
                # identity aggregation: the function returns its input unchanged
                aggfunc=lambda x: x,
        # both handlers below fall back to an empty DataFrame
        except AttributeError as e:
                f"Error creating pivot table for THC result. The error was {e}."
            pivot_table = pd.DataFrame()
        except ValueError as e:
                "Error creating pivot table for THC result."
                f"Possibly the database is inconsistent. Error was {e}.")
            # TODO: This raises a ValueError: Function does not reduce in case
            # the database has non-unique entries. Can we recover from this?
            pivot_table = pd.DataFrame()
            # missing cells are replaced in place with THCResult.NoResult
            pivot_table.fillna(THCResult.NoResult, inplace=True)

        return pivot_table
Example #11
def getPivotMedian(dataFrame:pandas.DataFrame,valueColumns,indexColumns):
    """
    Return a pivot table holding the median of the value columns.

    :param dataFrame: source dataframe, or None
    :param valueColumns: column name (or list of names) to aggregate
    :param indexColumns: column name (or list of names) used as the pivot index
    :return: the pivoted dataframe of medians, or None when ``dataFrame`` is None
    """
    # None is passed straight through so callers can chain without a guard.
    if dataFrame is None:
        return None
    return dataFrame.pivot_table(
        values=valueColumns, index=indexColumns, aggfunc=numpy.median)
Example #12
    def test_type_error_multiindex(self):
        # See gh-12218
        # Checks indexing on a pivot table with MultiIndex columns: dg[:, 0] must raise
        # TypeError, while .loc with a (slice, label) tuple and a plain tuple key must work.
        # NOTE(review): the closing parentheses of the DataFrame(...) and MultiIndex(...)
        # calls are missing from this snippet.
        df = DataFrame(
            columns=["i", "c", "x", "y"],
            data=[[0, 0, 1, 2], [1, 0, 3, 4], [0, 1, 1, 2], [1, 1, 3, 4]],
        dg = df.pivot_table(index="i", columns="c", values=["x", "y"])
        # TODO: Is this test for pivot_table?
        with pytest.raises(TypeError, match="unhashable type"):
            dg[:, 0]

        # expected frame for selecting c == 0 across both value columns
        index = Index(range(2), name="i")
        columns = MultiIndex(
            levels=[["x", "y"], [0, 1]], codes=[[0, 1], [0, 0]], names=[None, "c"]
        expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index)

        result = dg.loc[:, (slice(None), 0)]
        tm.assert_frame_equal(result, expected)

        # selecting a single ("x", 0) column yields a Series named by that tuple
        name = ("x", 0)
        index = Index(range(2), name="i")
        expected = Series([1, 3], index=index, name=name)

        result = dg["x", 0]
        tm.assert_series_equal(result, expected)
Example #13
# Reshape `df` from long to wide form with pivot_table; an empty frame is returned as is
# and a missing "variable" or "value" column raises ValueError.
# NOTE(review): the docstring delimiters and the ends of several statements (the
# "variable" concatenation and the final pivot_table call) are missing from this snippet.
def _reshape_df(df: pd.DataFrame) -> pd.DataFrame:
    Reshape a DataFrame from long to wide form, adhering to the following

    - If the `meta_date` column exists, replace `variable` column with
      {variable}_{meta_date} and then drop `meta_date`
    - Construct a pivot_table where the columns come from the `variable`
      column, values come from the `value` column, and all other columns are
      used as an index
    if df.shape[0] == 0:
        # empty dataframe
        return df

    cols = list(df)
    # both columns are required by the pivot below
    for c in ["variable", "value"]:
        if c not in cols:
            gh_issues = "https://github.com/valorumdata/cmdc.py/issues/new"
            msg = (f"Column {c} not found in DataFrame. "
                   f"Please report a bug at {gh_issues}")
            raise ValueError(msg)
    if "meta_date" in cols:
        if "variable" in cols:
            df["variable"] = (df["variable"].astype(str) + "_" +
            # NOTE(review): the result of drop() is not assigned in the visible code, and
            # `cols` (used for idx below) still contains "meta_date" — confirm against
            # the full source.
            df.drop("meta_date", axis="columns")

    # every column other than "variable"/"value" goes into the pivot index; the set
    # difference does not guarantee a particular order
    idx = list(set(cols) - {"variable", "value"})
    return df.pivot_table(index=idx, columns="variable",
Example #14
    def test_drop_multiindex_not_lexsorted(self):
        # GH#11640
        # Dropping a column from a frame whose MultiIndex columns are not lexsorted must
        # emit PerformanceWarning and give the same result as on the lexsorted frame.

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples([("a", ""), ("b1", "c1"),
                                               ("b2", "c2")],
                                              names=["b", "c"])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        assert lexsorted_df.columns.is_lexsorted()

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=["a", "b", "c", "d"],
                                     data=[[1, "b1", "c1", 3],
                                           [1, "b2", "c2", 4]])
        # NOTE(review): the end of this pivot_table call (remaining arguments and the
        # closing parenthesis) is missing from this snippet.
        not_lexsorted_df = not_lexsorted_df.pivot_table(index="a",
                                                        columns=["b", "c"],
        not_lexsorted_df = not_lexsorted_df.reset_index()
        assert not not_lexsorted_df.columns.is_lexsorted()

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop("a", axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop("a", axis=1)

        tm.assert_frame_equal(result, expected)
Example #15
    # NOTE(review): several statements in this snippet are cut off (the groupby_vars list,
    # the groupby(...) and pivot_table(...) calls and the column-renaming comprehension),
    # so the aggregation actually applied cannot be confirmed from here.
    def _aggregate_proj_points_data(self,
                                    df_score: pd.DataFrame) -> pd.DataFrame:
        """ Returns the Projected Points data aggregated to the Season/League/Week level """

        # work on a copy so the caller's frame is not modified
        df_score = df_score.copy()

        # 0 for rows whose 'Pos' is 'Bench', 1 for every other row
        df_score['starter_ind'] = np.where(df_score['Pos'] == 'Bench', 0, 1)

        # Including the starter indiciator in order to include starter and bench points
        groupby_vars = [
            'season_id', 'league_id', 'Week', 'Team', 'starter_ind'
        sum_vars = ['Proj', 'Actual']
        df_score = df_score.groupby(groupby_vars,

        id_vars = ['season_id', 'league_id', 'Week', 'Team']
        df_score = df_score.pivot_table(index=id_vars,

        # Pivot table causes multi-dimensional column names and id_vars become the index
        # each (name, flag) column pair is renamed to "<name>_Starter" when flag == 1,
        # otherwise "<name>_Bench"
        df_score.columns = [
            '{}_{}'.format(x[0], 'Starter') if x[1] == 1 else '{}_{}'.format(
                x[0], 'Bench') for x in df_score.columns
        # move id_vars back to regular columns and clear the columns axis name
        df_score = df_score.reset_index().rename_axis(None, axis=1)

        final_df = df_score

        return final_df
Example #16
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640
        # Dropping a column from the pivoted (non-lexsorted) frame must emit
        # PerformanceWarning and give the same result as on the lexsorted frame.

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples([('a', ''), ('b1', 'c1'),
                                               ('b2', 'c2')],
                                              names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        # NOTE(review): the end of this pivot_table call (remaining arguments and the
        # closing parenthesis) is missing from this snippet.
        not_lexsorted_df = not_lexsorted_df.pivot_table(index='a',
                                                        columns=['b', 'c'],
        not_lexsorted_df = not_lexsorted_df.reset_index()

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
Example #17
# Pivot `df` on (index, column), render the result as a heat map via render_heatmap and
# optionally save the figures as PDF files.
# NOTE(review): the closing "):" of the signature is missing from this snippet.
def _plot_and_save(
    df: pd.DataFrame,
    index: str,
    column: str,
    index_label: str,
    column_label: str,
    values: str = 'ret',
    add_sep_colorbar: bool = True,
    norm: colors.Normalize = None,
    save_figure: bool = False,
    save_dir: str = None
    # nothing is done when either column is absent from df
    if index in df.columns and column in df.columns:
        # Pivot table (by default averages over identical index / columns cells)
        df_pivot = df.pivot_table(index=index, columns=column, values=values)

        # Generate the plot
        fig_hm, fig_cb = render_heatmap(df_pivot, add_sep_colorbar=add_sep_colorbar, norm=norm,
                                        y_label=index_label, x_label=column_label)

        # Save heat map and color bar if desired
        if save_figure:
            # file names are 'hm-<index>-<column>.pdf' and 'cb-<index>-<column>.pdf'
            # NOTE(review): save_dir defaults to None; osp.join(None, ...) would raise, so
            # callers presumably pass save_dir whenever save_figure is True — confirm.
            name = '-'.join([index, column])
            fig_hm.savefig(osp.join(save_dir, f'hm-{name}.pdf'))
            if fig_cb is not None:
                fig_cb.savefig(osp.join(save_dir, f'cb-{name}.pdf'))
Example #18
# Read a 4-column CSV, build a DataFrame from it and return (df2, MV_result, GroundTruth).
# NOTE(review): the bodies of both loops and the tails of the `mcount` and pivot_table
# statements are missing from this snippet, so how datacols and MV_result are filled
# cannot be confirmed from here.
def Process(n_states, filepath):
    # NOTE(review): the file handle opened here is never closed in the visible code.
    rdr = csv.reader(open(filepath), delimiter=',')
    datacols = defaultdict(list)

    # each CSV row is unpacked into exactly four fields
    for ag, ev, ob, tr in rdr:

    df = DataFrame(datacols)
    MV_result = []
    # one iteration per distinct value in the 'events' column
    for i in range(0, len(np.unique(df['events']))):
        mcount = df[df['events'] == i][['agents', 'observations'
    df2 = df.pivot_table(index='agents',
    #df3 = df2.replace(0,n_states).fillna(0) #(0,6)
    #MV_result=[x if x!=0 else n_states for x in MV_result] # x if x!=0 else 6

    # one (events, truths) row per distinct pair, ordered by event
    GroundTruth = df[['events', 'truths']].drop_duplicates().sort_values(
        ['events'])  #.replace(0,n_states)
    return df2, MV_result, GroundTruth
Example #19
Example #20
def prepareBreakagebreakageSummary(breakageData, stlSalesSamePeriod, kcSalesSamePeriod, reportYear, lastYear):
    """
    Take clean breakage data and shape it into a year-over-year summary.

    :param breakageData: frame with 'Warehouse', 'ReasonCode', 'Year',
        'Breakage|Dollars' and 'Breakage|Cases' columns
    :param stlSalesSamePeriod: sales figure used for 'Saint Louis' rows
    :param kcSalesSamePeriod: sales figure used for 'Kansas City' rows
    :param reportYear: year being reported on
    :param lastYear: year compared against
    :return: frame indexed by (Warehouse, ReasonCode) with, per measure, the
        ``lastYear`` and ``reportYear`` totals and their relative change,
        plus 'Breakage|% Sales' (report-year dollars divided by sales)
    """
    reportYear, lastYear = str(reportYear), str(lastYear)
    dollars, cases = 'Breakage|Dollars', 'Breakage|Cases'

    # Total both measures per warehouse / reason / year ...
    groupCols = ['Warehouse', 'ReasonCode', 'Year']
    totals = breakageData.groupby(groupCols).agg({dollars: 'sum', cases: 'sum'})
    breakageSummary = DataFrame(totals.reset_index(drop=False))

    # ... then spread the years across the columns.
    breakageSummary = pd.DataFrame(breakageSummary.pivot_table(
        values=[cases, dollars], index=['Warehouse', 'ReasonCode'], columns=['Year']))
    # Flatten the (measure, year) column pairs into 'measure|year' labels.
    breakageSummary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in breakageSummary.columns]
    breakageSummary.sort_index(inplace=True, ascending=False)

    # Look up each row's sales by warehouse name (index level 0).
    pctSales = 'Breakage|% Sales'
    breakageSummary[pctSales] = breakageSummary.index.get_level_values(0)
    breakageSummary[pctSales] = breakageSummary[pctSales].map(
        {'Kansas City': kcSalesSamePeriod, 'Saint Louis': stlSalesSamePeriod})
    # Use the report year's dollars rather than a hard-coded 2016 column.
    breakageSummary[pctSales] = np.divide(
        breakageSummary[dollars + '|' + reportYear], breakageSummary[pctSales])

    def yoy_delta(now, then):
        """Relative change from ``then`` to ``now``."""
        return np.divide(np.subtract(now, then), then)

    for measure in (dollars, cases):
        change = yoy_delta(breakageSummary[measure + '|' + reportYear],
                           breakageSummary[measure + '|' + lastYear])
        breakageSummary[measure + '|% Change'] = change.round(4)

    breakageSummary = breakageSummary.reindex(columns=[
        dollars + '|' + lastYear, dollars + '|' + reportYear, dollars + '|% Change', pctSales,
        cases + '|' + lastYear, cases + '|' + reportYear, cases + '|% Change'])
    # Keep only the known reason codes, matched on the 'ReasonCode' level.
    breakageSummary = breakageSummary.reindex(
        index=['Warehouse Breakage', 'Cross-Dock Breakage', 'Driver Breakage',
               'Supplier Breakage', 'Sales Breakage & Unsaleables'],
        level='ReasonCode')

    return breakageSummary
    def test_drop_multiindex_not_lexsorted(self):
        """Dropping from non-lexsorted MultiIndex columns warns but matches the lexsorted result."""
        # GH 11640

        # lexsorted reference frame, built directly from the column tuples
        column_tuples = [('a', ''), ('b1', 'c1'), ('b2', 'c2')]
        lexsorted_df = DataFrame(
            [[1, 3, 4]],
            columns=MultiIndex.from_tuples(column_tuples, names=['b', 'c']))

        # the same data arrived at through pivot_table + reset_index
        long_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                            data=[[1, 'b1', 'c1', 3],
                                  [1, 'b2', 'c2', 4]])
        pivoted = long_df.pivot_table(index='a', columns=['b', 'c'],
                                      values='d')
        not_lexsorted_df = pivoted.reset_index()

        # both constructions must describe the same frame
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)
        tm.assert_frame_equal(result, expected)
Example #22
# Return the rows of a 'SITE' pivot of `df` whose 'SITE' value is at most 10900.
# NOTE(review): the pivot_table call is cut off after values='SITE'; the grouping keys and
# aggregation are not visible in this snippet.
def check_count(df: pd.DataFrame):
    df_group_by = df.pivot_table(values='SITE',
    df_count_check = df_group_by[df_group_by['SITE'] <= 10900]

    return df_count_check
Example #23
def plot_hyper_comparison(all_dfs: pd.DataFrame, hyper_param: str, ys: List[str], x: str='iteration', *args, **kwargs):
    """Plot every field in ``ys`` once per distinct value of ``hyper_param``.

    ``all_dfs`` is pivoted so that ``x`` indexes the rows and each
    (field, hyper value) pair becomes a column; the first entry found for a
    cell is kept. Extra positional and keyword arguments are forwarded to
    ``plot_fields``, whose return value is returned.
    """
    wide = all_dfs.pivot_table(index=x, columns=[hyper_param], values=ys, aggfunc='first')
    wide = wide.reset_index()
    # Flatten the MultiIndex columns into plain tuples such as (field, value) or (x, '').
    wide.columns = wide.columns.to_flat_index()

    series_keys = []
    for hyper_value in all_dfs[hyper_param].unique():
        for field in ys:
            series_keys.append((field, hyper_value))

    return plot_fields(wide, *args, x=(x, ''), ys=series_keys, **kwargs)
Example #25
Example #26
class PivotTable(object):
    """Holds a 100000-row frame and a method that runs pivot_table over it."""

    def setup(self):
        """Build ``self.df`` with three key columns and three random value columns."""
        n_rows = 100000
        letters = np.array(['A', 'B', 'C'], dtype='O')
        words = np.array(['one', 'two'], dtype='O')
        # The random draws happen in this order: key codes first, then the values.
        letter_codes = np.random.randint(0, 3, size=n_rows)
        word_codes = np.random.randint(0, 2, size=n_rows)

        columns = {}
        columns['key1'] = letters.take(letter_codes)
        # key2 and key3 carry the same labels but are separate arrays
        columns['key2'] = words.take(word_codes)
        columns['key3'] = words.take(word_codes)
        for label in ('value1', 'value2', 'value3'):
            columns[label] = np.random.randn(n_rows)
        self.df = DataFrame(columns)

    def time_pivot_table(self):
        """Pivot ``self.df`` on key1 against (key2, key3); the result is discarded."""
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
Example #27
Example #28
Example #29
Example #30
Example #31
 # Pivot `df` and return the result with the columns axis name cleared.
 # NOTE(review): the pivot_table call is incomplete in this snippet — only the aggfunc
 # argument is visible; domain_retriever and execute_pipeline are unused in the visible code.
 def execute(self, df: DataFrame, domain_retriever=None, execute_pipeline=None) -> DataFrame:
     pivoted_df = df.pivot_table(
         # 'avg' is translated to 'mean'; any other agg_function value is passed through as is
         aggfunc='mean' if self.agg_function == 'avg' else self.agg_function,
     pivoted_df.columns.name = None
     return pivoted_df
Example #32
Example #33
# Holds a 100000-row frame (built in setup) and a method that runs pivot_table over it.
# NOTE(review): the closing "})" of the DataFrame literal in setup is missing from this snippet.
class PivotTable(object):
    def setup(self):
        N = 100000
        fac1 = np.array(['A', 'B', 'C'], dtype='O')
        fac2 = np.array(['one', 'two'], dtype='O')
        # random positions into fac1 / fac2, one per row
        ind1 = np.random.randint(0, 3, size=N)
        ind2 = np.random.randint(0, 2, size=N)
        self.df = DataFrame({
            'key1': fac1.take(ind1),
            # key2 and key3 are built from the same positions, so they hold equal labels
            'key2': fac2.take(ind2),
            'key3': fac2.take(ind2),
            'value1': np.random.randn(N),
            'value2': np.random.randn(N),
            'value3': np.random.randn(N)

    def time_pivot_table(self):
        # the pivot result is discarded
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
Example #34
Example #35
# Pivot `data` with get_names(PARAMETERS) as the row index and return the result with its
# columns reindexed to get_names(STATISTICS).
# NOTE(review): the pivot_table call is cut off after the index argument in this snippet.
def make_pivot_table(data: pd.DataFrame) -> pd.DataFrame:
    values = get_names(STATISTICS)

    report = data.pivot_table(index=get_names(PARAMETERS),

    # axis=1 reindexes the columns; names absent from the pivot become all-NaN columns
    return report.reindex(values, axis=1)
Example #37
 # Build a frame with one 'LOF' value per row of the pivoted ratings matrix, plus the
 # user id taken from that matrix's index.
 # NOTE(review): the pivot_table call is cut off, and no clf.fit(...) call is visible
 # before negative_outlier_factor_ is read — presumably it is among the missing lines; confirm.
 def _get_lof_features(self, ratings: pd.DataFrame):
     rating_mx = ratings.pivot_table(index=self.USER_ID_COL,
     clf = LocalOutlierFactor(n_neighbors=30, metric='euclidean')
     user_lof = pd.DataFrame(clf.negative_outlier_factor_)
     user_lof.columns = ['LOF']
     # rows are matched to users by position
     user_lof[self.USER_ID_COL] = rating_mx.index
     return user_lof
Example #40
Example #41
    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)
Example #42
    def test_pivot_no_level_overlap(self):
        # GH #1181

        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
                          'value': np.random.randn(16)})

        table = data.pivot_table('value', index='a', columns=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)
Example #43
    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack
        # Pivoting on integer column labels must give the same table as pivoting on the
        # same labels converted to strings.
        # NOTE(review): the closing parenthesis of the list(...) call is missing from this snippet.

        d = date.min
        data = list(
            product(["foo", "bar"], ["A", "B", "C"], ["x1", "x2"], [d + timedelta(i) for i in range(20)], [1.0])
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        # same frame with every column label passed through str()
        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"])

        tm.assert_frame_equal(table, table2, check_names=False)
Example #44
Example #45
# Holds two frames built in setup() and several methods that each run one pivot_table
# variant over them; every pivot result is discarded.
# NOTE(review): the pivot_table call in time_pivot_table_margins is cut off in this snippet.
class PivotTable:

    def setup(self):
        N = 100000
        fac1 = np.array(['A', 'B', 'C'], dtype='O')
        fac2 = np.array(['one', 'two'], dtype='O')
        # random positions into fac1 / fac2, one per row
        ind1 = np.random.randint(0, 3, size=N)
        ind2 = np.random.randint(0, 2, size=N)
        self.df = DataFrame({'key1': fac1.take(ind1),
                             'key2': fac2.take(ind2),
                             'key3': fac2.take(ind2),
                             'value1': np.random.randn(N),
                             'value2': np.random.randn(N),
                             'value3': np.random.randn(N)})
        # small five-row frame whose two key columns are converted to categorical dtype
        self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'),
                              'col3': [1, 2, 3, 4, 5]})
        self.df2.col1 = self.df2.col1.astype('category')
        self.df2.col2 = self.df2.col2.astype('category')

    def time_pivot_table(self):
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])

    def time_pivot_table_agg(self):
        # two aggregations at once
        self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                            aggfunc=['sum', 'mean'])

    def time_pivot_table_margins(self):
        self.df.pivot_table(index='key1', columns=['key2', 'key3'],

    def time_pivot_table_categorical(self):
        self.df2.pivot_table(index='col1', values='col3', columns='col2',
                             aggfunc=np.sum, fill_value=0)

    def time_pivot_table_categorical_observed(self):
        # same pivot as above, with observed=True
        self.df2.pivot_table(index='col1', values='col3', columns='col2',
                             aggfunc=np.sum, fill_value=0, observed=True)
Example #46
    # Builds a 10000-row structured array of random (Index, Symbol, date, Price) records
    # and pivots Price by (Month, Day) against (Index, Symbol, Year).
    # NOTE(review): the opening/closing brackets of the np.dtype(...) and np.array(...)
    # arguments are missing from this snippet, and no assertion on `pivoted` is visible.
    def test_pivot_columns_lexsorted(self):
        import datetime
        import numpy as np
        import pandas

        n = 10000

        dtype = np.dtype(
                ("Index", object),
                ("Symbol", object),
                ("Year", int),
                ("Month", int),
                ("Day", int),
                ("Quantity", int),
                ("Price", float),

        products = np.array(
                ("SP500", "ADBE"),
                ("SP500", "NVDA"),
                ("SP500", "ORCL"),
                ("NDQ100", "AAPL"),
                ("NDQ100", "MSFT"),
                ("NDQ100", "GOOG"),
                ("FTSE", "DGE.L"),
                ("FTSE", "TSCO.L"),
                ("FTSE", "GSK.L"),
            dtype=[("Index", object), ("Symbol", object)],
        items = np.empty(n, dtype=dtype)
        # pick a random product for every row
        iproduct = np.random.randint(0, len(products), n)
        items["Index"] = products["Index"][iproduct]
        items["Symbol"] = products["Symbol"][iproduct]
        # pick a random date between 2000-01-01 and 2010-12-31 for every row
        dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items["Year"] = dates.year
        items["Month"] = dates.month
        items["Day"] = dates.day
        # NOTE(review): "Quantity" is never filled in the visible code (np.empty leaves it
        # uninitialised); it is not used by the pivot below.
        items["Price"] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table("Price", rows=["Month", "Day"], cols=["Index", "Symbol", "Year"], aggfunc="mean")

Example #47
    # Builds a 10000-row structured array of random (Index, Symbol, date, Price) records
    # and pivots Price by (Month, Day) against (Index, Symbol, Year).
    # NOTE(review): the closing "])" of the np.dtype([...]) call and the end of the
    # pivot_table call are missing from this snippet; no assertion on `pivoted` is visible.
    def test_pivot_columns_lexsorted(self):
        import datetime
        import numpy as np
        import pandas

        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ], dtype=[('Index', object), ('Symbol', object)])
        items = np.empty(n, dtype=dtype)
        # pick a random product for every row
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        # pick a random date between 2000-01-01 and 2010-12-31 for every row
        dr = pandas.date_range(datetime.date(2000, 1, 1),
                               datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price', rows=['Month', 'Day'],
                                 cols=['Index', 'Symbol', 'Year'],

Example #48
    def test_pivot_no_level_overlap(self):
        # GH #1181
        # pivot_table over columns (b, c) must equal the groupby mean unstacked on b and c
        # with the all-NaN columns removed.
        # NOTE(review): the braces/parenthesis around the DataFrame dict are missing from
        # this snippet.

        data = DataFrame(
                "a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2,
                "b": [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                "c": (["foo"] * 4 + ["bar"] * 4) * 2,
                "value": np.random.randn(16),

        table = data.pivot_table("value", index="a", columns=["b", "c"])

        grouped = data.groupby(["a", "b", "c"])["value"].mean()
        expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all")
        tm.assert_frame_equal(table, expected)
Example #51
Example #52
    # Checks the 'All' margins produced by pivot_table(margins=True) against means computed
    # directly from self.data.
    # NOTE(review): several pivot_table calls below are cut off, and `ex` used in the last
    # two assertions is not defined in the visible code.
    def test_margins(self):
        def _check_output(res, col, index=['A', 'B'], columns=['C']):
            # column margin: the 'All' column without its last row vs. the per-index mean
            cmarg = res['All'][:-1]
            exp = self.data.groupby(index)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            # row margin: the ('All', '') row without its last entry vs. the per-column mean
            res = res.sortlevel()
            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(columns)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            # grand margin vs. the overall mean of the column
            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        # every column name is doubled ('A' -> 'AA', ...); this modifies self.data in place
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,

        # no rows
        rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
        tm.assert_isinstance(rtable, Series)
        # NOTE(review): this loop reads `table`, not the `rtable` built just above — confirm
        # that is intended.
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

        # issue number #8349: pivot_table with margins and dictionary aggfunc

        df=DataFrame([  {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17}, 
                        {'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23}, 
                        {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100}, 
                        {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110}, 
                        {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200}, 
                        {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80}, 
                        {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 2,'DAYS': 5,'SALARY':190} ])


        rs=df.pivot_table(  index=['JOB','NAME'],


        tm.assert_frame_equal(rs['DAYS'], ex['DAYS'])


        tm.assert_frame_equal(rs['SALARY'], ex['SALARY'])
Example #53
Example #55
Example #56
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, margins_name='All', dropna=True,
             normalize=False):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    aggfunc : function, optional
        If specified, requires `values` be specified as well
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    margins_name : string, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.

        .. versionadded:: 0.21.0

    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    ValueError
        If `aggfunc` is passed without `values`, or `values` is passed
        without `aggfunc`.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
    ...               "one", "two", "two", "two", "one"], dtype=object)
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
    ...               dtype=object)

    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    ... # doctest: +NORMALIZE_WHITESPACE
    b   one        two
    c   dull shiny dull shiny
    bar    1     2    1     0
    foo    2     2    1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
                            # and will not be shown in the output because
                            # dropna is True by default. Set 'dropna=False'
                            # to preserve categories with no data
    ... # doctest: +SKIP
    col_0  d  e
    a      1  0
    b      0  1

    >>> crosstab(foo, bar, dropna=False)  # 'c' and 'f' are not represented
                            # in the data, but they still will be counted
                            # and shown in the output
    ... # doctest: +SKIP
    col_0  d  e  f
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """

    index = com.maybe_make_list(index)
    columns = com.maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Combined (intersected) axis of every input; it becomes the index of
    # the intermediate frame built below.
    # NOTE(review): the trailing argument of this call was lost in
    # extraction; `sort=False` matches upstream pandas -- confirm.
    common_idx = _get_objs_combined_axis(index + columns, intersect=True,
                                         sort=False)

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    # `values` and `aggfunc` only make sense together.
    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")

    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    from pandas import DataFrame
    df = DataFrame(data, index=common_idx)
    if values is None:
        # Frequency table: aggregate a placeholder column with `len` and
        # fill cells that have no observations with 0.
        df['__dummy__'] = 0
        kwargs = {'aggfunc': len, 'fill_value': 0}
    else:
        # Aggregate the caller's values with the caller's function.
        df['__dummy__'] = values
        kwargs = {'aggfunc': aggfunc}

    table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                           margins=margins, margins_name=margins_name,
                           dropna=dropna, **kwargs)

    # Post-process
    if normalize is not False:
        # NOTE(review): the trailing argument of this call was lost in
        # extraction; `margins_name=margins_name` matches upstream pandas
        # -- confirm.
        table = _normalize(table, normalize=normalize, margins=margins,
                           margins_name=margins_name)

    return table
Example #57
Example #58
Example #59
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    aggfunc : function, optional
        If no values array is passed, computes a frequency table
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN

    Returns
    -------
    crosstab : DataFrame

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    bar  1     2      1     0
    foo  2     2      1     2
    """

    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None:
        # Frequency table: count rows per (row, column) combination by
        # aggregating a placeholder column with `len`.
        df = DataFrame(data)
        df['__dummy__'] = 0
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=len, margins=margins, dropna=dropna)
        # Combinations with no observations become 0, and the counts are
        # cast to int64.
        return table.fillna(0).astype(np.int64)
    else:
        # Aggregate the caller's values with the caller's function.
        data['__dummy__'] = values
        df = DataFrame(data)
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=aggfunc, margins=margins, dropna=dropna)
        return table