def test_pivot_table_dropna(self):
    """With ``dropna=False`` every customer/product combination is kept."""
    frame = DataFrame({
        'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
        'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
    })
    keys = ['customer', 'product']
    by_columns = frame.pivot_table('quantity', 'month', keys, dropna=False)
    by_index = frame.pivot_table('quantity', keys, 'month', dropna=False)

    # Full cartesian product of the customers and products in the frame.
    full_grid = MultiIndex.from_tuples(
        [(u(customer), u(product))
         for customer in 'ABC' for product in 'abcd'])
    assert_equal(by_columns.columns.values, full_grid.values)
    assert_equal(by_index.index.values, full_grid.values)
def test_to_html_regression_GH6098(self):
    """Smoke test: HTML repr of a pivot whose labels are non-ASCII."""
    key1 = u('clé1')
    key2 = u('clé2')
    frame = DataFrame({
        key1: [u('a'), u('a'), u('b'), u('b'), u('a')],
        key2: [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')],
        'données1': np.random.randn(5),
        'données2': np.random.randn(5),
    })
    pivoted = frame.pivot_table(index=[key1], columns=[key2])
    # it works
    pivoted._repr_html_()
def test_pivot_table_nocols(self):
    """Pivoting on ``columns`` only equals the transposed ``index``-only pivot."""
    frame = DataFrame({
        "rows": ["a", "b", "c"],
        "cols": ["x", "y", "z"],
        "values": [1, 2, 3],
    })
    # Same comparison for a callable aggfunc and a per-column mapping.
    for agg in (np.sum, {"values": "mean"}):
        result = frame.pivot_table(columns="cols", aggfunc=agg)
        expected = frame.pivot_table(index="cols", aggfunc=agg).T
        tm.assert_frame_equal(result, expected)
def test_to_html_regression_GH6098():
    """Smoke test: HTML repr of a pivot whose labels are non-ASCII."""
    index_key = 'clé1'
    column_key = 'clé2'
    frame = DataFrame({
        index_key: ['a', 'a', 'b', 'b', 'a'],
        column_key: ['1er', '2ème', '1er', '2ème', '1er'],
        'données1': np.random.randn(5),
        'données2': np.random.randn(5),
    })
    pivoted = frame.pivot_table(index=[index_key], columns=[column_key])
    # it works
    pivoted._repr_html_()
def test_pivot_table_nocols(self):
    """A ``columns``-only pivot must match the transposed ``index``-only one."""
    source = DataFrame({'rows': ['a', 'b', 'c'],
                        'cols': ['x', 'y', 'z'],
                        'values': [1, 2, 3]})

    def check(aggfunc):
        # Compare both spellings of the same reshape for one aggfunc.
        rs = source.pivot_table(columns='cols', aggfunc=aggfunc)
        xp = source.pivot_table(index='cols', aggfunc=aggfunc).T
        tm.assert_frame_equal(rs, xp)

    check(np.sum)
    check({'values': 'mean'})
def get_heatmap_data(ans: pd.DataFrame):
    """Build the sorted heatmap matrix plus a per-sample contamination flag.

    Expects a dataframe `ans` containing processed (clean) barcode read
    counts with the columns ``sample``, ``paired_read``,
    ``paired_read_count``, ``forward_barcode`` and ``reverse_barcode``.

    Returns a tuple ``(hmap, data, x, y)``:

    * ``hmap`` - samples x paired-reads frame, sorted for readability
    * ``data`` - ``hmap`` values with one extra trailing column holding the
      contamination flag (1.0 / 0.0)
    * ``x`` - sample IDs (``hmap`` index values)
    * ``y`` - paired-read labels (``hmap`` columns; the flag column of
      ``data`` is NOT included here)
    """
    # heatmap matrix of read counts per sample per paired read
    hmap = (
        ans.pivot_table(index=["sample"],
                        columns=["paired_read"],
                        values="paired_read_count")
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    # drop unknown-unknown reads only
    hmap = hmap.drop(columns='unknown-unknown')

    # Sort samples by the column position of their largest count so the
    # plot reads along the diagonal. argmax picks the first maximum, the
    # same column idxmax would name.
    hmap['max_idx'] = np.argmax(hmap.to_numpy(), axis=1)
    hmap = hmap.sort_values('max_idx').drop(columns=['max_idx'])

    # grab read counts
    counts = hmap.values

    # prepare data for identifying potential contaminants
    flag = ans.groupby('sample').agg(
        uniq_forward_bcodes=('forward_barcode', get_unique_barcodes),
        uniq_reverse_bcodes=('reverse_barcode', get_unique_barcodes))

    # create boolean column that identifies potential contamination
    flag['contamination'] = flag.apply(is_contaminant, axis=1)

    # Align the flags with the (sorted) heatmap rows and turn them into a
    # plain float vector. Indexing a Series with ``[:, np.newaxis]`` is no
    # longer supported by pandas, so convert to NumPy before adding the axis.
    contamination = hmap.join(flag, how='inner')['contamination']
    contaminants_flag = contamination.eq(True).astype(float).to_numpy()

    # add contaminant flag column to the read counts
    data = np.hstack((counts, contaminants_flag[:, np.newaxis]))

    # list of all sample IDs
    x = hmap.index.values
    # list of all paired reads
    y = hmap.columns.tolist()

    return hmap, data, x, y
def pivot_party_votes_df(df: pd.DataFrame):
    """Spread ``vote`` values into columns holding each row's ``fraction``.

    The result is indexed by ``date`` and ``title``; ``Fraktion/Gruppe`` is
    moved out of the index into a regular column. Missing cells become 0.
    """
    group_keys = ["Fraktion/Gruppe", "date", "title"]
    wide = df.pivot_table(
        values="fraction",
        index=group_keys,
        columns="vote",
        fill_value=0,
    )
    return wide.reset_index(level="Fraktion/Gruppe")
def barplot(df: pd.DataFrame, column: str, show: bool, save_location: str):
    """Plot how often each value of ``column`` occurs and save the figure.

    Bars are ordered by ascending count. The figure is written to
    ``save_location`` and additionally displayed when ``show`` is true.
    """
    plt.figure(figsize=(20, 20))

    # Row count per distinct value, smallest first.
    counts = df.pivot_table(index=[column], aggfunc="size")
    counts = counts.sort_values()

    sns.barplot(x=counts.index, y=counts.values)
    plt.savefig(save_location)
    if not show:
        return
    plt.show()
def upload_data_long_format_as_single_data_set(
    self,
    data: pd.DataFrame,
    name: str,
    cross_section_column_name: str,
    date_column_name: str,
    replace_missing_values: bool = True,
    forward_fill_missing_values: bool = False,
) -> IndividualDataset:
    """
    Pivot long format data to wide format and upload it as one data set.

    The frame is pivoted with the date column as index and one column per
    (value column, cross-section) pair; the two-level column labels are then
    flattened by joining their parts with "/".

    :param data: The dataset in a pandas data frame. Must have a valid date column.
    :param name: Name of the data set to be uploaded
    :param cross_section_column_name: The identifier column that groups the records
    :param date_column_name: The column name of the date index.
    :param replace_missing_values: Replace missing values
    :param forward_fill_missing_values: Forward-fill missing values
    :return: Whatever ``self.upload_data`` returns for the pivoted frame.
    """
    wide = data.pivot_table(
        index=date_column_name,
        columns=cross_section_column_name,
    ).reset_index()

    # NOTE(review): after reset_index the date column's label is the pair
    # (date_column_name, ""), so it is flattened to "<date>/" with a
    # trailing slash - confirm that upload_data expects this.
    wide.columns = ["/".join(parts) for parts in wide.columns]

    return self.upload_data(
        data=wide,
        name=name,
        forward_fill_missing_values=forward_fill_missing_values,
        replace_missing_values=replace_missing_values,
    )
def __pivot_table(self, data: pd.DataFrame) -> pd.DataFrame:
    """
    Pivot the thc result by session/topic in rows and team in columns.

    Returns an empty DataFrame (after logging an error) when ``data`` has no
    ``pivot_table`` attribute or when pivoting raises ``ValueError``.
    Otherwise missing cells are filled with ``THCResult.NoResult``.
    """
    try:
        pivot_table = data.pivot_table(
            index=["session", "topic"],
            columns=["team_name"],
            values="result",
            aggfunc=lambda x: x,
        )
    except AttributeError as e:
        logging.error(
            f"Error creating pivot table for THC result. The error was {e}."
        )
        return pd.DataFrame()
    except ValueError as e:
        # TODO: This raises a ValueError: Function does not reduce in case
        # the database has non-unique entries. Can we recover from this?
        logging.error(
            "Error creating pivot table for THC result. "
            f"Possibly the database is inconsistent. Error was {e}."
        )
        return pd.DataFrame()

    pivot_table.fillna(THCResult.NoResult, inplace=True)
    return pivot_table
def getPivotMedian(dataFrame: pandas.DataFrame, valueColumns, indexColumns):
    """
    Return the median pivot of the source dataframe.

    ``valueColumns`` are aggregated with the median for every distinct
    combination of ``indexColumns``. ``None`` is passed through unchanged.
    """
    if dataFrame is not None:
        return dataFrame.pivot_table(
            index=indexColumns,
            values=valueColumns,
            aggfunc=numpy.median,
        )
    return None
def test_type_error_multiindex(self):
    """Tuple-style ``[:, 0]`` raises; ``.loc`` and ``["x", 0]`` select."""
    # See gh-12218
    raw = DataFrame(
        [[0, 0, 1, 2], [1, 0, 3, 4], [0, 1, 1, 2], [1, 1, 3, 4]],
        columns=["i", "c", "x", "y"],
    )
    pivoted = raw.pivot_table(index="i", columns="c", values=["x", "y"])
    row_index = Index(range(2), name="i")

    # TODO: Is this test for pivot_table?
    with pytest.raises(TypeError, match="unhashable type"):
        pivoted[:, 0]

    expected_columns = MultiIndex(
        levels=[["x", "y"], [0, 1]],
        codes=[[0, 1], [0, 0]],
        names=[None, "c"],
    )
    expected_frame = DataFrame(
        [[1, 2], [3, 4]], columns=expected_columns, index=row_index
    )
    tm.assert_frame_equal(pivoted.loc[:, (slice(None), 0)], expected_frame)

    key = ("x", 0)
    expected_series = Series([1, 3], index=row_index, name=key)
    tm.assert_series_equal(pivoted[key], expected_series)
def _reshape_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape a DataFrame from long to wide form, adhering to the following rules:

    - If the `meta_date` column exists, replace `variable` column with
      {variable}_{meta_date} and then drop `meta_date`
    - Construct a pivot_table where the columns come from the `variable`
      column, values come from the `value` column, and all other columns
      are used as an index

    The input frame is not modified. An empty frame is returned as is.

    Raises ValueError when the `variable` or `value` column is missing.
    """
    if df.shape[0] == 0:
        # empty dataframe
        return df

    cols = list(df)
    for c in ["variable", "value"]:
        if c not in cols:
            gh_issues = "https://github.com/valorumdata/cmdc.py/issues/new"
            msg = (f"Column {c} not found in DataFrame. "
                   f"Please report a bug at {gh_issues}")
            raise ValueError(msg)

    if "meta_date" in cols:
        # Build a new frame: ``assign`` and ``drop`` both return copies, so
        # the caller's data is untouched and ``meta_date`` is really removed
        # before the pivot index is derived.
        combined = (df["variable"].astype(str) + "_" +
                    df["meta_date"].astype(str))
        df = df.assign(variable=combined).drop(columns="meta_date")

    # Keep the original column order so the output layout is deterministic.
    idx = [c for c in df.columns if c not in ("variable", "value")]
    return df.pivot_table(index=idx, columns="variable",
                          values="value").reset_index()
def test_drop_multiindex_not_lexsorted(self):
    """Dropping from non-lexsorted columns warns but gives the same frame."""
    # GH#11640

    # lexsorted reference frame
    sorted_columns = MultiIndex.from_tuples(
        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
    )
    lexsorted_df = DataFrame([[1, 3, 4]], columns=sorted_columns)
    assert lexsorted_df.columns.is_lexsorted()

    # equivalent frame whose columns come out of a pivot non-lexsorted
    raw = DataFrame(
        columns=["a", "b", "c", "d"],
        data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]],
    )
    not_lexsorted_df = raw.pivot_table(
        index="a", columns=["b", "c"], values="d"
    ).reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # both frames hold the same data
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.drop("a", axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.drop("a", axis=1)

    tm.assert_frame_equal(result, expected)
def _aggregate_proj_points_data(self, df_score: pd.DataFrame) -> pd.DataFrame:
    """
    Sum projected and actual points per Season/League/Week/Team.

    Starter and bench totals end up in separate columns named
    ``<stat>_Starter`` and ``<stat>_Bench``. The input frame is not modified.
    """
    id_vars = ['season_id', 'league_id', 'Week', 'Team']
    sum_vars = ['Proj', 'Actual']

    scores = df_score.copy()
    # 0 for bench slots, 1 for every other position, so both groups can be
    # totalled separately.
    scores['starter_ind'] = np.where(scores['Pos'] == 'Bench', 0, 1)

    totals = scores.groupby(id_vars + ['starter_ind'],
                            as_index=False)[sum_vars].sum()
    wide = totals.pivot_table(index=id_vars,
                              columns='starter_ind',
                              aggfunc=sum,
                              fill_value=0)

    # The pivot yields (stat, starter_ind) column pairs and moves id_vars
    # into the index; flatten the pairs into readable names.
    wide.columns = [
        '{}_{}'.format(stat, 'Starter' if indicator == 1 else 'Bench')
        for stat, indicator in wide.columns
    ]
    return wide.reset_index().rename_axis(None, axis=1)
def test_drop_multiindex_not_lexsorted(self):
    """Dropping from non-lexsorted columns warns but gives the same frame."""
    # GH 11640

    # lexsorted reference frame
    sorted_columns = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=sorted_columns)
    self.assertTrue(lexsorted_df.columns.is_lexsorted())

    # equivalent frame whose columns come out of a pivot non-lexsorted
    raw = DataFrame(columns=['a', 'b', 'c', 'd'],
                    data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]])
    pivoted = raw.pivot_table(index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = pivoted.reset_index()
    self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

    # both frames hold the same data
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.drop('a', axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.drop('a', axis=1)

    tm.assert_frame_equal(result, expected)
def _plot_and_save(
    df: pd.DataFrame,
    index: str,
    column: str,
    index_label: str,
    column_label: str,
    values: str = 'ret',
    add_sep_colorbar: bool = True,
    norm: colors.Normalize = None,
    save_figure: bool = False,
    save_dir: str = None
):
    """
    Render a heat map of ``values`` over ``index`` x ``column`` and optionally
    save it.

    Nothing happens when ``index`` or ``column`` is not a column of ``df``.
    With ``save_figure`` the heat map is written to ``hm-<index>-<column>.pdf``
    in ``save_dir`` and, if ``render_heatmap`` returned one, the color bar to
    ``cb-<index>-<column>.pdf``.
    """
    if index not in df.columns or column not in df.columns:
        return

    # Pivot table (by default averages over identical index / columns cells)
    grid = df.pivot_table(index=index, columns=column, values=values)

    # Generate the plot
    fig_hm, fig_cb = render_heatmap(
        grid,
        add_sep_colorbar=add_sep_colorbar,
        norm=norm,
        y_label=index_label,
        x_label=column_label,
    )

    # Save heat map and color bar if desired
    if not save_figure:
        return
    name = '-'.join([index, column])
    fig_hm.savefig(osp.join(save_dir, f'hm-{name}.pdf'))
    if fig_cb is not None:
        fig_cb.savefig(osp.join(save_dir, f'cb-{name}.pdf'))
def Process(n_states, filepath):
    """Load an (agent, event, observation, truth) CSV and summarise it.

    Each non-empty row of the file must hold four integers: agent id, event
    id, observation and ground truth.

    Returns a tuple ``(df2, MV_result, GroundTruth)``:

    * ``df2`` - agents x events table of observations
    * ``MV_result`` - for event ids ``0 .. k-1`` (``k`` = number of distinct
      events) the most frequent observation; ties go to the smallest one
    * ``GroundTruth`` - distinct (event, truth) pairs sorted by event

    ``n_states`` is currently unused.
    """
    datacols = defaultdict(list)
    # Close the file deterministically instead of leaking the handle.
    with open(filepath, newline='') as handle:
        for row in csv.reader(handle, delimiter=','):
            if not row:
                # tolerate blank lines, e.g. a trailing newline
                continue
            ag, ev, ob, tr = row
            datacols['agents'].append(int(ag))
            datacols['events'].append(int(ev))
            datacols['observations'].append(int(ob))
            datacols['truths'].append(int(tr))
    df = DataFrame(datacols)

    # Majority vote per event. Event ids are assumed to be 0 .. k-1, as the
    # loop addresses them by position.
    MV_result = []
    n_events = len(np.unique(df['events']))
    for event in range(n_events):
        votes = df.loc[df['events'] == event, ['agents', 'observations']]
        mcount = votes.groupby('observations').count()
        MV_result.append(mcount['agents'].idxmax())

    df2 = df.pivot_table(index='agents', columns='events',
                         values='observations')
    GroundTruth = df[['events', 'truths']].drop_duplicates().sort_values(
        ['events'])
    return df2, MV_result, GroundTruth
def test_pivot_table_dropna(self):
    """With ``dropna=False`` every customer/product combination is kept."""
    frame = DataFrame({
        "amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000},
        "customer": {0: "A", 1: "A", 2: "B", 3: "C"},
        "month": {0: 201307, 1: 201309, 2: 201308, 3: 201310},
        "product": {0: "a", 1: "b", 2: "c", 3: "d"},
        "quantity": {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
    })
    keys = ["customer", "product"]
    by_columns = frame.pivot_table("quantity", "month", keys, dropna=False)
    by_index = frame.pivot_table("quantity", keys, "month", dropna=False)

    # Full cartesian product of the customers and products in the frame.
    expected = MultiIndex.from_tuples(
        [(customer, product) for customer in "ABC" for product in "abcd"],
        names=keys,
    )
    tm.assert_index_equal(by_columns.columns, expected)
    tm.assert_index_equal(by_index.index, expected)
def prepareBreakagebreakageSummary(breakageData, stlSalesSamePeriod, kcSalesSamePeriod, reportYear, lastYear):
    '''
    Takes in clean data and gets it ready for consumption.

    ``breakageData`` needs the columns Warehouse, ReasonCode, Year,
    Breakage|Dollars and Breakage|Cases. The result is indexed by
    (Warehouse, ReasonCode) and holds, for dollars and cases, the
    ``lastYear`` and ``reportYear`` totals plus their relative change, and
    ``Breakage|% Sales``: the ``reportYear`` dollars divided by the sales
    figure passed for the row's warehouse.
    '''
    groupCols = ['Warehouse', 'ReasonCode', 'Year']
    valueCols = ['Breakage|Cases', 'Breakage|Dollars']

    totals = breakageData.groupby(groupCols)[valueCols].sum().reset_index(drop=False)
    breakageSummary = totals.pivot_table(values=valueCols,
                                         index=['Warehouse', 'ReasonCode'],
                                         columns=['Year'])
    # Flatten (measure, year) pairs into "measure|year".
    breakageSummary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in breakageSummary.columns]
    breakageSummary.sort_index(inplace=True, ascending=False)

    dollarsNow = 'Breakage|Dollars|' + str(reportYear)
    dollarsThen = 'Breakage|Dollars|' + str(lastYear)
    casesNow = 'Breakage|Cases|' + str(reportYear)
    casesThen = 'Breakage|Cases|' + str(lastYear)

    # Sales of the matching warehouse; the ratio uses the report year's
    # dollars rather than a fixed year.
    salesByWarehouse = {'Kansas City': kcSalesSamePeriod, 'Saint Louis': stlSalesSamePeriod}
    breakageSummary['Breakage|% Sales'] = breakageSummary.index.get_level_values(0)
    breakageSummary['Breakage|% Sales'] = breakageSummary['Breakage|% Sales'].map(salesByWarehouse)
    breakageSummary['Breakage|% Sales'] = np.divide(breakageSummary[dollarsNow], breakageSummary['Breakage|% Sales'])

    def yoy_delta(now, then):
        return np.divide(np.subtract(now, then), then)

    breakageSummary['Breakage|Dollars|% Change'] = yoy_delta(breakageSummary[dollarsNow], breakageSummary[dollarsThen]).round(4)
    breakageSummary['Breakage|Cases|% Change'] = yoy_delta(breakageSummary[casesNow], breakageSummary[casesThen]).round(4)

    breakageSummary = breakageSummary.reindex(columns=[
        dollarsThen, dollarsNow, 'Breakage|Dollars|% Change', 'Breakage|% Sales',
        casesThen, casesNow, 'Breakage|Cases|% Change'])
    breakageSummary = breakageSummary.reindex(
        index=['Warehouse Breakage', 'Cross-Dock Breakage', 'Driver Breakage',
               'Supplier Breakage', 'Sales Breakage & Unsaleables'],
        level='ReasonCode')
    return breakageSummary
def test_drop_multiindex_not_lexsorted(self):
    """Dropping a column gives equal frames whether or not columns are lexsorted."""
    # GH 11640

    # lexsorted version
    reference_columns = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=reference_columns)
    self.assertTrue(lexsorted_df.columns.is_lexsorted())

    # non-lexsorted version, built by pivoting and resetting the index
    long_form = DataFrame(columns=['a', 'b', 'c', 'd'],
                          data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]])
    not_lexsorted_df = long_form.pivot_table(
        index='a', columns=['b', 'c'], values='d').reset_index()
    self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.drop('a', axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.drop('a', axis=1)

    tm.assert_frame_equal(result, expected)
def check_count(df: pd.DataFrame):
    """Return the ``Time`` rows whose number of distinct ``SITE`` values is at most 10900."""
    sites_per_time = df.pivot_table(
        index='Time',
        values='SITE',
        aggfunc=pd.Series.nunique,
    )
    within_limit = sites_per_time['SITE'] <= 10900
    return sites_per_time[within_limit]
def test_pivot_table_dropna(self):
    """With ``dropna=False`` every customer/product combination is kept."""
    frame = DataFrame({
        'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
        'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
    })
    keys = ['customer', 'product']
    by_columns = frame.pivot_table('quantity', 'month', keys, dropna=False)
    by_index = frame.pivot_table('quantity', keys, 'month', dropna=False)

    # Full cartesian product of the customers and products in the frame.
    full_grid = MultiIndex.from_tuples(
        [(customer, product) for customer in 'ABC' for product in 'abcd'])
    assert_equal(by_columns.columns.values, full_grid.values)
    assert_equal(by_index.index.values, full_grid.values)
def plot_hyper_comparison(all_dfs: pd.DataFrame, hyper_param: str, ys: List[str], x: str = 'iteration', *args, **kwargs):
    """Plot each field in ``ys`` against ``x``, one series per ``hyper_param`` value.

    The frame is pivoted so that every (field, hyper-parameter value) pair
    becomes its own column (first value wins for duplicates); the column
    labels are flattened to tuples and handed to ``plot_fields``. Extra
    positional and keyword arguments are forwarded to ``plot_fields``.
    """
    pivoted = all_dfs.pivot_table(index=x,
                                  columns=[hyper_param],
                                  values=ys,
                                  aggfunc='first')
    compare_df = pivoted.reset_index()
    compare_df.columns = compare_df.columns.to_flat_index()

    # One (field, value) key per series, grouped by hyper-parameter value.
    series_keys = []
    for hyper_value in all_dfs[hyper_param].unique():
        for field in ys:
            series_keys.append((field, hyper_value))

    # After reset_index the x column's flat label is the pair (x, '').
    return plot_fields(compare_df, *args, x=(x, ''), ys=series_keys, **kwargs)
def test_to_html_regression_GH6098(self):
    """Smoke test: a pivot with non-ASCII labels renders as HTML."""
    row_key, col_key = u('clé1'), u('clé2')
    data = {
        row_key: [u('a'), u('a'), u('b'), u('b'), u('a')],
        col_key: [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')],
        'données1': np.random.randn(5),
        'données2': np.random.randn(5),
    }
    table = DataFrame(data).pivot_table(index=[row_key], columns=[col_key])
    # it works
    table._repr_html_()
class PivotTable(object):
    """Timing benchmark for ``DataFrame.pivot_table``."""

    def setup(self):
        """Build a 100000-row frame with three key and three value columns."""
        n_rows = 100000
        letters = np.array(['A', 'B', 'C'], dtype='O')
        numbers = np.array(['one', 'two'], dtype='O')
        letter_pos = np.random.randint(0, 3, size=n_rows)
        number_pos = np.random.randint(0, 2, size=n_rows)

        # key2 and key3 are built from the same positions.
        columns = {
            'key1': letters.take(letter_pos),
            'key2': numbers.take(number_pos),
            'key3': numbers.take(number_pos),
        }
        for label in ('value1', 'value2', 'value3'):
            columns[label] = np.random.randn(n_rows)
        self.df = DataFrame(columns)

    def time_pivot_table(self):
        """Pivot with one index key and two column keys."""
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
def get_runs_counts_by_match():
    """Return a match_code x runs table of how often each runs value occurs.

    Reads the module-level ``ipl_df`` frame.
    """
    deliveries = ipl_df[['match_code', 'runs']]
    occurrences = deliveries.groupby(['match_code', 'runs'])['runs'].count()

    new_data = DataFrame(occurrences)
    new_data.columns = ['Total_runs']
    return new_data.pivot_table(values='Total_runs',
                                index='match_code',
                                columns='runs')
def create_data_table(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a data table containing all of the input dataframe's data in a
    format appropriate for sharing with others

    Works on a copy of the input: normalizes the dates, drops some columns
    only used internally, and pivots to a wide format w.r.t. case types.

    :param df: Dataframe containing all locations' data in long format
    :type df: pd.DataFrame
    :return: Dataframe containing all locations' data in wide format, with some
    auxiliary columns dropped
    :rtype: pd.DataFrame
    """
    table = df.copy()

    # Data stamped with a time of day is relabelled with 00:00 of the next
    # day, the end of its 24-hour collection period. Rows already at
    # midnight were adjusted earlier and are left alone (i.e., data
    # generated on Mar 20 is labeled Mar 21).
    timestamps = table[Columns.DATE]
    midnights = timestamps.dt.normalize()
    off_midnight = timestamps != midnights
    table.loc[off_midnight, Columns.DATE] = (
        midnights[off_midnight] + pd.Timedelta(days=1)
    )
    table[Columns.DATE] = table[Columns.DATE].dt.strftime(r"%Y-%m-%d")

    internal_columns = [
        Columns.IS_STATE,
        Columns.LOCATION_NAME,
        Columns.OUTBREAK_START_DATE_COL,
        Columns.DAYS_SINCE_OUTBREAK,
        Columns.POPULATION,
        Columns.STAGE,
        Columns.COUNT_TYPE,
    ]
    table = table.drop(columns=internal_columns)

    # Everything except the case type / count pair identifies a row.
    pivoted_away = [Columns.CASE_TYPE, Columns.CASE_COUNT]
    id_columns = [c for c in table.columns if c not in pivoted_away]
    table = table.pivot_table(
        index=id_columns,
        columns=Columns.CASE_TYPE,
        values=Columns.CASE_COUNT,
        aggfunc="first",
    )
    table = table.reset_index()
    table = table.sort_values([Columns.COUNTRY, Columns.STATE, Columns.DATE])

    total_case_columns = CaseInfo.get_info_items_for(
        InfoField.CASE_TYPE, count=Counting.TOTAL_CASES
    )
    for col in total_case_columns:
        table[col] = pd.to_numeric(table[col], downcast="integer")

    return table
def pivot_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """Pivots data that is long on categories to be wide.

    Each distinct ``parameter`` becomes a column filled from ``value``; the
    demographic/year key columns present in ``data`` stay as regular columns.
    """
    candidate_keys = ('sex', 'age_start', 'age_end', 'year_start', 'year_end')
    present_keys = [key for key in candidate_keys if key in data.columns]

    wide = data.pivot_table(values='value',
                            index=present_keys,
                            columns='parameter')
    wide = wide.reset_index()
    # Remove the leftover "parameter" label from the column axis.
    wide.columns.name = None
    return wide
def arruma_votos():
    """Reshape the per-section party votes into one row per zone/section.

    Reads ``voto_secao_partido.csv``, keeps parties 13, 15, 45 and 55, sums
    ``sum(votos)`` per section (id ``"<num_zona>--<num_secao>"``) and party,
    writes the table to ``voto_secao_partido_trabalhada.csv`` and returns
    it. The output always has the columns PT, PMDB, PSDB and PSD; a party
    with no rows in the input yields an all-NaN column.
    """
    # Party number -> acronym. Renaming by number keeps the labels right
    # even if a party is missing from the data.
    partidos = {13: "PT", 15: "PMDB", 45: "PSDB", 55: "PSD"}

    votos = read_csv("voto_secao_partido.csv")
    # Copy so adding the id column does not write into a filtered view.
    votos = votos[votos.partido.isin(list(partidos))].copy()
    votos["id"] = (votos["num_zona"].astype(str) + "--" +
                   votos["num_secao"].astype(str))

    votos = votos.pivot_table(index="id", columns="partido",
                              values="sum(votos)", aggfunc="sum")
    votos = votos.reindex(columns=list(partidos)).rename(columns=partidos)
    votos.columns.name = None

    votos.to_csv("voto_secao_partido_trabalhada.csv")
    return votos
def execute(self, df: DataFrame, domain_retriever=None, execute_pipeline=None) -> DataFrame:
    """Pivot ``df`` according to this step's configuration.

    Uses ``self.index`` as index, ``self.column_to_pivot`` as columns and
    ``self.value_column`` as values; the aggregation name ``'avg'`` is
    translated to ``'mean'``. ``domain_retriever`` and ``execute_pipeline``
    are not used here.
    """
    if self.agg_function == 'avg':
        aggfunc = 'mean'
    else:
        aggfunc = self.agg_function

    pivoted_df = df.pivot_table(
        index=self.index,
        columns=self.column_to_pivot,
        values=self.value_column,
        aggfunc=aggfunc,
    )
    pivoted_df = pivoted_df.reset_index()
    # Remove the leftover pivot label from the column axis.
    pivoted_df.columns.name = None
    return pivoted_df
def curr_test_ts_states(daily: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape the daily testing data (as served by https://covidtracking.com/api/)
    into a date x state table of ``totalTestResults``.
    """
    return daily.pivot_table(
        values="totalTestResults",
        index="date",
        columns="state",
    )
class PivotTable(object):
    """Timing benchmark for ``DataFrame.pivot_table``."""

    def setup(self):
        """Create the 100000-row frame the benchmark pivots."""
        size = 100000
        key1_labels = np.array(['A', 'B', 'C'], dtype='O')
        key23_labels = np.array(['one', 'two'], dtype='O')
        picks1 = np.random.randint(0, 3, size=size)
        picks2 = np.random.randint(0, 2, size=size)

        # key2 and key3 take the same positions from the same labels.
        frame_data = dict(
            key1=key1_labels.take(picks1),
            key2=key23_labels.take(picks2),
            key3=key23_labels.take(picks2),
            value1=np.random.randn(size),
            value2=np.random.randn(size),
            value3=np.random.randn(size),
        )
        self.df = DataFrame(frame_data)

    def time_pivot_table(self):
        """Pivot with one index key and two column keys."""
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
def get_pivoted_data(data, population_data):
    """Flatten the hierarchical ``data`` into a wide table, one column per category.

    For every area type in ``data`` that is listed in ``CATEGORY_LABELS`` the
    categorical rows are extracted with ``extract_category_data``; all rows
    are combined, pivoted on ``category`` (taking the maximum ``value`` per
    cell) and sorted by date (newest first) and area name.

    Returns a frame whose first four columns are areaType, date, areaName
    and areaCode, followed by one column per category.
    """
    # Local import: this block needs ``concat`` in addition to the names
    # the module already pulls from pandas.
    from pandas import concat

    columns = ["value", "date", "areaCode", "areaType", "areaName", "category"]

    # Because of the hierarchical nature of the original data, there is
    # no easy way to automate this process using a generic solution
    # without prolonging the execution time. The iterative method appears
    # to produce the optimal time.
    frames = []
    for area_type in data:
        if area_type not in CATEGORY_LABELS:
            continue

        logging.info(f"\t\tArea type: {area_type}")
        frames.append(
            extract_category_data(data, columns, area_type, population_data)
        )

    # Combine once at the end: DataFrame.append is gone from current pandas
    # and re-copied the accumulated frame on every iteration.
    if frames:
        dt_final = concat(frames, ignore_index=True)
    else:
        dt_final = DataFrame(columns=columns)
    dt_final = dt_final.loc[:, columns]
    logging.info(">> Data was processed and converted into a categorical table")

    # Convert date strings to timestamp objects (needed for sorting).
    dt_final[DATE_COLUMN] = to_datetime(dt_final[DATE_COLUMN])
    logging.info(">> Dates were converted to datetime object")

    dt_pivot = dt_final.pivot_table(
        values='value',
        index=["areaType", "date", "areaName", "areaCode"],
        columns=['category'],
        aggfunc=lambda x: x.max()
    )
    logging.info(">> Pivot table created")

    # "date" and "areaName" are index levels at this point.
    dt_pivot.sort_values(
        ["date", "areaName"],
        ascending=[False, True],
        inplace=True
    )
    dt_pivot.reset_index(inplace=True)
    logging.info(">> Data table was sorted by date and areaName")

    # Change column names.
    dt_pivot.columns = [
        "areaType",
        "date",
        "areaName",
        "areaCode",
        *dt_pivot.columns[4:]
    ]
    logging.info(">> New column names were set")

    return dt_pivot
def make_pivot_table(data: pd.DataFrame) -> pd.DataFrame:
    """Average the statistics per parameter combination, with an 'Avg' margin.

    The value columns come from ``get_names(STATISTICS)`` and the index from
    ``get_names(PARAMETERS)``; the result's columns follow the order of the
    statistics names.
    """
    values = get_names(STATISTICS)
    index = get_names(PARAMETERS)
    report = data.pivot_table(
        values=values,
        index=index,
        aggfunc=np.mean,
        margins=True,
        margins_name='Avg',
    )
    return report.reindex(columns=values)
def plot_innings_runs_histogram():
    """Show a stacked bar chart of total runs per batting team, split by inning.

    The data is read from ``data/ipl_dataset.csv``.
    """
    ipl_df = pd.read_csv('data/ipl_dataset.csv', index_col=None)

    scoring = ipl_df[['batting_team', 'inning', 'runs']]
    totals = scoring.groupby(['batting_team', 'inning'])['runs'].sum()

    piv_tab = DataFrame(totals).pivot_table(values='runs',
                                            index='batting_team',
                                            columns='inning')
    piv_tab.plot(kind='bar', stacked=True)
    plt.show()
def pivot_categories(df_extract_categories: pd.DataFrame) -> pd.DataFrame:
    """
    Build a business_id x category table of occurrence counts.

    :param df_extract_categories: DataFrame with categories from function extract_categories()
    :return: DataFrame holding, per business_id and category, the number of
        matching input rows (1 when the pair occurs once) and 0 when the
        business_id does not have the category
    """
    category_counts = df_extract_categories.pivot_table(
        index='business_id',
        columns='category',
        aggfunc='size',
        fill_value=0,
    )
    return category_counts
def plot_innings_runs_histogram():
    """Show a stacked bar chart of total runs per batting team, split by inning.

    Reads the module-level ``ipl_df`` frame; teams are plotted in sorted order.
    """
    scoring = ipl_df[['inning', 'runs', 'batting_team']]
    totals = scoring.groupby(['batting_team', 'inning'])['runs'].sum()

    table = DataFrame(totals).pivot_table(values='runs',
                                          index='batting_team',
                                          columns='inning')
    table.sort_index().plot(kind='bar', stacked=True)
    plt.show()
def _get_lof_features(self, ratings: pd.DataFrame):
    """Compute a Local Outlier Factor score for every user.

    ``ratings`` is pivoted into a user x item matrix (missing ratings become
    0) on which a 30-neighbour, euclidean ``LocalOutlierFactor`` is fitted.
    Returns a frame with the columns ``LOF`` (the fitted
    ``negative_outlier_factor_``) and the user id column.
    """
    pivoted = ratings.pivot_table(values=self.RATING_COL,
                                  index=self.USER_ID_COL,
                                  columns=self.ITEM_ID_COL)
    rating_mx = pivoted.fillna(0)

    detector = LocalOutlierFactor(n_neighbors=30, metric='euclidean')
    detector.fit(rating_mx)

    user_lof = pd.DataFrame({'LOF': detector.negative_outlier_factor_})
    # Scores come back in the row order of the matrix.
    user_lof[self.USER_ID_COL] = rating_mx.index
    return user_lof
def test_crosstab_pass_values(self):
    """``crosstab`` with values and aggfunc equals the matching pivot_table."""
    a = np.random.randint(0, 7, size=100)
    b = np.random.randint(0, 3, size=100)
    c = np.random.randint(0, 5, size=100)
    values = np.random.randn(100)

    frame = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})
    expected = frame.pivot_table(
        "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum
    )

    table = crosstab(
        [a, b],
        c,
        values,
        aggfunc=np.sum,
        rownames=["foo", "bar"],
        colnames=["baz"],
    )

    tm.assert_frame_equal(table, expected)
def test_pivot_integer_columns(self):
    """Pivoting on integer column labels matches pivoting on their strings."""
    # caused by upstream bug in unstack
    first_day = datetime.date.min
    days = [first_day + datetime.timedelta(offset) for offset in range(20)]
    records = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                           days, [1.0]))

    int_named = DataFrame(records)
    str_named = int_named.rename(columns=str)

    table = int_named.pivot_table(values=4, index=[0, 1, 3], columns=[2])
    table2 = str_named.pivot_table(values='4', index=['0', '1', '3'],
                                   columns=['2'])

    tm.assert_frame_equal(table, table2, check_names=False)
def test_pivot_no_level_overlap(self):
    """The pivot equals groupby-mean unstacked, minus all-NaN columns."""
    # GH #1181
    data = DataFrame({
        'a': (['a'] * 4 + ['b'] * 4) * 2,
        'b': ([0] * 4 + [1] * 4) * 2,
        'c': (['foo'] * 4 + ['bar'] * 4) * 2,
        'value': np.random.randn(16),
    })

    table = data.pivot_table('value', index='a', columns=['b', 'c'])

    means = data.groupby(['a', 'b', 'c'])['value'].mean()
    unstacked = means.unstack('b').unstack('c')
    expected = unstacked.dropna(axis=1, how='all')
    tm.assert_frame_equal(table, expected)
def test_pivot_integer_columns(self):
    """Pivoting on integer column labels matches pivoting on their strings."""
    # caused by upstream bug in unstack
    first_day = date.min
    days = [first_day + timedelta(offset) for offset in range(20)]
    records = list(
        product(["foo", "bar"], ["A", "B", "C"], ["x1", "x2"], days, [1.0])
    )

    int_named = DataFrame(records)
    str_named = int_named.rename(columns=str)

    table = int_named.pivot_table(values=4, index=[0, 1, 3], columns=[2])
    table2 = str_named.pivot_table(
        values="4", index=["0", "1", "3"], columns=["2"]
    )

    tm.assert_frame_equal(table, table2, check_names=False)
def test_crosstab_pass_values(self):
    """``crosstab`` with values and aggfunc equals the matching pivot_table."""
    a = np.random.randint(0, 7, size=100)
    b = np.random.randint(0, 3, size=100)
    c = np.random.randint(0, 5, size=100)
    values = np.random.randn(100)

    long_form = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values})
    expected = long_form.pivot_table('values', index=['foo', 'bar'],
                                     columns='baz', aggfunc=np.sum)

    table = crosstab([a, b], c, values, aggfunc=np.sum,
                     rownames=['foo', 'bar'], colnames=['baz'])

    tm.assert_frame_equal(table, expected)
class PivotTable:
    """Timing benchmarks for ``DataFrame.pivot_table``."""

    def setup(self):
        """Create a large object-keyed frame and a small categorical one."""
        n_rows = 100000
        letters = np.array(['A', 'B', 'C'], dtype='O')
        numbers = np.array(['one', 'two'], dtype='O')
        letter_pos = np.random.randint(0, 3, size=n_rows)
        number_pos = np.random.randint(0, 2, size=n_rows)

        # key2 and key3 are built from the same positions.
        self.df = DataFrame({
            'key1': letters.take(letter_pos),
            'key2': numbers.take(number_pos),
            'key3': numbers.take(number_pos),
            'value1': np.random.randn(n_rows),
            'value2': np.random.randn(n_rows),
            'value3': np.random.randn(n_rows),
        })

        small = DataFrame({
            'col1': list('abcde'),
            'col2': list('fghij'),
            'col3': [1, 2, 3, 4, 5],
        })
        small.col1 = small.col1.astype('category')
        small.col2 = small.col2.astype('category')
        self.df2 = small

    def time_pivot_table(self):
        """Default aggregation, one index key and two column keys."""
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])

    def time_pivot_table_agg(self):
        """Two aggregation functions at once."""
        self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                            aggfunc=['sum', 'mean'])

    def time_pivot_table_margins(self):
        """Default aggregation with margins."""
        self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                            margins=True)

    def time_pivot_table_categorical(self):
        """Categorical keys, summed and zero-filled."""
        self.df2.pivot_table(index='col1', values='col3', columns='col2',
                             aggfunc=np.sum, fill_value=0)

    def time_pivot_table_categorical_observed(self):
        """Categorical keys with ``observed=True``."""
        self.df2.pivot_table(index='col1', values='col3', columns='col2',
                             aggfunc=np.sum, fill_value=0, observed=True)
def test_pivot_columns_lexsorted(self):
    """The columns of a three-level pivot come out monotonically sorted."""
    import datetime

    import numpy as np
    import pandas

    n = 10000

    dtype = np.dtype(
        [
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ]
    )

    products = np.array(
        [
            ("SP500", "ADBE"),
            ("SP500", "NVDA"),
            ("SP500", "ORCL"),
            ("NDQ100", "AAPL"),
            ("NDQ100", "MSFT"),
            ("NDQ100", "GOOG"),
            ("FTSE", "DGE.L"),
            ("FTSE", "TSCO.L"),
            ("FTSE", "GSK.L"),
        ],
        dtype=[("Index", object), ("Symbol", object)],
    )

    # Random trades: a product and a date per row, plus a price.
    items = np.empty(n, dtype=dtype)
    iproduct = np.random.randint(0, len(products), n)
    items["Index"] = products["Index"][iproduct]
    items["Symbol"] = products["Symbol"][iproduct]
    dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))
    dates = dr[np.random.randint(0, len(dr), n)]
    items["Year"] = dates.year
    items["Month"] = dates.month
    items["Day"] = dates.day
    items["Price"] = np.random.lognormal(4.0, 2.0, n)

    df = DataFrame(items)

    # ``index``/``columns`` are the supported spellings of the former
    # ``rows``/``cols`` keywords.
    pivoted = df.pivot_table(
        "Price",
        index=["Month", "Day"],
        columns=["Index", "Symbol", "Year"],
        aggfunc="mean",
    )

    self.assertTrue(pivoted.columns.is_monotonic_increasing)
def test_pivot_columns_lexsorted(self):
    """The columns of a three-level pivot come out monotonically sorted."""
    import datetime

    import numpy as np
    import pandas

    n = 10000

    dtype = np.dtype([
        ("Index", object),
        ("Symbol", object),
        ("Year", int),
        ("Month", int),
        ("Day", int),
        ("Quantity", int),
        ("Price", float),
    ])

    products = np.array([
        ('SP500', 'ADBE'),
        ('SP500', 'NVDA'),
        ('SP500', 'ORCL'),
        ('NDQ100', 'AAPL'),
        ('NDQ100', 'MSFT'),
        ('NDQ100', 'GOOG'),
        ('FTSE', 'DGE.L'),
        ('FTSE', 'TSCO.L'),
        ('FTSE', 'GSK.L'),
    ], dtype=[('Index', object), ('Symbol', object)])

    # Random trades: a product and a date per row, plus a price.
    items = np.empty(n, dtype=dtype)
    iproduct = np.random.randint(0, len(products), n)
    items['Index'] = products['Index'][iproduct]
    items['Symbol'] = products['Symbol'][iproduct]
    dr = pandas.date_range(datetime.date(2000, 1, 1),
                           datetime.date(2010, 12, 31))
    dates = dr[np.random.randint(0, len(dr), n)]
    items['Year'] = dates.year
    items['Month'] = dates.month
    items['Day'] = dates.day
    items['Price'] = np.random.lognormal(4.0, 2.0, n)

    df = DataFrame(items)

    # ``index``/``columns`` are the supported spellings of the former
    # ``rows``/``cols`` keywords.
    pivoted = df.pivot_table('Price',
                             index=['Month', 'Day'],
                             columns=['Index', 'Symbol', 'Year'],
                             aggfunc='mean')

    self.assertTrue(pivoted.columns.is_monotonic_increasing)
def test_pivot_no_level_overlap(self):
    """The pivot equals groupby-mean unstacked, minus all-NaN columns."""
    # GH #1181
    data = DataFrame(
        {
            "a": (["a"] * 4 + ["b"] * 4) * 2,
            "b": ([0] * 4 + [1] * 4) * 2,
            "c": (["foo"] * 4 + ["bar"] * 4) * 2,
            "value": np.random.randn(16),
        }
    )

    table = data.pivot_table("value", index="a", columns=["b", "c"])

    means = data.groupby(["a", "b", "c"])["value"].mean()
    unstacked = means.unstack("b").unstack("c")
    expected = unstacked.dropna(axis=1, how="all")
    tm.assert_frame_equal(table, expected)
def test_drop_multiindex_not_lexsorted(self):
    """Dropping a column from a non-lexsorted MultiIndex frame (GH 11640).

    Two frames with the same column labels are built, one directly and one
    through ``pivot_table`` + ``reset_index``.  The test asserts the first
    has lexsorted columns and the second does not, that the frames compare
    equal, and that ``drop`` on the second gives the same result as on the
    first while emitting a ``PerformanceWarning``.
    """
    # GH 11640

    # Frame constructed directly from an explicit MultiIndex.
    sorted_columns = MultiIndex.from_tuples(
        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
    )
    sorted_frame = DataFrame([[1, 3, 4]], columns=sorted_columns)
    self.assertTrue(sorted_frame.columns.is_lexsorted())

    # Same labels, but reached via pivot_table followed by reset_index.
    raw = DataFrame(
        columns=["a", "b", "c", "d"],
        data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]],
    )
    pivoted = raw.pivot_table(index="a", columns=["b", "c"], values="d")
    unsorted_frame = pivoted.reset_index()
    self.assertFalse(unsorted_frame.columns.is_lexsorted())

    # The two frames are expected to compare equal.
    tm.assert_frame_equal(sorted_frame, unsorted_frame)

    expected = sorted_frame.drop("a", axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = unsorted_frame.drop("a", axis=1)

    tm.assert_frame_equal(result, expected)
4.5: [9, 20, 31, 42, 53, 64, 75], 5.0: [10, 21, 32, 43, 54, 65, 76]}
grouped_MaxResponse_f.groups.keys() -> dict_keys([0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 3.5, 1.5, 4.5, 2.5])
grouped_MaxResponse_f.groups.values() ->

# 列の値をもとに演算を施し、その演算結果を列として追加する -> apply を用いる
# (To add a column computed from the values of an existing column, use apply.)
AAFremoved["p_peak_appeared_repaired"] = AAFremoved["p_remark"].apply(lambda data: isinstance(data, str)) | AAFremoved["p_peak_appeared"]
AAFremoved["Max Response (%)"] = AAFremoved["p_max"].apply(lambda F: F if F>0 else 0)

# ピボットテーブル (pivot table)
data.pivot_table()

# クロス集計 (cross tabulation) -- crosstab is a top-level pandas function, not a DataFrame method
pd.crosstab()

# 縦に連結 (concatenate vertically)
pd.concat([A,B])
A.append(B)

# 横に連結 (concatenate horizontally)
pd.concat([A, B], axis=1)
pd.merge(left, right, left_index=True, right_index=True)
A.join(B)

# 結合 (join / merge on a key)
pd.merge(left, right, on='key', how="outer")
class TestPivotTable(tm.TestCase):
    """Tests for ``pivot_table`` / ``DataFrame.pivot_table`` and ``DataFrame.pivot``."""

    _multiprocess_can_split_ = True

    def setUp(self):
        """Create the shared 11-row fixture.

        Columns A, B and C hold string labels; D, E and F hold random
        normal floats.
        """
        self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                     'bar', 'bar', 'bar', 'bar',
                                     'foo', 'foo', 'foo'],
                               'B': ['one', 'one', 'one', 'two',
                                     'one', 'one', 'one', 'two',
                                     'two', 'two', 'one'],
                               'C': ['dull', 'dull', 'shiny', 'dull',
                                     'dull', 'shiny', 'shiny', 'dull',
                                     'shiny', 'shiny', 'shiny'],
                               'D': np.random.randn(11),
                               'E': np.random.randn(11),
                               'F': np.random.randn(11)})

    def test_pivot_table(self):
        """The function and method forms agree and match groupby-mean-unstack."""
        index = ['A', 'B']
        columns = 'C'
        table = pivot_table(self.data, values='D', index=index, columns=columns)

        table2 = self.data.pivot_table(values='D', index=index, columns=columns)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', index=index)

        if len(index) > 1:
            self.assertEqual(table.index.names, tuple(index))
        else:
            self.assertEqual(table.index.name, index[0])

        # ``columns`` is the string 'C', so len(columns) is 1 and the else
        # branch runs; columns[0] is then 'C' itself.
        if len(columns) > 1:
            self.assertEqual(table.columns.names, columns)
        else:
            self.assertEqual(table.columns.name, columns[0])

        expected = self.data.groupby(index + [columns])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_table_warnings(self):
        """``rows``/``cols`` must emit a FutureWarning; ``index``/``columns`` must not."""
        index = ['A', 'B']
        columns = 'C'
        with tm.assert_produces_warning(FutureWarning):
            table = pivot_table(self.data, values='D', rows=index, cols=columns)

        with tm.assert_produces_warning(False):
            table2 = pivot_table(self.data, values='D', index=index, columns=columns)

        # Both spellings are expected to produce the same frame.
        tm.assert_frame_equal(table, table2)

    def test_pivot_table_nocols(self):
        """Pivoting by ``columns`` only equals the transposed ``index``-only pivot."""
        df = DataFrame({'rows': ['a', 'b', 'c'],
                        'cols': ['x', 'y', 'z'],
                        'values': [1,2,3]})
        rs = df.pivot_table(columns='cols', aggfunc=np.sum)
        xp = df.pivot_table(index='cols', aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        # Same comparison with a dict-valued aggfunc.
        rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'})
        xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T
        tm.assert_frame_equal(rs, xp)

    def test_pivot_table_dropna(self):
        """With ``dropna=False`` the customer/product axis holds all 12 combinations."""
        df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
                        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
                        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
                        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
                        'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}})
        pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False)
        pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False)

        # Full 3 x 4 product of customers and products.
        m = MultiIndex.from_tuples([(u('A'), u('a')),
                                    (u('A'), u('b')),
                                    (u('A'), u('c')),
                                    (u('A'), u('d')),
                                    (u('B'), u('a')),
                                    (u('B'), u('b')),
                                    (u('B'), u('c')),
                                    (u('B'), u('d')),
                                    (u('C'), u('a')),
                                    (u('C'), u('b')),
                                    (u('C'), u('c')),
                                    (u('C'), u('d'))])

        assert_equal(pv_col.columns.values, m.values)
        assert_equal(pv_ind.index.values, m.values)

    def test_pass_array(self):
        """Passing Series objects as keys gives the same result as column names."""
        result = self.data.pivot_table('D', index=self.data.A, columns=self.data.C)
        expected = self.data.pivot_table('D', index='A', columns='C')
        tm.assert_frame_equal(result, expected)

    def test_pass_function(self):
        """A callable ``index`` gives the same result as the equivalent array key."""
        result = self.data.pivot_table('D', index=lambda x: x // 5,
                                       columns=self.data.C)
        expected = self.data.pivot_table('D', index=self.data.index // 5,
                                         columns='C')
        tm.assert_frame_equal(result, expected)

    def test_pivot_table_multiple(self):
        """Without ``values`` the pivot matches groupby-mean-unstack on the whole frame."""
        index = ['A', 'B']
        columns = 'C'
        table = pivot_table(self.data, index=index, columns=columns)
        expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_dtypes(self):
        """Check the dtype counts of the result for int/sum and float/mean inputs."""

        # can convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'int64')

        z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64 = 2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'float64')

        z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64 = 2))
        tm.assert_series_equal(result, expected)

    def test_pivot_multi_values(self):
        """Selecting ``values=['D', 'E']`` equals pivoting the frame with 'F' dropped."""
        result = pivot_table(self.data, values=['D', 'E'],
                             index='A', columns=['B', 'C'], fill_value=0)
        expected = pivot_table(self.data.drop(['F'], axis=1),
                               index='A', columns=['B', 'C'], fill_value=0)
        tm.assert_frame_equal(result, expected)

    def test_pivot_multi_functions(self):
        """A list of aggfuncs equals the per-function results concatenated under keys."""
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     index=['A', 'B'], columns='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     index=['A', 'B'], columns='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_pivot_index_with_nan(self):
        """``DataFrame.pivot`` with a NaN among the index values (GH 3588)."""
        # GH 3588
        nan = np.nan
        df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]})
        result = df.pivot('a','b','c')
        expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan],
                              [nan,nan,nan,nan],[nan,nan,15,20]],
                             index = Index(['R1','R2',nan,'R4'],name='a'),
                             columns = Index(['C1','C2','C3','C4'],name='b'))
        tm.assert_frame_equal(result, expected)

    def test_pivot_with_tz(self):
        """``DataFrame.pivot`` on timezone-aware keys keeps the timezones (GH 5878)."""
        # GH 5878
        df = DataFrame({'dt1': [datetime.datetime(2013, 1, 1, 9, 0),
                                datetime.datetime(2013, 1, 2, 9, 0),
                                datetime.datetime(2013, 1, 1, 9, 0),
                                datetime.datetime(2013, 1, 2, 9, 0)],
                        'dt2': [datetime.datetime(2014, 1, 1, 9, 0),
                                datetime.datetime(2014, 1, 1, 9, 0),
                                datetime.datetime(2014, 1, 2, 9, 0),
                                datetime.datetime(2014, 1, 2, 9, 0)],
                        'data1': np.arange(4,dtype='int64'),
                        'data2': np.arange(4,dtype='int64')})

        # Localise the two key columns to different timezones.
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        exp_col1 = Index(['data1', 'data1', 'data2', 'data2'])
        exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'] * 2,
                                    name='dt2', tz='Asia/Tokyo')
        exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
                             index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'],
                                                    name='dt1', tz='US/Pacific'),
                             columns=exp_col)

        # No ``values``: both data columns are pivoted.
        pv = df.pivot(index='dt1', columns='dt2')
        tm.assert_frame_equal(pv, expected)

        expected = DataFrame([[0, 2], [1, 3]],
                             index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'],
                                                    name='dt1', tz='US/Pacific'),
                             columns=pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'],
                                                      name='dt2', tz='Asia/Tokyo'))

        # Single ``values`` column.
        pv = df.pivot(index='dt1', columns='dt2', values='data1')
        tm.assert_frame_equal(pv, expected)

    def test_margins(self):
        """With ``margins=True`` the 'All' row, column and corner equal the plain means."""
        def _check_output(res, col, index=['A', 'B'], columns=['C']):
            # Column margin: every row of 'All' except the last one.
            cmarg = res['All'][:-1]
            exp = self.data.groupby(index)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            # Row margin: the ('All', '') row except its last entry.
            res = res.sortlevel()
            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(columns)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            # Grand margin: the 'All' / ('All', '') corner cell.
            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        # (doubles every column label, e.g. 'A' -> 'AA')
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
                                       aggfunc=np.mean)
        tm.assert_isinstance(rtable, Series)

        # NOTE(review): this loop reads ``table`` (the index-only pivot
        # above), not ``rtable``.
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

    def test_pivot_integer_columns(self):
        """Integer column labels pivot the same as their string-renamed equivalents."""
        # caused by upstream bug in unstack

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        # Same data with the integer column labels converted to strings.
        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)

    def test_pivot_no_level_overlap(self):
        """Column keys with non-overlapping combinations match the unstacked group means."""
        # GH #1181

        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
                          'value': np.random.randn(16)})

        table = data.pivot_table('value', index='a', columns=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)

    def test_pivot_columns_lexsorted(self):
        """The columns of a large three-level pivot are monotonic."""

        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ])

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ], dtype=[('Index', object), ('Symbol', object)])
        # 'Quantity' is never assigned below; np.empty leaves it uninitialised.
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        dr = pd.date_range(datetime.date(2000, 1, 1),
                           datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price', index=['Month', 'Day'],
                                 columns=['Index', 'Symbol', 'Year'],
                                 aggfunc='mean')

        self.assertTrue(pivoted.columns.is_monotonic)

    def test_pivot_complex_aggfunc(self):
        """A dict of per-column function lists matches groupby-agg-unstack."""
        f = {'D': ['std'], 'E': ['sum']}
        expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
        result = self.data.pivot_table(index='A', columns='B', aggfunc=f)

        tm.assert_frame_equal(result, expected)

    def test_margins_no_values_no_cols(self):
        """With ``len`` and margins, the last entry equals the sum of the others."""
        # Regression test on pivot table: no values or cols passed.
        result = self.data[['A', 'B']].pivot_table(index=['A', 'B'], aggfunc=len, margins=True)
        result_list = result.tolist()
        self.assertEqual(sum(result_list[:-1]), result_list[-1])

    def test_margins_no_values_two_rows(self):
        """Check the 'All' counts for a two-key index and one column key."""
        # Regression test on pivot table: no values passed but rows are a multi-index
        result = self.data[['A', 'B', 'C']].pivot_table(index=['A', 'B'], columns='C', aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])

    def test_margins_no_values_one_row_one_col(self):
        """Check the 'All' counts for a single index key and a single column key."""
        # Regression test on pivot table: no values passed but row and col defined
        result = self.data[['A', 'B']].pivot_table(index='A', columns='B', aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0])

    def test_margins_no_values_two_row_two_cols(self):
        """Check the 'All' counts when both index and columns have two keys."""
        # Regression test on pivot table: no values passed but rows and cols are multi-indexed
        # Column 'D' is overwritten with 11 distinct string labels.
        self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
        result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])

    def test_pivot_timegrouper(self):
        """``Grouper`` objects work as ``index``/``columns`` keys.

        Covers a Grouper on the frame's DatetimeIndex, a Grouper addressed
        by ``key`` and by ``level`` (including the errors expected for an
        unknown key or level), and two Groupers used together.
        """
        df = DataFrame({
            'Branch' : 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1),
                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
                      datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date')

        # Annual grouping on the DatetimeIndex.
        expected = DataFrame(np.array([10, 18, 3],dtype='int64').reshape(1, 3),
                             index=[datetime.datetime(2013, 12, 31)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result,expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result,expected.T)

        # Six-month grouping on the DatetimeIndex.
        expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
                             index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # passing the name
        df = df.reset_index()
        result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # An unknown ``key`` is expected to raise KeyError.
        self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'),
                          columns='Buyer', values='Quantity', aggfunc=np.sum))
        self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer',
                          columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum))

        # passing the level
        df = df.set_index('Date')
        result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # An unknown ``level`` is expected to raise ValueError.
        self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'),
                          columns='Buyer', values='Quantity', aggfunc=np.sum))
        self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer',
                          columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum))

        # double grouper
        df = DataFrame({
            'Branch' : 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1,3,5,1,8,1,9,3],
            'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5),
                      datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0),
                      datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0),
                      datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)],
            'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5),
                        datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0),
                        datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0),
                        datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]})

        result = pivot_table(df, index=Grouper(freq='M', key='Date'),
                             columns=Grouper(freq='M', key='PayDay'),
                             values='Quantity', aggfunc=np.sum)
        expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9,
                                       np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4),
                             index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
                                    datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)],
                             columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
                                      datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)])
        expected.index.name = 'Date'
        expected.columns.name = 'PayDay'

        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=Grouper(freq='M', key='PayDay'),
                             columns=Grouper(freq='M', key='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # Two Groupers on one axis, with 'Branch' on the other.
        tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)),
                  (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)),
                  (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),]
        idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
        expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan,
                                       9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2),
                             index=idx, columns=['A', 'B'])
        expected.columns.name = 'Branch'

        result = pivot_table(df, index=[Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')], columns=['Branch'],
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')],
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

    def test_pivot_datetime_tz(self):
        """``pivot_table`` with timezone-aware datetimes as index and column keys."""
        dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
                  '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00',
                  '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'dt1': dates1, 'dt2': dates2,
                        'value1': range(6), 'value2': [1, 2] * 3})
        # Convert the date strings to tz-aware Timestamps in two timezones.
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1')
        exp_col1 = Index(['value1', 'value1'])
        exp_col2 = Index(['a', 'b'], name='label')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]],
                             index=exp_idx, columns=exp_col)
        result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1'])
        tm.assert_frame_equal(result, expected)

        # Two aggregation functions over two value columns, with tz-aware columns.
        exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean'])
        exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
        exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4,
                                    tz='Asia/Tokyo', name='dt2')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1],
                                       [2, 5, 1, 2, 2, 5, 1, 2]]), index=exp_idx, columns=exp_col)

        result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'],
                             aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
def test_margins(self):
    """Verify the 'All' margins produced by ``pivot_table(margins=True)``.

    Checks the column margin, row margin and grand total against plain
    means of ``self.data``, then checks that a dict-valued ``aggfunc``
    combined with margins matches the per-column single-aggfunc pivots
    (issue #8349).
    """

    def _verify_margins(pivoted, value_col, row_keys=['A', 'B'], col_keys=['C']):
        # Column margin: the 'All' column without its final entry.
        col_margin = pivoted['All'][:-1]
        tm.assert_series_equal(col_margin,
                               self.data.groupby(row_keys)[value_col].mean())

        # Row margin: the ('All', '') row without its final entry.
        pivoted = pivoted.sortlevel()
        row_margin = pivoted.xs(('All', ''))[:-1]
        tm.assert_series_equal(row_margin,
                               self.data.groupby(col_keys)[value_col].mean())

        # Grand margin: the corner cell.
        grand_margin = pivoted['All']['All', '']
        self.assertEqual(grand_margin, self.data[value_col].mean())

    # column specified
    with_values = self.data.pivot_table('D', index=['A', 'B'], columns='C',
                                        margins=True, aggfunc=np.mean)
    _verify_margins(with_values, 'D')

    # no column specified
    all_values = self.data.pivot_table(index=['A', 'B'], columns='C',
                                       margins=True, aggfunc=np.mean)
    for value_col in all_values.columns.levels[0]:
        _verify_margins(all_values[value_col], value_col)

    # no col

    # to help with a buglet
    self.data.columns = [label * 2 for label in self.data.columns]
    table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                  aggfunc=np.mean)
    for value_col in table.columns:
        self.assertEqual(table[value_col]['All', ''],
                         self.data[value_col].mean())

    # this is OK
    table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                  aggfunc='mean')

    # no rows
    rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
                                   aggfunc=np.mean)
    tm.assert_isinstance(rtable, Series)
    for item in ['DD', 'EE', 'FF']:
        self.assertEqual(table[item]['All', ''], self.data[item].mean())

    # issue number #8349: pivot_table with margins and dictionary aggfunc
    fields = ('JOB', 'NAME', 'YEAR', 'MONTH', 'DAYS', 'SALARY')
    records = [
        ('Worker', 'Bob', 2013, 12, 3, 17),
        ('Employ', 'Mary', 2013, 12, 5, 23),
        ('Worker', 'Bob', 2014, 1, 10, 100),
        ('Worker', 'Bob', 2014, 1, 11, 110),
        ('Employ', 'Mary', 2014, 1, 15, 200),
        ('Worker', 'Bob', 2014, 2, 8, 80),
        ('Employ', 'Mary', 2014, 2, 5, 190),
    ]
    df = DataFrame([dict(zip(fields, record)) for record in records])
    df = df.set_index(['JOB', 'NAME', 'YEAR', 'MONTH'], drop=False, append=False)

    row_keys = ['JOB', 'NAME']
    col_keys = ['YEAR', 'MONTH']
    combined = df.pivot_table(index=row_keys, columns=col_keys,
                              values=['DAYS', 'SALARY'],
                              aggfunc={'DAYS': 'mean', 'SALARY': 'sum'},
                              margins=True)

    # Each value column of the dict-aggfunc result must match the pivot
    # computed for that column alone.
    days_only = df.pivot_table(index=row_keys, columns=col_keys,
                               values=['DAYS'], aggfunc='mean', margins=True)
    tm.assert_frame_equal(combined['DAYS'], days_only['DAYS'])

    salary_only = df.pivot_table(index=row_keys, columns=col_keys,
                                 values=['SALARY'], aggfunc='sum', margins=True)
    tm.assert_frame_equal(combined['SALARY'], salary_only['SALARY'])
class TestPivotTable(unittest.TestCase): def setUp(self): self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo'], 'B' : ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one'], 'C' : ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], 'D' : np.random.randn(11), 'E' : np.random.randn(11), 'F' : np.random.randn(11)}) def test_pivot_table(self): rows = ['A', 'B'] cols= 'C' table = pivot_table(self.data, values='D', rows=rows, cols=cols) table2 = self.data.pivot_table(values='D', rows=rows, cols=cols) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values='D', rows=rows) if len(rows) > 1: self.assertEqual(table.index.names, rows) else: self.assertEqual(table.index.name, rows[0]) if len(cols) > 1: self.assertEqual(table.columns.names, cols) else: self.assertEqual(table.columns.name, cols[0]) expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pass_array(self): result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C) expected = self.data.pivot_table('D', rows='A', cols='C') tm.assert_frame_equal(result, expected) def test_pass_function(self): result = self.data.pivot_table('D', rows=lambda x: x // 5, cols=self.data.C) expected = self.data.pivot_table('D', rows=self.data.index // 5, cols='C') tm.assert_frame_equal(result, expected) def test_pivot_table_multiple(self): rows = ['A', 'B'] cols= 'C' table = pivot_table(self.data, rows=rows, cols=cols) expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_multi_values(self): result = pivot_table(self.data, values=['D', 'E'], rows='A', cols=['B', 'C'], fill_value=0) expected = pivot_table(self.data.drop(['F'], axis=1), rows='A', cols=['B', 'C'], fill_value=0) tm.assert_frame_equal(result, expected) def test_pivot_multi_functions(self): f = 
lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? f = lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func, margins=True) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) def test_margins(self): def _check_output(res, col, rows=['A', 'B'], cols=['C']): cmarg = res['All'][:-1] exp = self.data.groupby(rows)[col].mean() tm.assert_series_equal(cmarg, exp) rmarg = res.xs(('All', ''))[:-1] exp = self.data.groupby(cols)[col].mean() tm.assert_series_equal(rmarg, exp) gmarg = res['All']['All', ''] exp = self.data[col].mean() self.assertEqual(gmarg, exp) # column specified table = self.data.pivot_table('D', rows=['A', 'B'], cols='C', margins=True, aggfunc=np.mean) _check_output(table, 'D') # no column specified table = self.data.pivot_table(rows=['A', 'B'], cols='C', margins=True, aggfunc=np.mean) for valcol in table.columns.levels[0]: _check_output(table[valcol], valcol) # no col # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, aggfunc=np.mean) for valcol in table.columns: gmarg = table[valcol]['All', ''] self.assertEqual(gmarg, self.data[valcol].mean()) # this is OK table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, aggfunc='mean') # no rows rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True, aggfunc=np.mean) self.assert_(isinstance(rtable, Series)) for item in ['DD', 'EE', 'FF']: gmarg = table[item]['All', ''] self.assertEqual(gmarg, self.data[item].mean()) def test_pivot_integer_columns(self): # caused by upstream bug in unstack from pandas.util.compat import 
product import datetime import pandas d = datetime.date.min data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], [d + datetime.timedelta(i) for i in xrange(20)], [1.0])) df = pandas.DataFrame(data) table = df.pivot_table(values=4, rows=[0,1,3],cols=[2]) df2 = df.rename(columns=str) table2 = df2.pivot_table(values='4', rows=['0','1','3'], cols=['2']) tm.assert_frame_equal(table, table2) def test_pivot_no_level_overlap(self): # GH #1181 data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2, 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2, 'c': (['foo'] * 4 + ['bar'] * 4) * 2, 'value': np.random.randn(16)}) table = data.pivot_table('value', rows='a', cols=['b', 'c']) grouped = data.groupby(['a', 'b', 'c'])['value'].mean() expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all') tm.assert_frame_equal(table, expected) def test_pivot_columns_lexsorted(self): import datetime import numpy as np import pandas n = 10000 dtype = np.dtype([ ("Index", object), ("Symbol", object), ("Year", int), ("Month", int), ("Day", int), ("Quantity", int), ("Price", float), ]) products = np.array([ ('SP500', 'ADBE'), ('SP500', 'NVDA'), ('SP500', 'ORCL'), ('NDQ100', 'AAPL'), ('NDQ100', 'MSFT'), ('NDQ100', 'GOOG'), ('FTSE', 'DGE.L'), ('FTSE', 'TSCO.L'), ('FTSE', 'GSK.L'), ], dtype=[('Index', object), ('Symbol', object)]) items = np.empty(n, dtype=dtype) iproduct = np.random.randint(0, len(products), n) items['Index'] = products['Index'][iproduct] items['Symbol'] = products['Symbol'][iproduct] dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] items['Year'] = dates.year items['Month'] = dates.month items['Day'] = dates.day items['Price'] = np.random.lognormal(4.0, 2.0, n) df = DataFrame(items) pivoted = df.pivot_table('Price', rows=['Month', 'Day'], cols=['Index', 'Symbol', 'Year'], aggfunc='mean') self.assert_(pivoted.columns.is_monotonic)
class TestPivotTable(unittest.TestCase): def setUp(self): self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo'], 'B' : ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one'], 'C' : ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], 'D' : np.random.randn(11), 'E' : np.random.randn(11), 'F' : np.random.randn(11)}) def test_pivot_table(self): rows = ['A', 'B'] cols= 'C' table = pivot_table(self.data, values='D', rows=rows, cols=cols) table2 = self.data.pivot_table(values='D', rows=rows, cols=cols) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values='D', rows=rows) if len(rows) > 1: self.assertEqual(table.index.names, rows) else: self.assertEqual(table.index.name, rows[0]) if len(cols) > 1: self.assertEqual(table.columns.names, cols) else: self.assertEqual(table.columns.name, cols[0]) expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_table_multiple(self): rows = ['A', 'B'] cols= 'C' table = pivot_table(self.data, rows=rows, cols=cols) expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_multi_values(self): result = pivot_table(self.data, values=['D', 'E'], rows='A', cols=['B', 'C'], fill_value=0) expected = pivot_table(self.data.drop(['F'], axis=1), rows='A', cols=['B', 'C'], fill_value=0) tm.assert_frame_equal(result, expected) def test_pivot_multi_functions(self): f = lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? 
f = lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func, margins=True) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) def test_margins(self): def _check_output(res, col, rows=['A', 'B'], cols=['C']): cmarg = res['All'][:-1] exp = self.data.groupby(rows)[col].mean() tm.assert_series_equal(cmarg, exp) rmarg = res.xs(('All', ''))[:-1] exp = self.data.groupby(cols)[col].mean() tm.assert_series_equal(rmarg, exp) gmarg = res['All']['All', ''] exp = self.data[col].mean() self.assertEqual(gmarg, exp) # column specified table = self.data.pivot_table('D', rows=['A', 'B'], cols='C', margins=True, aggfunc=np.mean) _check_output(table, 'D') # no column specified table = self.data.pivot_table(rows=['A', 'B'], cols='C', margins=True, aggfunc=np.mean) for valcol in table.columns.levels[0]: _check_output(table[valcol], valcol) # no col # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, aggfunc=np.mean) for valcol in table.columns: gmarg = table[valcol]['All', ''] self.assertEqual(gmarg, self.data[valcol].mean()) # this is OK table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, aggfunc='mean') # no rows rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True, aggfunc=np.mean) self.assert_(isinstance(rtable, Series)) for item in ['DD', 'EE', 'FF']: gmarg = table[item]['All', ''] self.assertEqual(gmarg, self.data[item].mean()) def test_pivot_integer_columns(self): # caused by upstream bug in unstack from pandas.util.compat import product import datetime import pandas d = datetime.date.min data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], [d + datetime.timedelta(i) for i in xrange(20)], [1.0])) df = pandas.DataFrame(data) table = df.pivot_table(values=4, rows=[0,1,3],cols=[2]) df2 = 
df.rename(columns=str) table2 = df2.pivot_table(values='4', rows=['0','1','3'], cols=['2']) tm.assert_frame_equal(table, table2)
class TestPivotTable(unittest.TestCase):
    # Tests for ``pivot_table`` (function and DataFrame method) using the
    # legacy ``rows=`` / ``cols=`` keyword spelling.

    _multiprocess_can_split_ = True

    def setUp(self):
        # Fresh fixture per test: three string key columns (A, B, C) and
        # three random float value columns (D, E, F), eleven rows each.
        # Several tests mutate ``self.data`` in place, which is safe only
        # because it is rebuilt here before every test.
        self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                     'bar', 'bar', 'bar', 'bar',
                                     'foo', 'foo', 'foo'],
                               'B': ['one', 'one', 'one', 'two',
                                     'one', 'one', 'one', 'two',
                                     'two', 'two', 'one'],
                               'C': ['dull', 'dull', 'shiny', 'dull',
                                     'dull', 'shiny', 'shiny', 'dull',
                                     'shiny', 'shiny', 'shiny'],
                               'D': np.random.randn(11),
                               'E': np.random.randn(11),
                               'F': np.random.randn(11)})

    def test_pivot_table(self):
        # The module-level function and the DataFrame method must agree,
        # and both must match a manual groupby/mean/unstack.
        rows = ['A', 'B']
        cols = 'C'
        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

        table2 = self.data.pivot_table(values='D', rows=rows, cols=cols)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, tuple(rows))
        else:
            self.assertEqual(table.index.name, rows[0])

        # NOTE: ``cols`` is the string 'C' here, so ``len(cols)`` is the
        # string length (1) and ``cols[0]`` is its first character.
        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_table_nocols(self):
        # Pivoting on columns only must equal the transpose of pivoting the
        # same key on rows, for both a callable and a dict aggfunc.
        df = DataFrame({'rows': ['a', 'b', 'c'],
                        'cols': ['x', 'y', 'z'],
                        'values': [1, 2, 3]})
        rs = df.pivot_table(cols='cols', aggfunc=np.sum)
        xp = df.pivot_table(rows='cols', aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(cols='cols', aggfunc={'values': 'mean'})
        xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T
        tm.assert_frame_equal(rs, xp)

    def test_pivot_table_dropna(self):
        # With dropna=False the full customer x product cartesian product
        # must be present on the pivoted axis.
        df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
                        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
                        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
                        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
                        'quantity': {0: 2000000, 1: 500000,
                                     2: 1000000, 3: 1000000}})
        pv_col = df.pivot_table('quantity', 'month',
                                ['customer', 'product'], dropna=False)
        pv_ind = df.pivot_table('quantity', ['customer', 'product'],
                                'month', dropna=False)

        m = MultiIndex.from_tuples([(u('A'), u('a')),
                                    (u('A'), u('b')),
                                    (u('A'), u('c')),
                                    (u('A'), u('d')),
                                    (u('B'), u('a')),
                                    (u('B'), u('b')),
                                    (u('B'), u('c')),
                                    (u('B'), u('d')),
                                    (u('C'), u('a')),
                                    (u('C'), u('b')),
                                    (u('C'), u('c')),
                                    (u('C'), u('d'))])

        assert_equal(pv_col.columns.values, m.values)
        assert_equal(pv_ind.index.values, m.values)

    def test_pass_array(self):
        # Passing the key columns as Series must match passing their names.
        result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
        expected = self.data.pivot_table('D', rows='A', cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pass_function(self):
        # A callable row key must match the equivalent precomputed array.
        result = self.data.pivot_table('D', rows=lambda x: x // 5,
                                       cols=self.data.C)
        expected = self.data.pivot_table('D', rows=self.data.index // 5,
                                         cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pivot_table_multiple(self):
        # Without ``values`` every remaining column is aggregated.
        rows = ['A', 'B']
        cols = 'C'
        table = pivot_table(self.data, rows=rows, cols=cols)
        expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_dtypes(self):
        # can convert dtypes
        f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'],
                       'v': [1, 2, 3, 4],
                       'i': ['a', 'b', 'a', 'b']})
        self.assertTrue(f.dtypes['v'] == 'int64')

        z = pivot_table(f, values='v', rows=['a'], cols=['i'],
                        fill_value=0, aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64=2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'],
                       'v': [1.5, 2.5, 3.5, 4.5],
                       'i': ['a', 'b', 'a', 'b']})
        self.assertTrue(f.dtypes['v'] == 'float64')

        z = pivot_table(f, values='v', rows=['a'], cols=['i'],
                        fill_value=0, aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64=2))
        tm.assert_series_equal(result, expected)

    def test_pivot_multi_values(self):
        # Selecting values D and E must equal pivoting the frame with F
        # dropped.
        result = pivot_table(self.data, values=['D', 'E'],
                             rows='A', cols=['B', 'C'], fill_value=0)
        expected = pivot_table(self.data.drop(['F'], axis=1),
                               rows='A', cols=['B', 'C'], fill_value=0)
        tm.assert_frame_equal(result, expected)

    def test_pivot_multi_functions(self):
        # A list of aggfuncs must equal the per-function results
        # concatenated under the function names.
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_pivot_index_with_nan(self):
        # GH 3588
        nan = np.nan
        df = DataFrame({"a": ['R1', 'R2', nan, 'R4'],
                        'b': ["C1", "C2", "C3", "C4"],
                        "c": [10, 15, nan, 20]})
        result = df.pivot('a', 'b', 'c')
        expected = DataFrame([[nan, nan, nan, nan],
                              [nan, 10, nan, nan],
                              [nan, nan, nan, nan],
                              [nan, nan, 15, 20]],
                             index=Index(['R1', 'R2', nan, 'R4'], name='a'),
                             columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
        tm.assert_frame_equal(result, expected)

    def test_margins(self):
        def _check_output(res, col, rows=None, cols=None):
            # Compare the 'All' column, 'All' row and grand total of
            # ``res`` with plain groupby means of ``self.data[col]``.
            # None sentinels stand in for the default key lists so no
            # mutable object is shared between calls.
            if rows is None:
                rows = ['A', 'B']
            if cols is None:
                cols = ['C']

            cmarg = res['All'][:-1]
            exp = self.data.groupby(rows)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            # Sorts the caller's table in place before the cross-section.
            res.sortlevel(inplace=True)
            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(cols)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True,
                                       aggfunc=np.mean)
        tm.assert_isinstance(rtable, Series)
        # NOTE(review): the loop below indexes ``table`` (the rows-only
        # pivot built just above), not ``rtable`` -- confirm this is
        # intentional.
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack
        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)],
                            [1.0]))
        df = pandas.DataFrame(data)
        table = df.pivot_table(values=4, rows=[0, 1, 3], cols=[2])

        # Same pivot with the integer column labels converted to strings.
        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', rows=['0', '1', '3'], cols=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)

    def test_pivot_no_level_overlap(self):
        # GH #1181
        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
                          'value': np.random.randn(16)})

        table = data.pivot_table('value', rows='a', cols=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)

    def test_pivot_columns_lexsorted(self):
        # The pivoted column index must come back monotonic for a large
        # random record array.
        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ])

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ], dtype=[('Index', object), ('Symbol', object)])
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        dr = pandas.date_range(datetime.date(2000, 1, 1),
                               datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price', rows=['Month', 'Day'],
                                 cols=['Index', 'Symbol', 'Year'],
                                 aggfunc='mean')

        self.assertTrue(pivoted.columns.is_monotonic)

    def test_pivot_complex_aggfunc(self):
        # A dict mapping columns to lists of function names must match the
        # equivalent groupby/agg/unstack.
        f = {'D': ['std'], 'E': ['sum']}
        expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
        result = self.data.pivot_table(rows='A', cols='B', aggfunc=f)

        tm.assert_frame_equal(result, expected)

    def test_margins_no_values_no_cols(self):
        # Regression test on pivot table: no values or cols passed.
        result = self.data[['A', 'B']].pivot_table(rows=['A', 'B'],
                                                   aggfunc=len, margins=True)
        result_list = result.tolist()
        # The margin (last entry) must equal the sum of the other counts.
        self.assertEqual(sum(result_list[:-1]), result_list[-1])

    def test_margins_no_values_two_rows(self):
        # Regression test on pivot table: no values passed but rows are a
        # multi-index
        result = self.data[['A', 'B', 'C']].pivot_table(rows=['A', 'B'],
                                                        cols='C',
                                                        aggfunc=len,
                                                        margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])

    def test_margins_no_values_one_row_one_col(self):
        # Regression test on pivot table: no values passed but row and col
        # defined
        result = self.data[['A', 'B']].pivot_table(rows='A', cols='B',
                                                   aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0])

    def test_margins_no_values_two_row_two_cols(self):
        # Regression test on pivot table: no values passed but rows and cols
        # are multi-indexed
        # Overwrite D with eleven distinct labels so it can act as a key.
        self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f',
                          'g', 'h', 'i', 'j', 'k']
        result = self.data[['A', 'B', 'C', 'D']].pivot_table(
            rows=['A', 'B'], cols=['C', 'D'], aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, margins_name='All', dropna=True,
             normalize=False):
    """
    Cross-tabulate two (or more) factors.

    A frequency table of the factors is produced unless both an array of
    values and an aggregation function are supplied.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        If specified, requires `values` be specified as well
    margins : boolean, default False
        Add row/column margins (subtotals)
    margins_name : string, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.

        .. versionadded:: 0.21.0

    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    ValueError
        If exactly one of `values` and `aggfunc` is given.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or
    column names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data
    does not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame
    will be returned.

    Examples
    --------
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
    ...               "one", "two", "two", "two", "one"], dtype=object)
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
    ...              dtype=object)

    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    ... # doctest: +NORMALIZE_WHITESPACE
    b   one        two
    c   dull shiny dull shiny
    a
    bar    1     2    1     0
    foo    2     2    1     2

    Categories 'c' and 'f' below never occur in the data. Because `dropna`
    is True by default they are left out of the output; pass
    ``dropna=False`` to keep categories that have no data.

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)
    ... # doctest: +SKIP
    col_0  d  e
    row_0
    a      1  0
    b      0  1

    >>> crosstab(foo, bar, dropna=False)
    ... # doctest: +SKIP
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """
    index = com.maybe_make_list(index)
    columns = com.maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Only labels shared by every input survive into the working frame.
    shared_index = _get_objs_combined_axis(index + columns, intersect=True,
                                           sort=False)

    # Row factors first, then column factors; a later duplicate name wins.
    factors = {}
    for name, arr in zip(rownames, index):
        factors[name] = arr
    for name, arr in zip(colnames, columns):
        factors[name] = arr

    counting = values is None
    if counting and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")
    if not counting and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    from pandas import DataFrame
    frame = DataFrame(factors, index=shared_index)

    # '__dummy__' is the single column handed to pivot_table: a constant
    # counted with ``len`` for frequencies, otherwise the caller's values.
    if counting:
        frame['__dummy__'] = 0
        agg_options = {'aggfunc': len, 'fill_value': 0}
    else:
        frame['__dummy__'] = values
        agg_options = {'aggfunc': aggfunc}

    table = frame.pivot_table('__dummy__', index=rownames, columns=colnames,
                              margins=margins, margins_name=margins_name,
                              dropna=dropna, **agg_options)

    # Post-process
    if normalize is False:
        return table
    return _normalize(table, normalize=normalize, margins=margins,
                      margins_name=margins_name)
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True, normalize=False):
    """
    Cross-tabulate two (or more) factors.

    A frequency table of the factors is produced unless both an array of
    values and an aggregation function are supplied.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        If specified, requires `values` be specified as well
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    ValueError
        If exactly one of `values` and `aggfunc` is given.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or
    column names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data
    does not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame
    will be returned.

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2

    Categories 'c' and 'f' below never occur in the data, but they are
    still counted in the output.

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """
    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Row factors first, then column factors; a later duplicate name wins.
    factors = {}
    for name, arr in zip(rownames, index):
        factors[name] = arr
    for name, arr in zip(colnames, columns):
        factors[name] = arr

    counting = values is None
    if counting and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")
    if not counting and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    if not counting:
        # The values join the factor dict before the frame is built.
        factors['__dummy__'] = values
        frame = DataFrame(factors)
        table = frame.pivot_table('__dummy__', index=rownames,
                                  columns=colnames, aggfunc=aggfunc,
                                  margins=margins, dropna=dropna)
    else:
        # Frequency table: count a constant column; empty cells become 0
        # and the result is cast to 64-bit integers.
        frame = DataFrame(factors)
        frame['__dummy__'] = 0
        counts = frame.pivot_table('__dummy__', index=rownames,
                                   columns=colnames, aggfunc=len,
                                   margins=margins, dropna=dropna)
        table = counts.fillna(0).astype(np.int64)

    # Post-process
    if normalize is False:
        return table
    return _normalize(table, normalize=normalize, margins=margins)
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True, **kwarg):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and
    an aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    aggfunc : function, optional
        Aggregation applied to `values`. If no values array is passed,
        computes a frequency table and `aggfunc` is not used
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    rows : kwarg only alias of index [deprecated]
    cols : kwarg only alias of columns [deprecated]

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    TypeError
        If both `rows` and a non-None `index` are given, if both `cols` and
        a non-None `columns` are given, or if any other unexpected keyword
        argument is passed.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or
    column names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2
    """
    # Parse old-style keyword arguments.
    # stacklevel=2 attributes the deprecation warning to the caller's line
    # rather than to this function.
    rows = kwarg.pop('rows', None)
    if rows is not None:
        warnings.warn("rows is deprecated, use index", FutureWarning,
                      stacklevel=2)
        if index is None:
            index = rows
        else:
            msg = "Can only specify either 'rows' or 'index'"
            raise TypeError(msg)

    cols = kwarg.pop('cols', None)
    if cols is not None:
        warnings.warn("cols is deprecated, use columns", FutureWarning,
                      stacklevel=2)
        if columns is None:
            columns = cols
        else:
            msg = "Can only specify either 'cols' or 'columns'"
            raise TypeError(msg)

    if kwarg:
        # Format a plain sorted list so the message reads the same on
        # Python 2 and 3 (a raw ``keys()`` view prints as ``dict_keys(...)``
        # on Python 3) and is deterministic.
        raise TypeError("Unexpected argument(s): %s" % sorted(kwarg))

    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None:
        # Frequency table: count a constant column; empty cells become 0
        # and the result is cast to 64-bit integers.
        df = DataFrame(data)
        df['__dummy__'] = 0
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=len, margins=margins, dropna=dropna)
        return table.fillna(0).astype(np.int64)

    # The values join the factor dict before the frame is built.
    data['__dummy__'] = values
    df = DataFrame(data)
    table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                           aggfunc=aggfunc, margins=margins, dropna=dropna)
    return table
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True):
    """
    Cross-tabulate two (or more) factors.

    A frequency table of the factors is produced unless an array of values
    (with an aggregation function) is supplied.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        Aggregation applied to `values`. If no values array is passed,
        computes a frequency table and `aggfunc` is not used
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN

    Returns
    -------
    crosstab : DataFrame

    Notes
    -----
    Any Series passed will have their name attributes used unless row or
    column names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2
    """
    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Row factors first, then column factors; a later duplicate name wins.
    factors = {}
    for name, arr in zip(rownames, index):
        factors[name] = arr
    for name, arr in zip(colnames, columns):
        factors[name] = arr

    if values is not None:
        # The values join the factor dict before the frame is built.
        factors['__dummy__'] = values
        frame = DataFrame(factors)
        return frame.pivot_table('__dummy__', index=rownames,
                                 columns=colnames, aggfunc=aggfunc,
                                 margins=margins, dropna=dropna)

    # Frequency table: count a constant column; empty cells become 0 and
    # the result is cast to 64-bit integers.
    frame = DataFrame(factors)
    frame['__dummy__'] = 0
    counts = frame.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=len, margins=margins, dropna=dropna)
    return counts.fillna(0).astype(np.int64)