def all_Month_By_Cost(mydf):
    """Summarise total cost per business quarter, keyed on order ('Input Date') time.

    Returns a 3-tuple of DataFrames:
    (per-quarter-by-project, per-project-by-quarter, per-quarter totals).
    """
    print(
        '\n---------------------------------------开始从2017年开始,By 季度 '
        '以Cost的方式计算所有数据,---------------------------------------------------------------\n'
    )
    # Join in unit costs; Total Cost = positional column 6 (unit cost) * column 13 (quantity).
    mydf = mydf.merge(df_Cost)
    mydf['Total Cost'] = mydf.iloc[:, 6] * mydf.iloc[:, 13]
    mydf.loc[:, 'Input Date'] = pd.to_datetime(mydf.loc[:, 'Input Date'])

    # Per business quarter, broken down by project.
    df_ByOrderTime_All_From2007_ByQ = (
        mydf.groupby([Grouper(key='Input Date', freq='BQ'), 'Project Name'])['Total Cost']
        .sum()
        .rename('合计Cost')
        .reset_index()
    )
    # Per project, broken down by business quarter.
    df_ByOrderTime_All_From2007_BySPL = (
        mydf.groupby(['Project Name', Grouper(key='Input Date', freq='BQ')])['Total Cost']
        .sum()
        .rename('合计Cost')
        .reset_index()
    )
    # Grand total per business quarter.
    df_ByOrderTime_All_From2007_Total = (
        mydf.groupby([Grouper(key='Input Date', freq='BQ')])['Total Cost']
        .sum()
        .rename('合计Cost')
        .reset_index()
    )
    return (
        df_ByOrderTime_All_From2007_ByQ,
        df_ByOrderTime_All_From2007_BySPL,
        df_ByOrderTime_All_From2007_Total,
    )
def split_by_date(self, mode='day'):
    """ Split trajectory into subtrajectories using regular time intervals.

    Parameters
    ----------
    mode : str
        Split mode

    Returns
    -------
    list
        List of trajectories
    """
    freq_by_mode = {'day': 'D', 'month': 'M', 'year': 'Y'}
    if mode not in freq_by_mode:
        raise ValueError(
            'Invalid split mode {}. Must be one of [day, month, year]'.
            format(mode))
    # Single-row groups carry no movement, so they are dropped.
    return [
        Trajectory(chunk, '{}_{}'.format(self.id, key))
        for key, chunk in self.df.groupby(Grouper(freq=freq_by_mode[mode]))
        if len(chunk) > 1
    ]
def create_boxplots_plots(csv_file_name):
    """Show a yearly box plot for each train/test split of the given dataset.

    The original implementation repeated the identical load/group/plot block
    four times (70/30, 80/20, 90/10, 95/5 splits); it is now a single helper
    called once per split suffix.

    Args:
        csv_file_name: base CSV file name; '.csv' and everything after is dropped
            when building the per-split file paths under data/datasets/.
    """
    base = 'data/datasets/' + csv_file_name.split('.csv')[0]
    for suffix in ('_dataset_70_30.csv', '_dataset_80_20.csv',
                   '_dataset_90_10.csv', '_dataset_95_5.csv'):
        _boxplot_split_by_year(base + suffix)


def _boxplot_split_by_year(path):
    """Load one split file, group by year and display a box plot of the values."""
    series = read_csv(path,
                      header=0,
                      index_col=0,
                      parse_dates=True,
                      squeeze=True)
    groups = series.groupby(Grouper(freq='A'))
    appended_data = []
    for name, group in groups:
        # One single-column frame per year, stacked below.
        years = DataFrame()
        years[name.year] = group.values
        appended_data.append(years)
    appended_data = concat(appended_data)
    appended_data.boxplot()
    pyplot.show()
def all_Month_By_Cost_booking(mydf):
    """Summarise total cost per business quarter, keyed on booking ('财务入账时间') time.

    Returns a 3-tuple of DataFrames:
    (per-quarter-by-project, per-project-by-quarter, per-quarter totals).
    """
    print(
        '\n---------------------------------------以Booking时间,开始从2017年开始,By 季度 '
        '以Booking时间,以Cost的方式计算所有数据,---------------------------------------------------------------\n'
    )
    # Rows without a booking timestamp cannot be bucketed — drop them first.
    mydf = mydf.dropna(subset=["财务入账时间"])
    # Join in unit costs; Total Cost = positional column 6 (unit cost) * column 13 (quantity).
    mydf = mydf.merge(df_Cost)
    mydf['Total Cost'] = mydf.iloc[:, 6] * mydf.iloc[:, 13]
    mydf.loc[:, '财务入账时间'] = pd.to_datetime(mydf.loc[:, '财务入账时间'])

    # Per business quarter, broken down by project.
    df_ByBookingTime_All_From2007_ByQ = (
        mydf.groupby([Grouper(key='财务入账时间', freq='BQ'), 'Project Name'])['Total Cost']
        .sum()
        .rename('合计Cost')
        .reset_index()
    )
    # Per project, broken down by business quarter.
    df_ByBookingTime_All_From2007_BySPL = (
        mydf.groupby(['Project Name', Grouper(key='财务入账时间', freq='BQ')])['Total Cost']
        .sum()
        .rename('合计Cost')
        .reset_index()
    )
    # Grand total per business quarter.
    df_ByBookingTime_All_From2007_Total = (
        mydf.groupby([Grouper(key='财务入账时间', freq='BQ')])['Total Cost']
        .sum()
        .rename('合计Cost')
        .reset_index()
    )
    return (df_ByBookingTime_All_From2007_ByQ,
            df_ByBookingTime_All_From2007_BySPL,
            df_ByBookingTime_All_From2007_Total)
def _split_traj(self, traj, mode='day', min_length=0):
    """Split one trajectory at day/month/year boundaries into a collection."""
    freq_map = {'day': 'D', 'month': 'M', 'year': 'Y'}
    if mode not in freq_map:
        raise ValueError('Invalid split mode {}. Must be one of [day, month, year]'.format(mode))
    # Groups with a single row are skipped — a trajectory needs >= 2 points.
    pieces = [
        Trajectory(chunk, '{}_{}'.format(traj.id, stamp))
        for stamp, chunk in traj.df.groupby(Grouper(freq=freq_map[mode]))
        if len(chunk) > 1
    ]
    return TrajectoryCollection(pieces, min_length=min_length)
def plot_stacked():
    """Stacked bar chart of monthly-mean production, one layer per source.

    Reads the module-level `traind` frame; assumes 97 monthly buckets
    (hard-coded here, as in the original) — TODO confirm against the data.
    """
    sources = [
        'Coal_MW', 'Gas_MW', 'Hidroelectric_MW', 'Nuclear_MW', 'Wind_MW',
        'Solar_MW', 'Biomass_MW'
    ]
    legend_labels = []
    stack_base = np.zeros(97)  # running bottom of the stacked bars
    tick_pos = []
    tick_years = []
    for source in sources:
        monthly = traind[[source, "NewDate"]].groupby(
            Grouper(key='NewDate', freq="M"))
        means = []
        for idx, (stamp, chunk) in enumerate(monthly):
            means.append(np.mean(chunk[[source]].values.flatten()))
            # Collect the year tick positions once, during the first pass.
            if source == "Coal_MW" and stamp.month == 1:
                tick_years.append(stamp.year)
                tick_pos.append(idx)
        plt.bar(range(97), means, 0.5, bottom=stack_base)
        stack_base += means
        legend_labels.append(source)
    plt.ylabel('Production_MW')
    plt.title('Production')
    plt.gca().set_xticks(tick_pos)
    plt.gca().set_xticklabels(tick_years)
    plt.gcf().set_size_inches(25, 10)
    plt.legend(legend_labels)
    plt.show()
def init():
    """Box-plot the per-day distribution of `speed_diff` for April 2019.

    Reads the module-level `series` frame, builds one column per day of the
    month and saves the figure under OUTPUT_FOLDER.
    """
    output_name = "boxplot_speed_diff_2019_04_dataset"
    one_series = series["speed_diff"]

    one_month = one_series["2019-04"]
    groups = one_month.groupby(Grouper(freq="D"))
    # One column per day; April has 30 days -> columns 1..30.
    days = concat([DataFrame(x[1].values) for x in groups], axis=1)
    days = DataFrame(days)
    days.columns = range(1, 31)
    pyplot.gcf().set_size_inches(12, 7)
    days.boxplot()
    # Create the output folder up front instead of saving twice inside a
    # try/except FileNotFoundError (the original duplicated the savefig call).
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    pyplot.savefig(os.path.join(OUTPUT_FOLDER, f"{output_name}.png"),
                   format="png",
                   dpi=300)
def plot_grouped_by_year_data(df_weekly, title):
    """ Plots weekly combined series (price series and cot report) using Plotly.
    Use trading weeks on X axis

    Only years with at least 52 weekly observations are charted, one subplot
    per full year.
    """
    x_series = np.arange(0, 54)
    groups = df_weekly['Settle'].groupby(Grouper(freq='A'))
    # Filter the full years ONCE; the original rebuilt this list and then
    # re-checked the same >= 52 condition a third time inside the loop.
    full_years = [(name, grp) for name, grp in groups
                  if len(grp.values) >= 52]
    fig = tls.make_subplots(rows=len(full_years),
                            cols=1,
                            shared_xaxes=True,
                            print_grid=False)
    fig['layout'].update(height=600, width=899, title=title)
    for i, (name, group) in enumerate(full_years):
        fig.append_trace(
            {
                'x': x_series,
                'y': group.values,
                'type': 'scatter',
                'name': name.year
            }, i + 1, 1)
        fig['layout']['yaxis' + str(i + 1)].update(showticklabels=False)
    cf.iplot(fig)
def test_series_groupby_value_counts_with_grouper(): # GH28479 df = DataFrame({ "Timestamp": [ 1565083561, 1565083561 + 86400, 1565083561 + 86500, 1565083561 + 86400 * 2, 1565083561 + 86400 * 3, 1565083561 + 86500 * 3, 1565083561 + 86400 * 4, ], "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], }).drop([3]) df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s") dfg = df.groupby(Grouper(freq="1D", key="Datetime")) # have to sort on index because of unstable sort on values xref GH9212 result = dfg["Food"].value_counts().sort_index() expected = dfg["Food"].apply(Series.value_counts).sort_index() expected.index.names = result.index.names tm.assert_series_equal(result, expected)
def groupFridgeData(fridgeData, freq):
    """Resample fridge power readings into energy per `freq` interval.

    An interval containing any zero/NaN reading is treated as missing
    (delegated to the module-level hasZerosOrNan helper).
    """
    seconds = float(freq.replace("s", ""))
    grouped = fridgeData.groupby(Grouper(freq=freq, axis=0))

    def interval_energy(values):
        # mean power * interval length -> energy, unless data is unusable
        return np.nan if hasZerosOrNan(values) else values.mean() * seconds

    result = grouped.agg(interval_energy).round(2)
    result.rename({'power': 'energy'}, axis=1, inplace=True)
    return result
def RV(self):
    """Return the annualised monthly realized volatility of log returns.

    Computes log returns on demand if the 'log_returns' column is missing,
    then applies self.realized_volatility per calendar month and scales by
    sqrt(12).

    Fixes: the original used a bare ``except:`` (which would also swallow
    KeyboardInterrupt and genuine errors) and bound an unused local ``rv``.
    """
    if 'log_returns' not in self.stock_data:
        self.returns(method='LOG')
    return self.stock_data.groupby(Grouper(freq='M')).apply(
        self.realized_volatility).log_returns * np.sqrt(12)
def add_data_dataframe(self, metadata, data, data_type=None, overwrite=False):
    """ Add data to this Datasource and segment the data by size.

    The data is stored as a list of tuples of (chunk, daterange it covers).

    Args:
        metadata (dict): Metadata on the data for this Datasource
        data (Pandas.DataFrame): Data
        data_type (str, default=None): Placeholder for combination of this
            fn with add_footprint_data in the future
        overwrite (bool, default=False): Overwrite existing data
    Returns:
        None
    """
    from pandas import Grouper
    from HUGS.Processing import get_split_frequency

    # Store the metadata as labels
    # for k, v in metadata.items():
    #     self.add_metadata(key=k, value=v)

    # Ensure metadata values are all lowercase
    metadata = {k: v.lower() for k, v in metadata.items()}
    self._metadata.update(metadata)

    # Add in a type record for timeseries data
    # Can possibly combine this function and the add_footprint (and other)
    # functions in the future

    # Store the hashes of data we've seen previously in a dict?
    # Then also check that the data we're trying to input doesn't overwrite the data we
    # currently have
    # Be easiest to first check the dates covered by the data?

    # Check the daterange covered by this data and if we have an overlap
    # NOTE(review): the `if self._data:` guard below is commented out, so the
    # overlap check always runs — presumably self.daterange() handles the
    # empty case; verify against the class.
    # if self._data:
    # Exisiting data in Datsource
    start_data, end_data = self.daterange()
    # This is the data that we may want to add to the Datasource
    start_new, end_new = self.get_dataframe_daterange(data)

    # Refuse data that falls entirely inside the existing range unless
    # the caller explicitly asked to overwrite.
    if start_new >= start_data and end_new <= end_data and overwrite is False:
        raise ValueError(
            "The provided data overlaps dates covered by existing data"
        )

    # Need to check here if we've seen this data before
    freq = get_split_frequency(data)
    # Split into sections by splitting frequency
    group = data.groupby(Grouper(freq=freq))

    # Create a list tuples of the split dataframe and the daterange it covers
    # As some (years, months, weeks) may be empty we don't want those dataframes
    self._data = [
        (g, self.get_dataframe_daterange(g)) for _, g in group if len(g) > 0
    ]

    self.add_metadata(key="data_type", value="timeseries")
    self._data_type = "timeseries"
    # Use daterange() to update the recorded values
    self.update_daterange()
def showBoxPlot(df):
    """Display a box plot of df's values grouped by calendar year."""
    by_year = DataFrame()
    # One column per year; Grouper(freq='A') buckets on the datetime index.
    for period, chunk in df.groupby(Grouper(freq='A')):
        by_year[period.year] = chunk.values
    by_year.boxplot()
    pyplot.show()
def main():
    """Load catchment albedo stats, filter, and produce per-year plots.

    Reads the APPEARS CSV export, filters rows by valid-observation count and
    season (March-September), reindexes to a fixed daily range and fans the
    series out into one column per year before plotting.
    """
    # Update these as needed
    filter_pctl = 0.25
    workspace = '/home/arthur/Dropbox/projects/greenland/aoi_albedo_time_series/catchments/'
    # BUG FIX: the original tested workspace[:-1] (everything but the last
    # character) against '/', which is almost always true and appended a
    # spurious second slash; test the final character instead.
    if workspace[-1] != '/':
        workspace = workspace + '/'
    csv_name = 'actual_albedo_catchment_6.2_ekholm_stats.csv'
    aoi_name = 'Catchment 6.2 (ValiObs filter = {x} Pctl)'.format(
        x=filter_pctl)
    dt_indx = pd.date_range('2000-01-01', '2020-12-31')
    csv_path = workspace + csv_name
    # filter_pctl drops observations with fewer than the given percentile of valid observations

    # Define the fields of interest so we can ignore the rest
    fields = ['date', 'mean', 'valid_pixels_count']
    # Import raw APPEARS output
    ts_df = pd.read_csv(csv_path, usecols=fields, parse_dates=[1])
    # Make the date index, then group by it to make monthly averages
    ts_df['date'] = pd.to_datetime(ts_df['date'])
    ts_df['mean'].replace({pd.NaT: np.nan}, inplace=True)
    ts_df = filter_data_numobs(ts_df, filter_pctl)
    del ts_df['valid_pixels_count']

    # Simple masking by month due to small available pixels, so noisy even when szn-masked
    begin_month = 3
    end_month = 9
    szn_mask = (ts_df['date'].dt.month >= begin_month) & (ts_df['date'].dt.month <= end_month)
    ts_df = ts_df.loc[szn_mask]
    ts_df.set_index('date', inplace=True)
    series = ts_df.squeeze()
    strt_year = dt_indx[0].to_pydatetime().year  # do I need this stuff
    end_year = dt_indx[-1].to_pydatetime().year  # and this?
    nyears = end_year - strt_year  # and this?

    series = series.reindex(dt_indx, fill_value=np.NaN)
    groups = series.groupby(Grouper(freq='A'))
    years = DataFrame()
    # This is how the dataframe is set up with each column being a year of data, each row a doy
    for name, group in groups:
        years[name.year] = group.values[:364]
    # make columns into strings for easier plot labeling
    years.columns = years.columns.astype(str)
    box_plot(years, aoi_name, csv_path)
    # box_plot_anom(years, aoi_name, csv_path)
    # vert_stack_plot(years, nyears, strt_year, end_year, aoi_name, csv_path)
    # vert_stack_plot_anom(years, nyears, strt_year, end_year, aoi_name, csv_path)
    year_vs_avg_plot(years, aoi_name, csv_path)
def _split_traj(self, traj, mode="day", min_length=0):
    """Split one trajectory at hour/day/month/year boundaries.

    Unknown modes fall through and are passed to Grouper as a raw pandas
    frequency string, exactly as in the original mapping behaviour.
    """
    freq = {"hour": "H", "day": "D", "month": "M", "year": "Y"}.get(mode, mode)
    pieces = [
        Trajectory(chunk, "{}_{}".format(traj.id, stamp))
        for stamp, chunk in traj.df.groupby(Grouper(freq=freq))
        if len(chunk) > 1
    ]
    return TrajectoryCollection(pieces, min_length=min_length)
def _transform(ts: Series, attr_period: str, fun_type: str) -> Series:
    """ Groups a timeseries by a specified time period.

    :param ts: Series. Timeseries.
    :param attr_period: String. Attribute period, PER_DAY for "d", PER_WEEK for "w",
        PER_MONTH for "m" and PER_YEAR for "y".
    :param fun_type: String. Function, either "sum" or "count", to be applied on
        each of the grouped periods.
    :return: Series. Grouped timeseries.
    """
    if fun_type not in ("sum", "count"):
        raise NoProperOptionInIf
    grouped = ts.groupby(Grouper(freq=attr_period.upper()))
    ts = grouped.sum() if fun_type == "sum" else grouped.count()
    return ts[ts != 0]  # To drop VALUE = 0 rows
def resample(self, resample_code='A-JUN', fun='mean'):
    """
    Time series resampling function. Returns a Hydro class object with resampled data.

    Parameters
    ----------
    resample_code : str
        The Pandas resampling code for the resampling process (e.g. 'A' for annual, 'A-JUN' for annual ending in June (water year), 'D' for day, 'W' for week, 'M' for month')
    fun : str
        The function that should be applied. Any function that Pandas can handle in a groupby object.
    """
    from pandas.core.groupby import SeriesGroupBy, GroupBy
    from pandas import Grouper

    ### Set up parameters
    # Look the aggregation up as an unbound method on the groupby classes.
    # NOTE(review): this reaches into pandas internals (__dict__ of GroupBy /
    # SeriesGroupBy) and may break across pandas versions — confirm on upgrade.
    if fun in GroupBy.__dict__.keys():
        fun1 = GroupBy.__dict__[fun]
    elif fun in SeriesGroupBy.__dict__.keys():
        fun1 = SeriesGroupBy.__dict__[fun]
    else:
        raise ValueError('Need to use the right function name.')

    ### Run resampling
    data = self.data
    # Keep the mtype and site index levels intact, resample only 'time'.
    df1 = data.groupby([
        Grouper(level='mtype'),
        Grouper(level='site'),
        Grouper(level='time', freq=resample_code)
    ])
    # Apply the looked-up aggregation to the grouped frame.
    df2 = fun1(df1)

    ### Recreate hydro class object
    new1 = self.add_data(df2.reset_index(),
                         'time',
                         'site',
                         'mtype',
                         'data',
                         dformat='long',
                         add=False)
    return (new1)
def group_by_time_frequency(data_x, data_y, columns, g_freq):
    """Bucket a time-indexed frame by g_freq.

    Returns the groupby object together with a wide frame in which each
    group's values are laid out side by side (columns numbered from 1).
    """
    frame = pd.DataFrame(columns=columns, index=data_x)
    frame['time'] = data_x
    frame[columns[1:len(columns)]] = data_y
    groups = frame.groupby(Grouper(freq=g_freq))
    # One sub-frame per bucket, concatenated horizontally.
    wide = DataFrame(concat([DataFrame(part[1].values) for part in groups], axis=1))
    wide.columns = range(1, len(wide.columns) + 1)
    return groups, wide
def group_by_time_range(data_x, data_y, g_freq):
    """Build a Date/DO frame indexed by data_x and bucket it by g_freq.

    Returns the groupby object and a wide frame with each group's values
    placed side by side (columns numbered from 1).
    """
    frame = pd.DataFrame(columns=['Date', 'DO'], index=data_x)
    frame['Date'] = data_x
    frame['DO'] = data_y
    grouped = frame.groupby(Grouper(freq=g_freq))
    wide = DataFrame(concat([DataFrame(part[1].values) for part in grouped], axis=1))
    wide.columns = range(1, len(wide.columns) + 1)
    return grouped, wide
def __init__(self, symbol, timeframe, agent, fraction=(0, 0), equity=100, risk=0.1):
    """Load historical price data and set up backtest state.

    Args:
        symbol: instrument name; used to build the CSV path under D:/Data/.
        timeframe: bar timeframe string; also part of the CSV file name.
        agent: trading agent object stored for later use.
        fraction: (rows_to_skip, rows_to_read) window into the CSV.
        equity: starting equity.
        risk: per-trade risk fraction stored in settings.
    """
    self.symbol = symbol
    self.timeframe = timeframe
    new_file = 'D:/Data/' + symbol + '_' + timeframe + '.csv'
    # Read only the requested slice of the file (skip fraction[0] data rows,
    # then take fraction[1] rows).
    self.data = read_csv(new_file,
                         header=0,
                         parse_dates=False,
                         skiprows=range(1, fraction[0] + 1),
                         nrows=fraction[1])
    # Generating Dataframes
    # Unparseable dates become NaT (errors='coerce').
    time = to_datetime(self.data['Date'],
                       format='%Y%m%d %H:%M:%S',
                       errors='coerce')
    self.data = self.data.iloc[:, 2:]
    self.data['Date'] = time
    # n = number of warm-up rows consumed by the indicators.
    self.data, n = get_technical_indicators(self.data)
    self.data['Index'] = self.data.index - n
    # Daily grouping
    # NOTE: the comprehension's `n` is scoped to the comprehension in
    # Python 3 and does not clobber the indicator warm-up `n` used below.
    self.days = [
        g for n, g in self.data.groupby(Grouper(key='Date', freq='D'))
    ]
    self.data = self.data.set_index(time.iloc[n:])
    self.l = len(self.data)
    self.trades = DataFrame(columns=[
        'Opentime', 'Closetime', 'Openindex', 'Closeindex', 'Open', 'Close',
        'Profit', 'Volume', 'Direction'
    ])
    # Properties
    self.market = {
        'Spread': 1.2,
        'MinLot': 0.1,
        'Leverage': 20,
        'MarginCall': 0.75
    }
    self.settings = {'Risk': risk, 'MaxDrawDown': 0.9, 'TradingPause': 0}
    self.equity_start = equity
    self.equity = equity
    self.agent = agent
    self.profits = []
    self.actions = []
    print('Initialized.')
def __aggregate(self, df, freq):
    """Sum 'size' into buckets of `freq` rescaled by self.scale_factor_.

    Returns a frame with the summed sizes plus a 'diff' column holding each
    bucket's offset in original-frequency units.
    """
    step = int(re.search(r'\d+', freq).group())
    unit = freq.replace(str(step), "")
    scaled_freq = str(int(step / self.scale_factor_)) + unit
    sizes = df.groupby(Grouper(key='date', freq=scaled_freq))['size'].sum()
    sizes = sizes.fillna(0)
    out = pd.DataFrame(sizes)
    # Bucket index scaled back to the caller's frequency units.
    out['diff'] = np.arange(0, len(out)) * step
    return out
def grp_ts_agg(df, grp_col, ts_col, freq_code):
    """
    Simple function to aggregate time series with dataframes with a single column of sites and a column of times.

    Arguments:\n
    df -- dataframe with a datetime column.\n
    grp_col -- Column name (or list of names) that contains the sites.\n
    ts_col -- The column name of the datetime column.\n
    freq_code -- The pandas frequency code for the aggregation (e.g. 'M', 'A-JUN').\n

    Returns a DataFrameGroupBy keyed on grp_col plus the resampled time level,
    or None (after printing a message) when ts_col is not a datetime column.
    """
    df1 = df.copy()
    if isinstance(df[ts_col].iloc[0], Timestamp):
        df1.set_index(ts_col, inplace=True)
        # BUG FIX: build the grouping key without mutating the caller's list —
        # the original called grp_col.extend(...), appending a Grouper to the
        # list object passed in.
        if isinstance(grp_col, list):
            keys = grp_col + [Grouper(level=ts_col, freq=freq_code)]
        else:
            keys = [grp_col, Grouper(level=ts_col, freq=freq_code)]
        return df1.groupby(keys)
    else:
        print('Make one column a timeseries!')
def df_weekly_generation(df):
    """
    ---What it does---
        + Averages the frame into consecutive 7-day buckets keyed on 'date'.
    ---What it needs---
        + df: dataframe with a 'date' column.
    ---What it returns---
        + weekly dataframe indexed by week number (0, 1, 2, ...)
    """
    weekly = df.groupby(Grouper(key="date", freq='7D')).mean()
    weekly["week"] = range(len(weekly))
    return weekly.set_index("week")
def get_all_multi_strengths(df_strength):
    # Return the per-user, per-day multi-strength frame, cached on disk.
    pickle_path_str_agg_all = 'all_strengths'
    if os.path.isfile(pickle_path_str_agg_all):
        # Cache hit: load the previously computed frame.
        # NOTE(review): pickle.load is unsafe if this cache file can be
        # written by an untrusted party — confirm the deployment context.
        with open(pickle_path_str_agg_all, 'rb') as handle:
            df_multi_strength = pickle.load(handle)
    else:
        df_strength = process_timestamp_and_clean_data(df_strength)
        # Group by user and by calendar day of the 'timestamp' column.
        df_by_user_date = df_strength.groupby(
            ['user_id', Grouper(key='timestamp', freq='D')])
        df_multi_strength = df_by_user_date.apply(transform_to_multi_strength)
        df_multi_strength = df_multi_strength.drop_duplicates()
        # Persist for subsequent calls.
        with open(pickle_path_str_agg_all, "wb") as handle:
            pickle.dump(df_multi_strength,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
    return df_multi_strength
# Parse pickup/dropoff timestamps and derive trip duration.
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'], format='%m/%d/%Y %H:%M')  # 2015-01-31 23:00:09
df['Lpep_dropoff_datetime'] = pd.to_datetime(df['Lpep_dropoff_datetime'], format='%m/%d/%Y %H:%M')
df['trip_duration'] = df['Lpep_dropoff_datetime'] - df['lpep_pickup_datetime']

# Restrict to January 2014 trips.
data = df[~(df['lpep_pickup_datetime'] < '2014-01-01')]
# BUG FIX: the second filter built its mask from `df` (the unfiltered frame)
# instead of `data`; index alignment made it work by accident. Mask `data`
# directly and take a copy so the later column assignment doesn't hit a view.
data = data[~(data['lpep_pickup_datetime'] > '2014-02-01')].copy()
data['count'] = 1

reqcolumns = ['Trip_distance', 'Total_amount']
reqcolumns2 = ['trip_duration', 'Total_amount']
data_distance = data[reqcolumns]
data_time = data[reqcolumns2]

# Daily and hourly totals keyed on pickup time.
by_days = data.groupby(Grouper(key='lpep_pickup_datetime', freq='d')).sum()
by_hour = data.groupby(Grouper(key='lpep_pickup_datetime', freq='H')).sum()
print(by_days)
# data attribute has non-grouped data

# In[130]:

X = data.iloc[:, 0].values
y = data.iloc[:, 1].values
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)
# create stacked line plots
from pandas import read_csv
from pandas import DataFrame
from pandas import Grouper
from matplotlib import pyplot

filename = 'daily-minimum-temperatures.csv'
series = read_csv(filename,
                  header=0,
                  index_col=0,
                  parse_dates=True,
                  squeeze=True)

# One column per calendar year, rows aligned by position within the year.
years = DataFrame()
for stamp, chunk in series.groupby(Grouper(freq='A')):
    years[stamp.year] = chunk.values

years.plot(subplots=True, legend=False)
pyplot.show()
#mpl.style.use('fivethirtyeight') os.chdir(r'C:\Users\Administrator\Documents\GitHub\DS\Hessel, TimeSeries') series = read_csv(r'data\MAAND OPEN PRD 2014-2019.csv', header=0, index_col=0, parse_dates=True, squeeze=True) pp.figure(figsize=(8, 3), dpi=100) pp.title('Openstaande Incidenten Per Maand') series.plot() pp.xlabel('jaren') pp.ylabel('incidenten') pp.tight_layout(pad=3.0) pp.show() groups = series['2014':'2019'].groupby(Grouper(freq='A')) years = DataFrame() for name, group in groups: years[name.year] = group.values # Box and Whisker Plots pp.figure(figsize=(6, 4), dpi=100, edgecolor='k') years.boxplot() pp.title('Trend') pp.tight_layout(pad=3.0) pp.show() years = years.transpose() pp.figure(figsize=(6, 4), dpi=100, edgecolor='k') years.boxplot() pp.tight_layout(pad=3.0) pp.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [
airline.describe() #### Histogram and Density Plots # create a histogram plot airline.hist() # create a density plot airline.plot(kind='kde') #### Box and Whisker Plots by Interval # create a boxplot of yearly data from pandas import Grouper groups = airline.groupby(Grouper(freq='A')) years = pd.DataFrame() for name, group in groups: years[name.year] = group.values years.boxplot() #### Lag plot # create a scatter plot from pandas.plotting import lag_plot lag_plot(airline, lag=1) # for 1st lag. by default its 1 # create an autocorrelation plot from statsmodels.graphics.tsaplots import plot_acf
list_of_people.columns = columns # print(list_of_people.head()) list_of_people.to_csv('people.csv', index=0) # print(list_of_people) csvfile = pd.read_csv('people.csv') rounded_age = csvfile.loc[:, 'age'].round(-1) # print(csvfile.columns) # print(rounded_age) csvfile.loc[:, 'rounded_age'] = csvfile.loc[:, 'age'].round(-1) grouped_age = csvfile.groupby(Grouper(key='rounded_age')).count() # print(grouped_age) rounded_weight = csvfile.loc[:, 'weight'].round(-1) # print(rounded_weight) csvfile.loc[:, 'rounded_weight'] = csvfile.loc[:, 'weight'].round(-1) grouped_weight = csvfile.groupby(Grouper(key='rounded_weight')).count() # print(rounded_weight) rounded_height = csvfile.loc[:, 'height'].round(-1) # print(rounded_height) csvfile.loc[:, 'rounded_height'] = csvfile.loc[:, 'height'].round(-1)
def test_pivot_timegrouper(self):
    # pivot_table with a pandas Grouper as index/columns: by name (key),
    # by index level, and with two time groupers at once.
    df = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 1, 1),
            datetime(2013, 1, 1),
            datetime(2013, 10, 1),
            datetime(2013, 10, 2),
            datetime(2013, 10, 1),
            datetime(2013, 10, 2),
            datetime(2013, 12, 2),
            datetime(2013, 12, 2),
        ]
    }).set_index('Date')

    # Annual grouping collapses everything into the single year-end bucket.
    expected = DataFrame(np.array([10, 18, 3], dtype='int64').reshape(1, 3),
                         index=[datetime(2013, 12, 31)],
                         columns='Carl Joe Mark'.split())
    expected.index.name = 'Date'
    expected.columns.name = 'Buyer'

    result = pivot_table(df,
                         index=Grouper(freq='A'),
                         columns='Buyer',
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    # Swapping index/columns must yield the transpose.
    result = pivot_table(df,
                         index='Buyer',
                         columns=Grouper(freq='A'),
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    # Semi-annual ('6MS') grouping splits the year into two buckets.
    expected = DataFrame(
        np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
        index=[datetime(2013, 1, 1), datetime(2013, 7, 1)],
        columns='Carl Joe Mark'.split())
    expected.index.name = 'Date'
    expected.columns.name = 'Buyer'

    result = pivot_table(df,
                         index=Grouper(freq='6MS'),
                         columns='Buyer',
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df,
                         index='Buyer',
                         columns=Grouper(freq='6MS'),
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    # passing the name
    df = df.reset_index()
    result = pivot_table(df,
                         index=Grouper(freq='6MS', key='Date'),
                         columns='Buyer',
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df,
                         index='Buyer',
                         columns=Grouper(freq='6MS', key='Date'),
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    # An unknown key must raise KeyError.
    self.assertRaises(
        KeyError, lambda: pivot_table(df,
                                      index=Grouper(freq='6MS', key='foo'),
                                      columns='Buyer',
                                      values='Quantity',
                                      aggfunc=np.sum))
    self.assertRaises(
        KeyError, lambda: pivot_table(df,
                                      index='Buyer',
                                      columns=Grouper(freq='6MS',
                                                      key='foo'),
                                      values='Quantity',
                                      aggfunc=np.sum))

    # passing the level
    df = df.set_index('Date')
    result = pivot_table(df,
                         index=Grouper(freq='6MS', level='Date'),
                         columns='Buyer',
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df,
                         index='Buyer',
                         columns=Grouper(freq='6MS', level='Date'),
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    # An unknown level must raise ValueError (not KeyError, unlike `key`).
    self.assertRaises(
        ValueError, lambda: pivot_table(df,
                                        index=Grouper(freq='6MS', level='foo'),
                                        columns='Buyer',
                                        values='Quantity',
                                        aggfunc=np.sum))
    self.assertRaises(
        ValueError, lambda: pivot_table(df,
                                        index='Buyer',
                                        columns=Grouper(freq='6MS',
                                                        level='foo'),
                                        values='Quantity',
                                        aggfunc=np.sum))

    # double grouper: monthly buckets on two independent datetime columns.
    df = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 11, 1, 13, 0),
            datetime(2013, 9, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 11, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 2, 12, 0),
            datetime(2013, 12, 5, 14, 0)
        ],
        'PayDay': [
            datetime(2013, 10, 4, 0, 0),
            datetime(2013, 10, 15, 13, 5),
            datetime(2013, 9, 5, 20, 0),
            datetime(2013, 11, 2, 10, 0),
            datetime(2013, 10, 7, 20, 0),
            datetime(2013, 9, 5, 10, 0),
            datetime(2013, 12, 30, 12, 0),
            datetime(2013, 11, 20, 14, 0),
        ]
    })

    result = pivot_table(df,
                         index=Grouper(freq='M', key='Date'),
                         columns=Grouper(freq='M', key='PayDay'),
                         values='Quantity',
                         aggfunc=np.sum)
    expected = DataFrame(np.array([
        np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan,
        np.nan, np.nan, np.nan, 3, np.nan
    ]).reshape(4, 4),
                         index=[
                             datetime(2013, 9, 30),
                             datetime(2013, 10, 31),
                             datetime(2013, 11, 30),
                             datetime(2013, 12, 31)
                         ],
                         columns=[
                             datetime(2013, 9, 30),
                             datetime(2013, 10, 31),
                             datetime(2013, 11, 30),
                             datetime(2013, 12, 31)
                         ])
    expected.index.name = 'Date'
    expected.columns.name = 'PayDay'

    tm.assert_frame_equal(result, expected)

    result = pivot_table(df,
                         index=Grouper(freq='M', key='PayDay'),
                         columns=Grouper(freq='M', key='Date'),
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    # Both groupers on the same axis produce a (Date, PayDay) MultiIndex.
    tuples = [
        (datetime(2013, 9, 30), datetime(2013, 10, 31)),
        (datetime(2013, 10, 31), datetime(2013, 9, 30)),
        (datetime(2013, 10, 31), datetime(2013, 11, 30)),
        (datetime(2013, 10, 31), datetime(2013, 12, 31)),
        (datetime(2013, 11, 30), datetime(2013, 10, 31)),
        (datetime(2013, 12, 31), datetime(2013, 11, 30)),
    ]
    idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
    expected = DataFrame(np.array(
        [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan,
         3]).reshape(6, 2),
                         index=idx,
                         columns=['A', 'B'])
    expected.columns.name = 'Branch'

    result = pivot_table(df,
                         index=[
                             Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')
                         ],
                         columns=['Branch'],
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df,
                         index=['Branch'],
                         columns=[
                             Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')
                         ],
                         values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)