def get_capacity(data_df, qtiles, time_fmt, tm_col, tps_col, rt_col, capacity_window, xexp=1.0, name=None):
    """
    Compute a daily capacity estimate over a sliding window of workdays.

    For each distinct day present in data_df, the capacity model is fit on
    the data of the most recent `capacity_window` days up to and including
    that day (fewer days while the series is still short), producing one
    output row per day.

    :param data_df: DF with all data
    :param qtiles: number of quantiles used in the capacity model
    :param time_fmt: str format of time col
    :param tm_col: time column name
    :param tps_col: throughput column name
    :param rt_col: response time column name
    :param capacity_window: number of workdays used to compute the capacity
    :param xexp: exponent for generalized power function
    :param name: name of the system
    :return: DF with columns ['day', tps_col + '_cap', rt_col + '_cap',
             'traffic_cap', 'rsq_adj']
    """
    # Work on a copy: the original mutated the caller's DataFrame by adding
    # a 'day' column as a side effect.
    data_df = data_df.copy()
    data_df['day'] = data_df[tm_col].apply(
        lambda x: tm_ut.change_format(x, in_format=time_fmt, out_format='%Y-%m-%d'))
    dates = data_df['day'].sort_values().unique()

    rows = []
    for date in dates:
        # Window start: the capacity_window-th most recent day up to `date`,
        # or the earliest available day while the series is shorter than the
        # window.  (`dates <= date` always contains `date` itself, so the
        # final fallback is defensive only.)
        start_dates = list(dates[dates <= date])
        if len(start_dates) >= capacity_window:
            start_date = start_dates[-capacity_window]
        elif len(start_dates) > 0:
            start_date = start_dates[0]
        else:
            start_date = date
        df = data_df[(start_date <= data_df['day']) & (data_df['day'] <= date)].copy()
        if len(df) > 0:
            # Expected to return [tps_cap, rt_cap, traffic_cap, rsq_adj].
            result = tpe_capacity.tput_engine_cap(df, tps_col, rt_col, qtiles,
                                                  xexp=xexp, date=date, name=name)
            rows.append([date] + result)

    # Build the output frame in one shot with its final column names instead
    # of O(n^2) row-by-row .loc appends followed by a column rename.
    cap_df = pd.DataFrame(rows, columns=['day', tps_col + '_cap', rt_col + '_cap',
                                         'traffic_cap', 'rsq_adj'])
    return cap_df
def df_check(a_df, x_col, r_col, n_col, u_col, agg=None):
    """
    Validate and normalize a raw metrics DataFrame.

    Checks that the required columns exist and hold usable values,
    auto-detects the time column and its format by probing the first row,
    converts response times from msecs to seconds and rates to per-second
    values, and optionally aggregates to hourly or daily granularity.
    Exits the process (non-zero) on invalid input.

    :param a_df: input DF
    :param x_col: throughput column name
    :param r_col: response time column name (msecs in the input)
    :param n_col: name column name (must be fully non-null)
    :param u_col: users column name
    :param agg: optional time aggregation: 'hour' or 'day' (anything else
                means no aggregation)
    :return: DF with columns [n_col, time_col, x_col, r_col, u_col] plus
             'time_col' and 'time_fmt' metadata columns
    """
    # Work on a copy: the original mutated the caller's DataFrame in place
    # (reset_index and in-place column division).
    a_df = a_df.copy()
    a_df.reset_index(inplace=True, drop=True)

    if x_col not in a_df.columns or r_col not in a_df.columns or n_col not in a_df.columns:
        print('invalid columns: ' + str(x_col) + ' ' + str(r_col) + ' ' + str(n_col) + ' ' + str(a_df.columns))
        sys.exit(1)  # non-zero exit: the original exited 0, signalling success on error
    for c in [x_col, r_col]:
        if all(a_df[c].isnull()):
            print('invalid values in ' + str(c))
            sys.exit(1)
    if any(a_df[n_col].isnull()):
        print('invalid values in ' + str(n_col))
        sys.exit(1)

    # Find the time column and its str format by probing the first row's
    # values.  .iloc[0] replaces the deprecated/removed .ix accessor.
    time_fmt, time_col = None, None
    first_row = a_df.iloc[0]
    for c in first_row.index:
        time_fmt = tm_ut.get_date_format(first_row[c])
        if time_fmt is not None:
            time_col = c
            break
    # Check detection BEFORE indexing with time_col: the original computed
    # a_df[time_col] first, raising KeyError on a missing time column instead
    # of reporting 'invalid time info'.
    if time_fmt is None or time_col is None:
        print('invalid time info')
        print(a_df.head())
        sys.exit(1)
    time_secs = get_time_secs(a_df[time_col].copy(), time_fmt)
    if time_secs is None:
        print('invalid time info')
        print(a_df.head())
        sys.exit(1)

    a_df[r_col] /= 1000.0     # all our response times are in msecs
    a_df[x_col] /= time_secs  # tput always in tps
    a_df[u_col] /= time_secs  # users per sec also, mainly to avoid huge values

    # time aggregation
    agg_val = agg if agg in ['hour', 'day'] else None
    if agg_val is None:
        out_df = a_df[[n_col, time_col, x_col, r_col, u_col]].copy()
        t_fmt = time_fmt
    else:
        # hour or day aggregation
        t_fmt = '%Y-%m-%d-%H' if agg_val == 'hour' else '%Y-%m-%d'
        a_df[time_col] = a_df[time_col].apply(
            lambda x: tm_ut.change_format(x, in_format=time_fmt, out_format=t_fmt))
        a_df['x*r'] = a_df[x_col] * a_df[r_col]
        # Per-column grouped aggregations: the original nested-dict renaming
        # spec ({col: {newname: func}}) was deprecated and then removed in
        # pandas >= 0.25; this form also avoids MultiIndex column keys.
        grouped = a_df.groupby([time_col, n_col])
        agg_df = pd.DataFrame({
            'x_avg': grouped[x_col].mean(),
            'x_sum': grouped[x_col].sum(),
            'u_avg': grouped[u_col].mean(),
            'xr_sum': grouped['x*r'].sum(),
        }).reset_index()
        out_df = pd.DataFrame(columns=[n_col, time_col, x_col, r_col, u_col])
        # Response time is a weighted avg: sum(x*r) / sum(x).
        out_df[r_col] = agg_df['xr_sum'] / agg_df['x_sum']
        out_df[x_col] = agg_df['x_avg']
        out_df[u_col] = agg_df['u_avg']
        out_df[time_col] = agg_df[time_col]
        out_df[n_col] = agg_df[n_col]

    # Carry the detected time metadata so downstream code knows both the
    # time column name and its format.
    out_df['time_col'] = time_col
    out_df['time_fmt'] = t_fmt
    return out_df