Example no. 1
0
def get_capacity(data_df, qtiles, time_fmt, tm_col, tps_col, rt_col, capacity_window, xexp=1.0, name=None):
    """
    Compute daily capacity over a sliding window of observed days.

    For each distinct day in the data, fits the capacity model on the rows
    belonging to the last `capacity_window` observed days ending at that day.

    :param data_df: DF with all data. NOTE: a 'day' column is added in place
                    (kept from the original implementation in case callers rely on it).
    :param qtiles: number of quantiles used in the capacity model
    :param capacity_window: number of workdays used to compute the capacity
    :param xexp: exponent for generalized power function
    :param time_fmt: str format of time col
    :param tm_col: time column name
    :param tps_col: throughput column name
    :param rt_col: response time column name
    :param name: name of the system
    :return: DF with columns ['day', tps_col + '_cap', rt_col + '_cap', 'traffic_cap', 'rsq_adj'],
             one row per day that had data in its window
    """
    data_df['day'] = data_df[tm_col].apply(
        lambda x: tm_ut.change_format(x, in_format=time_fmt, out_format='%Y-%m-%d'))
    dates = data_df['day'].sort_values().unique()

    rows = []
    for date in dates:
        start_dates = list(dates[dates <= date])
        # Window start: the capacity_window-th most recent observed day, or the
        # earliest one when fewer days exist. start_dates is never empty here
        # because `date` itself always satisfies `dates <= date`.
        if len(start_dates) >= capacity_window:
            start_date = start_dates[-capacity_window]
        else:
            start_date = start_dates[0]
        df = data_df[(start_date <= data_df['day']) & (data_df['day'] <= date)].copy()
        if len(df) > 0:
            result = tpe_capacity.tput_engine_cap(df, tps_col, rt_col, qtiles, xexp=xexp, date=date, name=name)
            rows.append([date] + result)

    # Build the frame once from collected rows: the original appended row-by-row
    # via cap_df.loc[row] (quadratic) and set the column names twice (the initial
    # names were immediately overwritten at the end).
    return pd.DataFrame(rows, columns=['day', tps_col + '_cap', rt_col + '_cap', 'traffic_cap', 'rsq_adj'])
Example no. 2
0
def df_check(a_df, x_col, r_col, n_col, u_col, agg=None):
    """
    Validate the raw input frame, normalize units, and optionally aggregate by time.

    :param a_df: input DF; modified in place (index reset, unit rescaling,
                 time reformatting when aggregating)
    :param x_col: throughput column name (rescaled to transactions per second)
    :param r_col: response time column name (msecs on input, converted to secs)
    :param n_col: name/system column; must contain no nulls
    :param u_col: users column name (rescaled to users per second)
    :param agg: 'hour' or 'day' for time aggregation; anything else means none
    :return: DF with columns [n_col, time col, x_col, r_col, u_col] plus
             'time_col' and 'time_fmt' metadata columns
    Exits the process (non-zero status) on invalid input.
    """
    a_df.reset_index(inplace=True, drop=True)

    # Required columns must be present.
    if x_col not in a_df.columns or r_col not in a_df.columns or n_col not in a_df.columns:
        print('invalid columns: ' + str(x_col) + ' ' + str(r_col) + ' ' + str(n_col) + ' ' + str(a_df.columns))
        sys.exit(1)  # non-zero: this is an error path, not success

    # Throughput / response-time columns must hold at least one value.
    for c in [x_col, r_col]:
        if a_df[c].isnull().all():
            print('invalid values in ' + str(c))
            sys.exit(1)

    # The name column must be fully populated.
    if a_df[n_col].isnull().any():
        print('invalid values in ' + str(n_col))
        sys.exit(1)

    # Detect the time column by probing the first row's values for a parseable
    # date format. (.iloc replaces the long-removed DataFrame.ix accessor.)
    time_fmt, time_col = None, None
    first_row = a_df.iloc[0]
    for c in first_row.index:
        time_fmt = tm_ut.get_date_format(first_row[c])
        if time_fmt is not None:
            time_col = c
            break

    # Bail out BEFORE indexing with a None column name: the original called
    # get_time_secs(a_df[time_col], ...) first and raised a KeyError when no
    # time column was found, so its own 'invalid time info' message never ran.
    if time_fmt is None:
        print('invalid time info')
        print(a_df.head())
        sys.exit(1)
    time_secs = get_time_secs(a_df[time_col].copy(), time_fmt)
    if time_secs is None:
        print('invalid time info')
        print(a_df.head())
        sys.exit(1)

    a_df[r_col] /= 1000.0     # all our response times are in msecs
    a_df[x_col] /= time_secs  # tput always in tps
    a_df[u_col] /= time_secs  # look at users per sec also, mainly to avoid huge values

    # Time aggregation.
    agg_val = None if agg not in ['hour', 'day'] else agg
    if agg_val is None:
        out_df = a_df[[n_col, time_col, x_col, r_col, u_col]].copy()
        t_fmt = time_fmt
    else:                  # hour or day aggregation
        t_fmt = '%Y-%m-%d-%H' if agg_val == 'hour' else '%Y-%m-%d'
        a_df[time_col] = a_df[time_col].apply(lambda x: tm_ut.change_format(x, in_format=time_fmt, out_format=t_fmt))
        a_df['x*r'] = a_df[x_col] * a_df[r_col]
        # Named aggregation: the dict-of-dict renaming form used originally was
        # removed in pandas 0.25. The unused u_col sum aggregate is dropped.
        agg_df = a_df.groupby([time_col, n_col], as_index=False).agg(
            x_avg=(x_col, 'mean'),
            x_sum=(x_col, 'sum'),
            u_avg=(u_col, 'mean'),
            xr_sum=('x*r', 'sum'))
        out_df = pd.DataFrame(columns=[n_col, time_col, x_col, r_col, u_col])
        out_df[r_col] = agg_df['xr_sum'] / agg_df['x_sum']  # response time is a throughput-weighted avg
        out_df[x_col] = agg_df['x_avg']
        out_df[u_col] = agg_df['u_avg']
        out_df[time_col] = agg_df[time_col]
        out_df[n_col] = agg_df[n_col]

    out_df['time_col'] = time_col  # metadata: which column holds the time values
    out_df['time_fmt'] = t_fmt     # metadata: the format of those values
    return out_df