Example #1
def regression(col_y, col_x, lag, season_tf, num_day, day_of_week, df):
    # Rows at the season dates (these supply the target column col_y)
    dates_1 = lib.generate_season_date_by_dof(season_tf, num_day, day_of_week,
                                              df['date'])
    df1 = lib.get_df_by_dates(dates_1, df)

    # Rows at the lagged dates (these supply the predictor column col_x)
    dates_2 = lib.get_dates_lag(dates_1, lag)
    df2 = lib.get_df_by_dates(dates_2, df)
    col1_data = df2[col_x]
    col2_data = df1[col_y]

    col1_data_rm, col2_data_rm = lib.remove_outlier(col1_data, col2_data)
    regr = linear_model.LinearRegression()
    col1_data_rm = np.asarray(col1_data_rm).reshape(-1, 1)  # sklearn needs a 2-D feature matrix
    try:
        regr.fit(col1_data_rm, col2_data_rm)
        result = {
            'coef': regr.coef_[0],
            'intercept': regr.intercept_,
            'score': regr.score(col1_data_rm, col2_data_rm)
        }
    except Exception:
        # Log the inputs that caused the failure, then re-raise.
        print(dates_1)
        print(dates_2)
        print(col1_data, col2_data)
        raise
    return result
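The lib helpers above are not shown, but the core of this example is the reshape-and-fit step: scikit-learn expects a 2-D feature matrix, so the 1-D predictor is reshaped before fitting. A minimal, self-contained sketch of that step with synthetic placeholder data (no lib dependencies):

import numpy as np
from sklearn import linear_model

# Synthetic stand-ins for col1_data_rm / col2_data_rm after outlier removal.
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
y = 2.5 * x + 1.0

regr = linear_model.LinearRegression()
x_2d = x.reshape(-1, 1)           # shape (n_samples, n_features)
regr.fit(x_2d, y)
print({
    'coef': regr.coef_[0],         # ~2.5
    'intercept': regr.intercept_,  # ~1.0
    'score': regr.score(x_2d, y),  # R^2 on the training data
})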
Example #2
def compute_corr_ver_2_week_1(dep_id, seg_id, season_tf, significance_level,
                              df, year_input, date_ss):
    y_dates = date_ss
    y_col = "{}_{}_rv".format(dep_id, seg_id)  # revenue column name
    df1_full = lib.get_df_by_dates(y_dates, df)
    y_series = df1_full[y_col]
    cols = set(df.columns) - {'date', 'day_of_week'}
    result_df = pd.DataFrame(columns=[
        'col', "cov_col", 'lag', 'cov_value', "conf_interval_0",
        "conf_interval_1", "adj_corr_col"
    ])
    # x is each candidate column, taken at each lag
    index = 0
    # Cache frames per lag: "<lag>_2" holds the lagged rows, "<lag>_1" the
    # y rows trimmed to the same length.
    dict_df = {}
    dict_df["0_2"] = df1_full[df1_full.columns]
    dict_df["0_1"] = df1_full[df1_full.columns]
    for lag in [1, 7, 365]:
        col_dates = lib.get_dates_lag(y_dates, lag)
        df2 = lib.get_df_by_dates(col_dates, df)
        dict_df[str(lag) + "_2"] = df2[df2.columns]
        dict_df[str(lag) + "_1"] = df1_full[(len(df1_full) - len(df2)):]
    for i, col in enumerate(cols):
        if lib.check_total(y_col, col):
            continue
        for lag in [0, 1, 7, 365]:
            if (col.split('_')[0] != str(dep_id)) and (lag not in [0, 1]):
                continue
            if (col.split('_')[0] == str(dep_id)) and (lag in [0, 1]):
                continue
            df1 = dict_df[str(lag) + "_1"]
            df2 = dict_df[str(lag) + "_2"]
            col_series = df2[col]
            y_series_rm = y_series[-len(col_series):]
            y_series_rm, col_series = lib.remove_outlier(
                y_series_rm, col_series)
            cap_arr = compute_capture_arr_ver_2(y_col, col, 1, 1, df1, df2,
                                                cols)
            cov_val = 1 - lib.compute_error(y_series_rm, col_series,
                                            cap_arr['capture'], cap_arr['ARR'])
            #===========================
            if lag == 365:
                y_365_lag = df2[col]
                date_y_365_lag_not_null = df2[~y_365_lag.isnull()]['date']
                if len(date_y_365_lag_not_null.dt.year.unique()) < 3:
                    cov_val = -99  # not enough yearly history for a 365-day lag
            #===========================
            if "rn" in col:
                conf_interval_0, conf_interval_1 = cov_val, cov_val
            else:
                conf_interval_0, conf_interval_1 = lib.compute_interval(
                    cov_val, significance_level)
            result_df.loc[index] = [
                y_col, col, lag, cov_val, conf_interval_0, conf_interval_1,
                col + "_" + str(lag)
            ]
            index += 1
    result_df = result_df.fillna(-99)
    return result_df.sort_values('cov_value', ascending=False)
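lib.get_dates_lag and lib.get_df_by_dates are not shown here; the pattern they support appears to be "take the rows for the target dates, then the rows shifted back by lag days, and pair them up". A plain-pandas sketch of that idea with made-up data (the column name and values are placeholders, not the project's API):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'date': pd.date_range('2020-01-01', periods=30, freq='D'),
    '1_2_rv': np.random.rand(30) * 1000.0,
})
y_dates = df['date'].iloc[10:]                # stand-in for date_ss
lag = 7
col_dates = y_dates - pd.Timedelta(days=lag)  # dates shifted back by `lag` days
df1 = df[df['date'].isin(y_dates)]            # rows for the y column
df2 = df[df['date'].isin(col_dates)]          # rows for the lagged column
y_series = df1['1_2_rv'].reset_index(drop=True)
col_series = df2['1_2_rv'].reset_index(drop=True)
# Trim y to the lagged length, as the functions above do with y_series[-len(col_series):]
y_series = y_series[-len(col_series):]
print(len(y_series), len(col_series))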
Example #3
def get_series_to_compute_corr(y_col_name, x_col_name, lag, y_dates, df):
    """
    Prepare series to compute correlation
    :param y_col_name: name of column y
    :param x_col_name: name of column x
    :param lag: lag days between y_col and x_col
    :param y_dates: dates series by y_col
    :param df: dataframe contain all data of one hotel
    :return:
    """
    col_dates = lib.get_dates_lag(y_dates, lag)
    y_series = lib.get_df_by_dates(y_dates, df)[y_col_name]
    col_series = lib.get_df_by_dates(col_dates, df)[x_col_name]
    # If the x_col data is shorter than y_col, trim y to the length of x_col.
    return y_series[:len(col_series)], col_series
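The docstring's truncation rule in isolation: when the lagged dates run past the start of the available data, col_series comes back shorter and y is cut down to match. A tiny illustration with arbitrary values:

import pandas as pd

y_series = pd.Series([10, 20, 30, 40, 50])
col_series = pd.Series([1, 2, 3])         # lagged series lost two rows at the edge
y_trimmed = y_series[:len(col_series)]
print(list(y_trimmed), list(col_series))  # [10, 20, 30] [1, 2, 3]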
Example #4
def compute_corr_ver_3_week_1(dep_id, seg_id, season_tf, significance_level,
                              df, date_ss):
    """
    Compute correlation for all pair of deparment_segment in one hotel
    :param dep_id: department id
    :param seg_id: segment id
    :param season_tf: season timeframe
    :param day_of_week:  day of week range 0-6
    :param df: Dataframe contain all data of one hotel
    :param significance_level: significance to compute confidence interval
    :return:
    """
    y_dates = date_ss
    y_col = "{}_{}_rv".format(dep_id, seg_id)  # revenue column name
    y_series = lib.get_df_by_dates(y_dates, df)[y_col]
    cols = set(df.columns) - {'date', 'day_of_week'}
    result_df = pd.DataFrame(columns=[
        'col', "cov_col", 'lag', 'cov_value', "conf_interval_0",
        "conf_interval_1", "adj_corr_col"
    ])
    # x is each candidate column, taken at each lag
    index = 0

    for i, col in enumerate(cols):
        if lib.check_total(y_col, col):
            continue
        for lag in [0, 1, 7, 14, 21, 30, 365]:
            if (col.split('_')[0] != str(dep_id)) and (lag not in [0, 1]):
                continue
            if (col.split('_')[0] == str(dep_id)) and (lag in [0, 1]):
                continue
            col_dates = lib.get_dates_lag(y_dates, lag)
            col_series = lib.get_df_by_dates(col_dates, df)[col]
            y_series_rm, col_series = lib.remove_outlier(y_series, col_series)
            cov_val = np.corrcoef(y_series_rm, col_series)[1, 0]
            if "rn" in col:
                conf_interval_0, conf_interval_1 = cov_val, cov_val
            else:
                conf_interval_0, conf_interval_1 = lib.compute_interval(
                    cov_val, significance_level)
            result_df.loc[index] = [
                y_col, col, lag, cov_val, conf_interval_0, conf_interval_1,
                col + "_" + str(lag)
            ]
            index += 1
    result_df = result_df.fillna(-99)
    return result_df.sort_values('cov_value', ascending=False)
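lib.compute_interval is not shown; one common way to turn a Pearson correlation such as cov_val into a confidence interval is the Fisher z-transform. This is only a sketch of that standard approach, not necessarily what lib implements:

import numpy as np
from scipy.stats import norm

def fisher_confidence_interval(r, n, significance_level=0.05):
    """Approximate confidence interval for a Pearson correlation r from n pairs."""
    z = np.arctanh(r)                      # Fisher z-transform
    se = 1.0 / np.sqrt(n - 3)              # standard error of z
    z_crit = norm.ppf(1 - significance_level / 2)
    lo, hi = z - z_crit * se, z + z_crit * se
    return np.tanh(lo), np.tanh(hi)        # back to the correlation scale

print(fisher_confidence_interval(0.6, n=52))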
Example #5
def get_df_corr_data(dep_id, seg_id, season_tf, day_of_week, df):
    """
    Get data to test. It's consistent with the ways getting data in compute_corr
    :param dep_id: department id
    :param seg_id: segment id
    :param season_tf: season timeframe
    :param day_of_week:  day of week range 0-6
    :param df: Dataframe contain all data of one hotel
    :return:
    """
    dates = df['date'].sort_values(ascending=False)
    season_dates = pd.Series(lib.get_season_dates_from_tf(season_tf, dates))
    y_dates = list(season_dates[season_dates.dt.weekday == day_of_week])
    y_col = "{}_{}_rv".format(dep_id, seg_id)  # revenue column name
    cols = set(df.columns) - {'date', 'day_of_week'}
    y_series = lib.get_df_by_dates(y_dates, df)[y_col]
    df_result = pd.DataFrame({y_col: list(y_series)})
    for lag in [0, 1, 7, 14, 21, 30, 365]:
        df_result["date_{}".format(lag)] = lib.get_dates_lag(y_dates, lag)
        for i, col in enumerate(cols):
            if (col.split('_')[0] != str(dep_id)) and (lag not in [0, 1]):
                continue
            if (col.split('_')[0] == str(dep_id)) and (lag in [0, 1]):
                continue
            y_series, col_series = get_series_to_compute_corr(
                y_col, col, lag, y_dates, df)
            if len(col_series) < len(y_dates):
                df_result["{}_{}".format(
                    col, lag)] = list(col_series) + ['F'] * (len(y_dates) -
                                                             len(col_series))
            else:
                df_result["{}_{}".format(col, lag)] = list(col_series)
    return df_result
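The 'F' padding above in isolation: shorter lagged columns are padded with a marker so every column fits the same test-frame length (the values here are arbitrary):

col_series = [5, 6, 7]
n_target = 5
padded = list(col_series) + ['F'] * (n_target - len(col_series))
print(padded)  # [5, 6, 7, 'F', 'F']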
Example #6
def compute_capture_arr_week(col1, col2, lag, season_tf, df, year_input):
    """ neeed to change the comment later
    Compute capture and arr for original column col1 and lag column col2
    :param col1: dep_seg id of original column
    :param col2: dep_seg id of lag column
    :param lag:
    :param season_tf: season time frame
    :param day_of_week:
    :param df: dataframe contain data of one hotel
    :return: dictionary contain 'capture', 'ARR', 'capture_unit', 'ARR_unit'
    """
    # Get dataframe for col1
    dates_1 = lib.generate_season_date(season_tf, year_input)
    df1 = lib.get_df_by_dates(dates_1, df)

    # Get dataframe for col2
    dates_2 = lib.get_dates_lag(dates_1, lag)
    df2 = lib.get_df_by_dates(dates_2, df)
    root_col1 = lib.get_root_col(col1)
    root_col2 = lib.get_root_col(col2)
    type_col2 = lib.get_type_col(col2)
    col1_rv = "{}_rv".format(root_col1)
    col1_rn = "{}_rn".format(root_col1)
    col2_rv = "{}_rv".format(root_col2)
    col2_rn = "{}_rn".format(root_col2)
    col2_gn = "{}_gn".format(root_col2)

    capture = 1
    ARR = 0
    capture_unit = 1
    ARR_unit = 0

    try:
        if root_col1 == root_col2:
            if col1_rn in df.columns:  # does a unit (rn) column exist?
                capture = 1
                col1_rn_rm, col2_rn_rm = lib.remove_outlier(
                    df1[col1_rn], df2[col2_rn])
                ARR = (col1_rn_rm / col2_rn_rm).replace(
                    [np.inf, -np.inf], np.nan).dropna().median()
                capture_unit = 1
                ARR_unit = (col1_rn_rm / col2_rn_rm).replace(
                    [np.inf, -np.inf], np.nan).dropna().median()
            else:
                capture = 1
                col1_rv_rm, col2_rm = lib.remove_outlier(
                    df1[col1_rv], df2[col2])
                ARR = (col1_rv_rm / col2_rm).replace([np.inf, -np.inf],
                                                     np.nan).dropna().median()
                capture_unit = 1
                ARR_unit = (col1_rv_rm / col2_rm).replace(
                    [np.inf, -np.inf], np.nan).dropna().median()
        else:
            if col1_rn in df.columns:  # does a unit (rn) column exist?
                if type_col2 == 'rv':
                    capture = 1
                    col1_rv_rm, col2_rv_rm = lib.remove_outlier(
                        df1[col1_rv], df2[col2_rv])
                    ARR = (col1_rv_rm / col2_rv_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                    capture_unit = 1
                    col1_rn_rm, col2_rv_rm = lib.remove_outlier(
                        df1[col1_rn], df2[col2_rv])
                    ARR_unit = (col1_rn_rm / col2_rv_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                elif type_col2 == 'rn':
                    col1_rn_rm, col2_rn_rm = lib.remove_outlier(
                        df1[col1_rn], df2[col2_rn])
                    capture = (col1_rn_rm / col2_rn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                    col1_rv_rm, col1_rn_rm = lib.remove_outlier(
                        df1[col1_rv], df1[col1_rn])
                    ARR = (col1_rv_rm / col1_rn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                    capture_unit = 1
                    col1_rn_rm, col2_rn_rm = lib.remove_outlier(
                        df1[col1_rn], df2[col2_rn])
                    ARR_unit = (col1_rn_rm / col2_rn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                elif type_col2 == 'gn':
                    col1_rn_rm, col2_gn_rm = lib.remove_outlier(
                        df1[col1_rn], df2[col2_gn])
                    capture = (col1_rn_rm / col2_gn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                    col1_rv_rm, col1_rn_rm = lib.remove_outlier(
                        df1[col1_rv], df1[col1_rn])
                    ARR = (col1_rv_rm / col1_rn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                    capture_unit = 1
                    col1_rn_rm, col2_gn_rm = lib.remove_outlier(
                        df1[col1_rn], df2[col2_gn])
                    ARR_unit = (col1_rn_rm / col2_gn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
            else:
                if type_col2 == 'rv':
                    capture = 1
                    col1_rm, col2_rv_rm = lib.remove_outlier(
                        df1[col1], df2[col2_rv])
                    ARR = (col1_rm / col2_rv_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                elif type_col2 == 'rn':
                    capture = 1
                    col1_rm, col2_rn_rm = lib.remove_outlier(
                        df1[col1], df2[col2_rn])
                    ARR = (col1_rm / col2_rn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
                elif type_col2 == 'gn':
                    capture = 1
                    col1_rm, col2_gn_rm = lib.remove_outlier(
                        df1[col1], df2[col2_gn])
                    ARR = (col1_rm / col2_gn_rm).replace(
                        [np.inf, -np.inf], np.nan).dropna().median()
    except Exception:
        # Fall back to the defaults if any ratio computation fails.
        capture = 1
        ARR = 0

    capture = 0 if pd.isnull(capture) else capture
    ARR = 0 if pd.isnull(ARR) else ARR
    ARR_unit = 0 if pd.isnull(ARR_unit) else ARR_unit
    result = {
        'capture': capture,
        'ARR': ARR,
        'capture_unit': capture_unit,
        'ARR_unit': ARR_unit
    }
    return result
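The ratio pattern repeated throughout this function, in isolation: divide two aligned series, drop the +/-inf produced by zero denominators along with NaN, then take the median. A self-contained illustration with made-up numbers:

import numpy as np
import pandas as pd

numerator = pd.Series([100.0, 200.0, 0.0, 300.0, np.nan])
denominator = pd.Series([10.0, 0.0, 5.0, 30.0, 4.0])
ratio = (numerator / denominator).replace([np.inf, -np.inf], np.nan).dropna()
print(list(ratio))     # [10.0, 0.0, 10.0]
print(ratio.median())  # 10.0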
Example #7
def compute_corr_ver_2_1_cruise(dep_id, seg_id, season_tf, num_day,
                                day_of_week, significance_level, df, date_ss,
                                cruise):
    """
    Compute explanation (1 -error) for all pair of deparment_segment in one hotel
    :param dep_id: department id
    :param seg_id: segment id
    :param season_tf: season timeframe
    :param day_of_week:  day of week range 0-6
    :param df: Dataframe contain all data of one hotel
    :param significance_level: significance to compute confidence interval
    :return:
    """
    y_weekdays = cruise.cruiseday_list(date_ss)
    y_dates_full = [
        date_ss[i] for i, d in enumerate(y_weekdays) if d == day_of_week
    ]
    y_dates = y_dates_full[-(min(num_day, len(y_dates_full))):]
    y_col = "{}_{}_rv".format(dep_id, seg_id)  # revenue column name
    df1_full = lib.get_df_by_dates(y_dates_full, df)
    y_series = df1_full[y_col]

    df1 = lib.get_df_by_dates(y_dates, df1_full)
    cols = set(df.columns) - {'date', 'day_of_week'}
    result_df = pd.DataFrame(columns=[
        'col', "cov_col", 'lag', 'cov_value', "conf_interval_0",
        "conf_interval_1", "adj_corr_col"
    ])
    # x is each candidate column, taken at each lag
    index = 0
    dict_df = {}
    dict_df[0] = df1[df1.columns]
    dict_df["0_full"] = df1_full[df1_full.columns]
    for lag in [1, 7]:  #remove lag 365
        dates = lib.get_dates_lag(y_dates, lag)
        dates_full = lib.get_dates_lag(y_dates_full, lag)
        df2_full = lib.get_df_by_dates(dates_full, df)
        df2 = lib.get_df_by_dates(dates, df2_full)
        dict_df[lag] = df2
        dict_df[str(lag) + "_full"] = df2_full
    for i, col in enumerate(cols):
        if lib.check_total(y_col, col):
            continue
        for lag in [0, 1, 7]:  #remove lag 365
            if (col.split('_')[0] != str(dep_id)) and (lag not in [0, 1]):
                continue
            if (col.split('_')[0] == str(dep_id)) and (lag in [0, 1]):
                continue
            col_series = dict_df[str(lag) + "_full"][col]
            y_series_r = y_series[-len(col_series):]
            df2 = dict_df[lag]
            y_series_rm, col_series_rm = lib.remove_outlier(
                y_series_r, col_series)
            cap_arr = compute_capture_arr_ver_2(y_col, col, 1, 1, df1, df2,
                                                cols)
            cov_val = 1 - lib.compute_error(y_series_rm, col_series_rm,
                                            cap_arr['capture'], cap_arr['ARR'])
            if y_col == "69_107_rv" and day_of_week == 6 and season_tf == [
                ('01-01', '02-17'), ('12-23', '12-31')
            ] and lag == 0:
                print('test', col)
                with open(
                        'test_case_69_107_{0}_{1}.txt'.format(season_tf, col),
                        'w') as f:
                    f.write(
                        "\ny_series: {0}\ncol_series: {1}\ny_rm: {2}\n"
                        "col_rm: {3}\ncap_arr: {4}\ncov_val: {5}".format(
                            ", ".join(map(str, y_series_r)),
                            ", ".join(map(str, col_series)),
                            ", ".join(map(str, y_series_rm)),
                            ", ".join(map(str, col_series_rm)),
                            str(cap_arr), cov_val))
            #===========================
            if lag == 365:  # unreachable here: lag is limited to [0, 1, 7]
                y_365_lag = df2[col]
                date_y_365_lag_not_null = df2[~y_365_lag.isnull()]['date']
                if len(date_y_365_lag_not_null.dt.year.unique()) < 3:
                    cov_val = -99  # not enough yearly history for a 365-day lag
            #===========================
            if "rn" in col:
                conf_interval_0, conf_interval_1 = cov_val, cov_val
            else:
                conf_interval_0, conf_interval_1 = lib.compute_interval(
                    cov_val, significance_level)
            result_df.loc[index] = [
                y_col, col, lag, cov_val, conf_interval_0, conf_interval_1,
                col + "_" + str(lag)
            ]
            index += 1
    result_df = result_df.fillna(-99)
    return result_df.sort_values('cov_value', ascending=False)
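The result-building pattern shared by these functions, in isolation: rows are appended with .loc[index], NaN scores are replaced with the -99 sentinel, and the frame is sorted by cov_value. A small self-contained version (the row values are invented for illustration):

import pandas as pd

result_df = pd.DataFrame(columns=['col', 'cov_col', 'lag', 'cov_value'])
rows = [('1_2_rv', '1_3_rn', 7, 0.81), ('1_2_rv', '2_5_rv', 0, float('nan'))]
for index, row in enumerate(rows):
    result_df.loc[index] = list(row)
result_df = result_df.fillna(-99)
print(result_df.sort_values('cov_value', ascending=False))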