def calc_month(df, month_type, build_type):
    """Aggregate one month-long slice of ``df`` and suffix its columns.

    The slice is a half-open ``[lower, upper)`` window on ``df['date']``
    that starts 0, 1 or 2 months after ``week_extract.get_min_month()``:

    * TRAIN + MONTH1                      -> offset 0
    * TRAIN + MONTH2 or PREDICT + MONTH1  -> offset 1
    * any other combination               -> offset 2

    The selected rows go through ``day_extract.groupby_calc`` and
    ``month_extract.calc_month_data``; the resulting feature columns are
    then renamed so that they carry ``month_type`` as a suffix.

    :param df: frame with a ``date`` column comparable to the month bounds.
    :param month_type: string appended to the renamed column names.
    :param build_type: compared against ``TRAIN`` / ``PREDICT``.
    :return: the aggregated frame with renamed columns.
    """
    base_day = week_extract.get_min_month()

    if build_type == TRAIN and month_type == MONTH1:
        offset = 0
    elif (build_type == TRAIN and month_type == MONTH2) or \
            (build_type == PREDICT and month_type == MONTH1):
        offset = 1
    else:
        offset = 2

    # The first window starts at the base day itself; later ones are shifted.
    lower = base_day if offset == 0 else timeOpt.add_months(base_day, offset)
    upper = timeOpt.add_months(base_day, offset + 1)
    in_window = (df['date'] >= lower) & (df['date'] < upper)
    df_month = df[in_window]

    grouped = day_extract.groupby_calc(df_month)
    monthly = grouped.apply(month_extract.calc_month_data)
    monthly = monthly.reset_index(drop=True)

    # Columns that receive "Month" + month_type.
    month_suffixed = (
        'activeDay', 'totalConnectTime', 'dlTraffic', 'ulTraffic',
        'totalDlUlTrafficPerday', 'dlTrafficPerday', 'ulTrafficPerday',
        'totalConnectTimePerday',
        'MinRSRP', 'MaxRSRP', 'AvgRSRP', 'StdRSRP',
        'MinSINR', 'MaxSINR', 'AvgSINR', 'StdSINR',
    )
    # Columns already ending in "Monthly": only month_type is appended.
    plain_suffixed = ('totalDlUlTrafficMonthly', 'dlTrafficRatioMonthly')

    new_names = {col: col + 'Month' + month_type for col in month_suffixed}
    for col in plain_suffixed:
        new_names[col] = col + month_type

    return pd.DataFrame(monthly).rename(columns=new_names)
def get_pre_month():
    """Return the two month windows that follow the minimum month.

    :return: a list of ``(index, start, end)`` tuples, where window 1 runs
        from +1 to +2 months and window 2 from +2 to +3 months after
        ``week_extract.get_min_month()``.
    """
    base = week_extract.get_min_month()

    first_window = (1,
                    timeOpt.add_months(base, 1),
                    timeOpt.add_months(base, 2))
    second_window = (2,
                     timeOpt.add_months(base, 2),
                     timeOpt.add_months(base, 3))
    return [first_window, second_window]
def get_post_df():
    """Load the configured post-evaluation columns from a range of CSV files.

    The files are those returned by ``week_extract.get_file_by_range`` for
    the bounds +2 months and +3 months after ``week_extract.get_min_month()``.
    From each file only the columns listed under
    ``setting.parameter_json["post_eva_from_day_column_name"]`` are kept.

    :return: one DataFrame holding the rows of all files (original per-file
        indices are kept); an empty frame with the configured columns when
        no file is returned.
    """
    min_month = week_extract.get_min_month()
    all_file = week_extract.get_file_by_range(
        timeOpt.add_months(min_month, 2), timeOpt.add_months(min_month, 3))
    columns = setting.parameter_json["post_eva_from_day_column_name"]

    # Seed with an empty frame so the result still has the configured
    # columns when there are no files to read.
    frames = [pd.DataFrame(columns=columns)]

    # NOTE(review): ``error_bad_lines`` is deprecated since pandas 1.3 and
    # removed in 2.0 (replacement: ``on_bad_lines``). Left unchanged here so
    # the call keeps working on the pandas version this project targets.
    frames.extend(
        pd.read_csv(f, error_bad_lines=False, index_col=False)[columns]
        for f in all_file)

    # A single concat replaces the per-file ``df.append`` loop, which
    # re-copied the whole accumulated frame on every iteration.
    return pd.concat(frames)
def build_feature(df):
    """Build the training and prediction feature files from ``df``.

    Three half-open date windows are cut from ``df``, measured in months
    from ``week_extract.get_min_month()``:

    * training rows:   [+0, +2)
    * prediction rows: [+1, +3)
    * churn reference: [+2, +3) - only the ``esn`` values, as a set of str

    ``compress.empty_folder`` is called on ``setting.model_path`` first, then
    ``trainData.csv`` and ``predictData.csv`` are written into that folder.
    The prediction file contains only rows whose ``churnLabel`` is below 1.

    :param df: frame with at least ``date`` and ``esn`` columns.
    :return: always 0.
    """
    first_day = week_extract.get_min_month()

    def rows_between(lower, upper):
        # Rows whose date falls in the half-open interval [lower, upper).
        return df[(df['date'] >= lower) & (df['date'] < upper)]

    train_rows = rows_between(first_day,
                              timeOpt.add_months(first_day, 2))
    predict_rows = rows_between(timeOpt.add_months(first_day, 1),
                                timeOpt.add_months(first_day, 3))
    churn_rows = rows_between(timeOpt.add_months(first_day, 2),
                              timeOpt.add_months(first_day, 3))
    churn_esns = set(churn_rows['esn'].astype('str').values)

    compress.empty_folder(setting.model_path)

    train_features = build(train_rows, churn_esns, TRAIN)
    train_file = os.path.join(setting.model_path, r"trainData.csv")
    train_features.to_csv(train_file, index=False)

    predict_features = build(predict_rows, churn_esns, PREDICT)
    not_churned = predict_features[(predict_features['churnLabel'] < 1)]
    predict_file = os.path.join(setting.model_path, r"predictData.csv")
    not_churned.to_csv(predict_file, index=False)

    return 0
def run():
    """Entry point: load three months of data and build the feature files.

    Calls ``multiprocessing.freeze_support()`` first, then fetches the data
    between ``week_extract.get_min_month()`` and three months after it via
    ``get_data_by_range`` and hands the result to ``build_feature``.
    """
    multiprocessing.freeze_support()

    first_day = week_extract.get_min_month()
    last_day = timeOpt.add_months(first_day, 3)
    data = get_data_by_range(first_day, last_day)
    build_feature(data)