class Module(object):
    """Report module describing the ticketing success rate.

    ``run`` computes the overall share of successful orders and the five
    entry stations with the lowest success rate, ``maketext`` renders the
    ``module_ticketrate`` template and ``makedata`` serialises the chart data.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_ticketrate.txt')
        self.__params = {}
        # Initialised here so that makedata() is safe to call before run().
        self.__data = {}
        self.name = "module_ticketrate"

    def run(self, df, global_params=None):
        """Compute success-rate statistics from the order table.

        Args:
            df: DataFrame with the columns ``order_status``,
                ``entry_station`` and ``ticket_num``.  A helper column
                ``is_success`` is added to it in place.
            global_params: optional dict shared between modules; receives
                ``M5_total_rate`` and ``M5_tail_stations``.
        """
        if global_params is None:
            global_params = {}

        # An order with order_status == 5 is a successful transaction.
        df['is_success'] = (df['order_status'] == 5).astype(int)
        status = df.groupby(['is_success']).ticket_num.count()

        by_station = df.groupby('entry_station')
        st_status = by_station['is_success'].mean().reset_index()
        st_status.columns = ['entry_station', 'rate']
        order_counts = by_station['ticket_num'].count().reset_index()
        st_status = st_status.merge(order_counts, on=['entry_station'],
                                    how='left')
        st_status['success_ticket'] = (
            st_status['ticket_num'] * st_status['rate']
        ).apply(lambda x: int(round(x)))
        st_status['fail_ticket'] = (
            st_status['ticket_num'] - st_status['success_ticket'])

        # Lowest success rate first: the "tail" stations of the report.
        tail = st_status.sort_values('rate', ascending=True).head(5)

        # .get() keeps the module working when no order succeeded at all.
        total_orders = status.sum()
        if total_orders:
            total_rate = status.get(1, 0) / total_orders
        else:
            total_rate = 0.0

        self.__params['M5_total_rate'] = total_rate
        self.__params['M5_tail_stations'] = tail.entry_station.tolist()
        self.__params['M5_success_tk'] = tail.success_ticket.tolist()
        self.__params['M5_fail_tk'] = tail.fail_ticket.tolist()
        self.__params['M5_rate'] = tail.rate.tolist()

        data_keys = ('M5_total_rate', 'M5_tail_stations', 'M5_success_tk',
                     'M5_fail_tk', 'M5_rate')
        params = {key: self.__params[key] for key in data_keys}
        self.__data = params

        global_params['M5_total_rate'] = '%.2f' % (
            params['M5_total_rate'] * 100)
        global_params['M5_tail_stations'] = '、'.join(
            params['M5_tail_stations'][:3])

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module about users whose first successful use was a weekend.

    ``run`` prepares the successful orders and hands the weekend subset to
    ``day_actitve_num_print`` (defined elsewhere in the project);
    ``maketext`` renders the ``module_userstay`` template and ``makedata``
    serialises the chart data.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_userstay.txt')
        self.__params = {}
        # Initialised here so that makedata() is safe to call before run().
        self.__data = {}
        self.name = "module_userstay"

    def run(self, df, global_params=None):
        """Collect the ``US_date`` / ``US_num`` series.

        Args:
            df: DataFrame with the columns ``order_status``, ``owner_id``
                and ``reg_date``.
            global_params: accepted for interface parity with the other
                modules; not used here.
        """
        # An order with order_status == 5 is a successful transaction.
        df_suc = df[df['order_status'] == 5].copy()

        # Earliest reg_date per user becomes that user's first_time.
        first_use = df_suc.groupby(['owner_id'])['reg_date'].min()
        first_use = first_use.reset_index().rename(
            columns={'reg_date': 'first_time'})
        df_suc = df_suc.merge(first_use, on=['owner_id'], how='left')

        df_suc['time'] = pd.to_datetime(df_suc['first_time'],
                                        format='%Y-%m-%d %H:%M:%S')
        df_suc['reg_date'] = df_suc['reg_date'].astype(str)
        df_suc['day'] = df_suc['time'].dt.dayofweek
        # pandas counts Monday as 0, so the weekend is Saturday (5) and
        # Sunday (6).  The previous test (0 or 6) picked Monday and Sunday.
        df_suc['is_weekend'] = (df_suc['day'] >= 5).astype(int)

        us_date, us_num = day_actitve_num_print(
            df_suc[df_suc['is_weekend'] == 1])

        self.__params['US_date'] = us_date
        self.__params['US_num'] = us_num
        self.__data = {
            'US_date': self.__params['US_date'],
            'US_num': self.__params['US_num'],
        }

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Static placeholder module: renders ``module0`` with blank values.

    It computes nothing.  Its methods accept the same optional arguments as
    the other report modules so a driver can call every module uniformly.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module0.txt')
        # Every placeholder of the template is rendered as an empty string.
        self.__params = {
            param: '' for param in self.__templete.get_params()
        }

    def run(self, df, global_params=None):
        """Do nothing; ``global_params`` exists only for interface parity."""
        return None

    def maketext(self, global_params=None):
        """Render the template with blank values.

        ``global_params`` is accepted for interface parity and ignored, so
        the rendered text is the same as before this parameter existed.
        """
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return an empty string: this module has no chart data."""
        return ''
class Module(object):
    """Report module bucketing users by number of successful orders."""

    # Usage buckets in ascending order.  The output follows this order
    # instead of the lexicographic order of the labels, which used to put
    # '20次以上' before '6-20次'.
    _LEVELS = ('1次', '2-5次', '6-20次', '20次以上')

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module0.txt')
        self.__params = {}
        self.__data = {}
        self.name = "module_usertimes"

    @staticmethod
    def _level_of(order_count):
        """Map a user's order count to its bucket label."""
        if order_count == 1:
            return '1次'
        if order_count <= 5:
            return '2-5次'
        if order_count <= 20:
            return '6-20次'
        return '20次以上'

    def run(self, df, global_params=None):
        """Count users per usage bucket.

        Args:
            df: DataFrame with the columns ``order_status``, ``owner_id``
                and ``order_no``.
            global_params: accepted for interface parity; not used here.

        Fills ``levels`` (buckets that occur, ascending), ``user_times``
        (users per bucket) and ``user_percent`` (share per bucket).
        """
        if global_params is None:
            global_params = {}

        # Only orders with order_status == 5 (successful) are counted.
        orders_per_user = df[df.order_status == 5].groupby(
            'owner_id').order_no.count()
        users_per_level = orders_per_user.map(self._level_of).value_counts()
        user_counts = {
            level: int(users_per_level[level])
            for level in self._LEVELS
            if level in users_per_level.index
        }

        # Fill the chart data.
        self.__data['levels'] = list(user_counts)
        self.__data['user_times'] = user_counts
        counts_sum = sum(user_counts.values())
        self.__data['user_percent'] = {
            level: count / counts_sum
            for level, count in user_counts.items()
        }

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module counting new users per day."""

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module0.txt')
        self.__params = {}
        self.__data = {}
        self.name = "module_newuseranalize"

    def run(self, df, global_params=None):
        """Build the daily new-user series.

        Args:
            df: DataFrame with the columns ``owner_id`` and ``reg_date``.
            global_params: accepted for interface parity; not used here.

        Fills ``new_user_day_count`` with ``[days, counts]``, days sorted
        ascending and formatted as the first ten characters of the
        timestamp's string form.
        """
        if global_params is None:
            global_params = {}

        # A user's earliest reg_date marks the day they count as new.
        # Users without any reg_date are dropped; otherwise they would show
        # up as a bogus 'NaT' day in the series.
        first_use = df.groupby('owner_id').reg_date.min().dropna()

        # Number of new users per calendar day.
        first_day = first_use.map(lambda stamp: str(stamp)[:10])
        new_user_day_count = first_day.value_counts().sort_index()

        # Fill the chart data.
        self.__data['new_user_day_count'] = [
            new_user_day_count.index.tolist(),
            new_user_day_count.tolist(),
        ]

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module listing the top-10 origin and destination stations.

    Reads the upper-case column schema (``ORDER_STATUS``, ``START_NAME``,
    ``END_NAME``, ``SINGLE_TICKET_NUM``).
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module1.txt')
        self.__params = {}

    def run(self, df, global_params=None):
        """Compute the top-10 start/end stations and their ticket share.

        Args:
            df: DataFrame with the upper-case columns named on the class.
            global_params: accepted so the module can be driven like the
                other report modules; not used here.
        """
        # An order with ORDER_STATUS == 5 is a successful transaction.
        df_suc = df[df['ORDER_STATUS'] == 5]
        pair_totals = df_suc.groupby(
            ['START_NAME', 'END_NAME']).SINGLE_TICKET_NUM.sum().reset_index()
        starts = pair_totals.groupby(
            'START_NAME').SINGLE_TICKET_NUM.sum().sort_values(ascending=False)
        ends = pair_totals.groupby(
            'END_NAME').SINGLE_TICKET_NUM.sum().sort_values(ascending=False)

        top_starts = starts.head(10)
        top_ends = ends.head(10)
        end_share = top_ends.sum() / ends.sum() * 100

        self.__params['start_top10'] = top_starts.index.tolist()
        self.__params['end_top10'] = top_ends.index.tolist()
        self.__params['start_top10_percent'] = (
            top_starts.sum() / starts.sum() * 100)
        # The misspelt key is kept because existing templates may refer to
        # it; the correctly spelt alias is offered alongside.
        self.__params['end_top10_precent'] = end_share
        self.__params['end_top10_percent'] = end_share

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return an empty string: this module has no chart data."""
        return ''
class Module(object):
    """Report module describing hourly passenger flow at the top stations.

    ``run`` finds the (up to) three busiest entry stations, the peak hours
    of the busiest one and the hourly ticket series of each.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_peopleflow.txt')
        self.__params = {}
        # Initialised here so that makedata() is safe to call before run().
        self.__data = {}
        self.name = "module_peopleflow"

    def run(self, df, global_params=None):
        """Compute the station ranking, peak hours and hourly trends.

        Args:
            df: DataFrame with the columns ``order_status``, ``entry_date``
                (parsed with ``%Y-%m-%d %H:%M:%S``), ``entry_station`` and
                ``ticket_num``.
            global_params: optional dict shared between modules; receives
                ``M3_station0_t1``, ``M3_station0_t2`` and one
                ``M3_stations[i]`` entry per ranked station.

        Stations missing from the ranking (fewer than three in the data)
        get empty trend lists instead of raising ``IndexError``.
        """
        if global_params is None:
            global_params = {}

        # An order with order_status == 5 is a successful transaction.
        df_suc = df[df['order_status'] == 5].copy()
        df_suc['time'] = pd.to_datetime(df_suc['entry_date'],
                                        format='%Y-%m-%d %H:%M:%S')
        df_suc['hour'] = df_suc['time'].dt.hour

        hourly = df_suc.groupby(
            ['entry_station', 'hour']).ticket_num.sum().reset_index()
        starts = df_suc.groupby('entry_station').ticket_num.sum().sort_values(
            ascending=False)
        top_stations = starts.head(3).index.tolist()

        # Peak hours of the busiest station: hours whose ticket count is
        # more than 1.3 times that station's hourly mean.
        peak_start = peak_end = ''
        if top_stations:
            busiest = hourly[hourly.entry_station == top_stations[0]]
            peak = busiest[
                busiest.ticket_num > busiest.ticket_num.mean() * 1.3]
            if not peak.empty:
                peak_start = int(peak.hour.min())
                peak_end = int(peak.hour.max())

        self.__params['M3_stations'] = top_stations
        self.__params['M3_station0_t1'] = peak_start
        self.__params['M3_station0_t2'] = peak_end

        params = {'M3_stations': top_stations}
        for rank in range(3):
            if rank < len(top_stations):
                rows = hourly[hourly.entry_station == top_stations[rank]]
                hours, tickets = rows.hour.tolist(), rows.ticket_num.tolist()
            else:
                hours, tickets = [], []
            params['M3_stations_trend%d_time' % (rank + 1)] = hours
            params['M3_stations_trend%d' % (rank + 1)] = tickets
        self.__data = params

        global_params['M3_station0_t1'] = self.__params['M3_station0_t1']
        global_params['M3_station0_t2'] = self.__params['M3_station0_t2']
        for rank, station in enumerate(top_stations):
            global_params['M3_stations[%d]' % rank] = station

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module about the hottest stations and routes.

    A station is "hot" when its total traffic (tickets entering plus
    tickets exiting) is above the mean over all stations.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_hotstation.txt')
        self.__params = {}
        # Initialised here so that makedata() is safe to call before run().
        self.__data = {}
        self.name = "module_hotstation"

    def run(self, df, global_params=None):
        """Compute hot stations, their daily trends and the top-10 routes.

        Args:
            df: DataFrame with the columns ``order_status``, ``entry_date``,
                ``entry_station``, ``exit_station`` and ``ticket_num``.
            global_params: optional dict shared between modules; receives
                ``M2_hotstations``, ``M2_top1``..``M2_top3`` (as far as hot
                stations exist), ``M2_hotroutes`` and
                ``M2_hotroutes_topstations``.
        """
        from collections import Counter
        from itertools import chain

        if global_params is None:
            global_params = {}

        # An order with order_status == 5 is a successful transaction.
        df_suc = df[df['order_status'] == 5].copy()
        df_suc['entry_date'] = df_suc['entry_date'].astype(str)
        df_suc['date'] = df_suc['entry_date'].str[0:10]

        route_totals = df_suc.groupby(
            ['entry_station', 'exit_station']).ticket_num.sum()
        entries = route_totals.groupby(level='entry_station').sum()
        exits = route_totals.groupby(level='exit_station').sum()
        # fill_value=0 keeps stations that occur only as an entry or only as
        # an exit; a plain sum would turn their total into NaN and silently
        # drop them from the ranking and from the mean.
        totals = entries.add(exits, fill_value=0).sort_values(ascending=False)
        hot = totals[totals > totals.mean()]
        hot_stations = hot.index.tolist()

        trend = df_suc.groupby(
            ['entry_station', 'date']).ticket_num.sum().reset_index()

        ranked_routes = route_totals.sort_values(ascending=False)
        routes_groupby = ranked_routes.index.tolist()[:10]
        # How often each station occurs among the top routes; the sort is
        # stable, so ties keep first-seen order.
        visits = Counter(chain.from_iterable(routes_groupby))
        routes = sorted(visits.items(), key=lambda item: item[1],
                        reverse=True)[:10]

        self.__params['M2_hotstations'] = hot_stations
        self.__params['M2_hotroutes'] = routes_groupby
        self.__params['M2_hotroutes_topstations'] = [
            station for station, _ in routes][:5]

        params = {
            'M2_hotstations': hot_stations,
            'M2_hotstations_ticketnum': hot.tolist(),
        }
        # Daily entry trend of the first three hot stations; a missing rank
        # gets empty lists instead of raising IndexError.
        for rank in range(3):
            if rank < len(hot_stations):
                rows = trend[trend.entry_station == hot_stations[rank]]
                dates, tickets = rows.date.tolist(), rows.ticket_num.tolist()
            else:
                dates, tickets = [], []
            params['M2_hotstations_trend%d_time' % (rank + 1)] = dates
            params['M2_hotstations_trend%d' % (rank + 1)] = tickets
        params['M2_hotroutes'] = routes_groupby
        params['M2_hotroutes_ticketnum'] = ranked_routes.tolist()[:10]
        self.__data = params

        global_params['M2_hotstations'] = '、'.join(hot_stations)
        for rank, station in enumerate(hot_stations[:3]):
            global_params['M2_top%d' % (rank + 1)] = station
        global_params['M2_hotroutes'] = '、'.join(
            '-'.join(each) for each in routes_groupby[:2])
        global_params['M2_hotroutes_topstations'] = '、'.join(
            self.__params['M2_hotroutes_topstations'][:3])

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module analysing traffic of the key stations of a city.

    For each configured station it collects the daily number of tickets
    entering and exiting, and the top destinations of passengers entering
    there.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_inout_analize.txt')
        # Key stations per city; ``run`` picks the list via the 'city'
        # global parameter.
        self.__city_dict = {'广州': ['机场南', '广州东站'], }
        self.__params = {}
        self.__data = {}
        self.name = "module_inout_analize"

    @staticmethod
    def _as_pair(series):
        """Return ``[index values, values]`` of a Series as plain lists."""
        return [series.index.tolist(), series.tolist()]

    def run(self, df, global_params=None):
        """Collect daily in/out counts and destination rankings.

        Args:
            df: DataFrame with the columns ``order_status``, ``ticket_num``,
                ``entry_date``, ``entry_station`` and ``exit_station``.
            global_params: optional dict shared between modules.  ``city``
                is read from it (default '广州'); ``stations_all``,
                ``stations[i]`` and ``st_<i>_wk_top3`` are written to it.

        Raises:
            KeyError: if the city has no entry in the station table.
        """
        if global_params is None:
            global_params = {}

        columns = ['ticket_num', 'entry_date', 'entry_station',
                   'exit_station']
        # .copy() so the helper column is added to an independent frame
        # rather than to a slice of the caller's data.
        df_suc = df.loc[df.order_status == 5, columns].copy()
        df_suc['date'] = df_suc.entry_date.map(lambda x: str(x)[:10])
        stations = self.__city_dict[global_params.get('city', '广州')]

        entry_day_nums = {}   # tickets entering per day
        exit_day_nums = {}    # tickets exiting per day
        # NOTE(review): the name and the original comment say "weekend
        # destination distribution", but no weekend filter was ever applied;
        # all days are counted, as before.  Confirm the intent.
        weekend_exits = {}
        for station in stations:
            departing = df_suc[df_suc.entry_station == station]
            arriving = df_suc[df_suc.exit_station == station]
            entry_day_nums[station] = departing.groupby(
                'date').ticket_num.sum().sort_index()
            exit_day_nums[station] = arriving.groupby(
                'date').ticket_num.sum().sort_index()
            # Top-20 destinations of passengers entering at this station.
            weekend_exits[station] = departing.groupby(
                'exit_station').ticket_num.sum().sort_values(
                    ascending=False).head(20)

        # Fill the template parameters.
        self.__params['stations'] = stations
        self.__params['stations_all'] = ','.join(stations)
        for i, station in enumerate(stations):
            self.__params['st_%d_wk_top3' % i] = '、'.join(
                weekend_exits[station].head(3).index.tolist())

        # Fill the chart data.
        self.__data['stations'] = stations
        self.__data['entry_nums'] = {
            station: self._as_pair(entry_day_nums[station])
            for station in stations}
        self.__data['exit_nums'] = {
            station: self._as_pair(exit_day_nums[station])
            for station in stations}
        self.__data['weekend_exits'] = {
            station: self._as_pair(weekend_exits[station])
            for station in stations}

        global_params['stations_all'] = '、'.join(stations)
        for i, station in enumerate(stations):
            global_params['stations[%d]' % i] = station
            global_params['st_%d_wk_top3' % i] = '、'.join(
                self.__data['weekend_exits'][station][0][:3])

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module estimating congestion on route segments.

    Each origin-destination flow is spread over the segments of its
    pre-computed route (loaded from ``routes/<city>_route.pk``).  Segment
    loads are then cut into five equal-width levels (0 = lightest,
    4 = busiest), separately for workday/holiday and morning/evening.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_jamanalize.txt')
        self.__params = {}
        self.__data = {}
        self.name = "module_jamanalize"
        # Must hold exactly two (first_hour, last_hour) pairs, both bounds
        # inclusive: the morning and the evening commuting period.
        self.__time_period = [(7, 9), (17, 19)]

    @staticmethod
    def _segment_flows(od_flows, routes):
        """Spread origin-destination ticket counts over route segments.

        Returns ``(flows, skipped)``: a dict mapping each segment to its
        accumulated ticket count, and the number of station pairs that have
        no entry in ``routes`` and were therefore skipped.
        """
        flows = {}
        skipped = 0
        for entry, exit_, tickets in od_flows:
            if entry == exit_:
                continue
            try:
                segments = routes[(entry, exit_)]
            except KeyError:
                skipped += 1
                continue
            for segment in segments:
                flows[segment] = flows.get(segment, 0) + tickets
        return flows, skipped

    @staticmethod
    def _grade_segments(flows):
        """Cut segment loads into five levels.

        Returns ``(text, levels)``: ``text`` lists the level-4 segments as
        ``start,end,line,fluency`` lines, ``levels`` maps each level 0..4 to
        a list of ``[start, end, level]`` rows.
        """
        import pandas as pd

        levels = {jam_lv: [] for jam_lv in range(5)}
        if not flows:
            return '', levels
        table = pd.DataFrame([
            {'start': seg[0], 'end': seg[1], 'line': seg[2], 'fluency': load}
            for seg, load in flows.items()
        ]).sort_values('fluency', ascending=False)
        table['level'] = pd.cut(table['fluency'], 5, labels=False)
        for jam_lv in range(5):
            levels[jam_lv] = table[table.level == jam_lv][
                ['start', 'end', 'level']].values.tolist()
        busiest = table[table.level == 4][
            ['start', 'end', 'line', 'fluency']].astype(str).values
        return '\n'.join(','.join(each) for each in busiest), levels

    def run(self, df, global_params=None):
        """Grade route segments by load for each day type and period.

        Args:
            df: DataFrame with the columns ``order_status``, ``entry_date``
                (datetime-like), ``entry_station``, ``exit_station`` and
                ``ticket_num``.
            global_params: optional dict shared between modules.  ``city``
                is read from it (default '广州'); the ``M7_AM_*`` and
                ``M7_PM_*`` entries are written to it.

        Returns without doing anything when the route file of the city
        does not exist.
        """
        import os
        import pickle as pk

        if global_params is None:
            global_params = {}

        # Shortest paths serve as the predicted route of every trip.
        city = global_params.get('city', '广州')
        route_file = 'routes/%s_route.pk' % city
        if not os.path.exists(route_file):
            return
        # NOTE: pickle executes code on load; the route file must come from
        # a trusted, project-generated source.
        with open(route_file, 'rb') as handle:
            routes = pk.load(handle)

        df_suc = df[df.order_status == 5].copy()
        df_suc['weekday'] = df_suc.entry_date.map(lambda x: x.weekday())
        df_suc['hour'] = df_suc.entry_date.map(lambda x: x.hour)

        day_types = [('workday', df_suc['weekday'] < 5),
                     ('holiday', df_suc['weekday'] >= 5)]
        periods = [('morning', self.__time_period[0]),
                   ('evening', self.__time_period[1])]

        error_counts = 0
        for d, day_mask in day_types:
            for t, (first_hour, last_hour) in periods:
                in_period = (day_mask
                             & (df_suc['hour'] >= first_hour)
                             & (df_suc['hour'] <= last_hour))
                od_flows = df_suc[in_period].groupby(
                    ['entry_station', 'exit_station']
                ).ticket_num.sum().reset_index().values
                flows, skipped = self._segment_flows(od_flows, routes)
                error_counts += skipped
                key = '%s_%s_jam_routes' % (d, t)
                self.__params[key], self.__data[key] = (
                    self._grade_segments(flows))
        print('----Jam Analize Finished. %d error meet.' % error_counts)

        route_count = len(routes)
        for prefix, key in (('AM', 'workday_morning_jam_routes'),
                            ('PM', 'workday_evening_jam_routes')):
            busiest = self.__data[key][4]
            global_params['M7_%s_busy_routes' % prefix] = '、'.join(
                '-'.join([str(x) for x in each[:2]]) for each in busiest[:3])
            global_params['M7_%s_lv4_count' % prefix] = len(busiest)
            rate = len(busiest) / route_count * 100 if route_count else 0.0
            global_params['M7_%s_lv4_rate' % prefix] = '%.2f' % rate

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module comparing hourly traffic on workdays and holidays."""

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_workholi_cmp.txt')
        # Key stations per city; ``run`` picks the list via the 'city'
        # global parameter.
        self.__city_dict = {
            '广州': ['机场南', '广州东站'],
        }
        self.name = "module_workholi_cmp"
        self.__params = {}
        self.__data = {}

    @staticmethod
    def _peak(seq):
        """Return ``(hour, value)`` of the maximum of an hourly series.

        ``idxmax`` yields the hour label; ``argmax`` would yield the
        position, which differs whenever hours are missing from the data.
        An empty series gives ``('', 0)``.
        """
        if seq.empty:
            return '', 0
        return int(seq.idxmax()), int(seq.max())

    @staticmethod
    def _as_pair(series):
        """Return ``[index values, values]`` of a Series as plain lists."""
        return [series.index.tolist(), series.tolist()]

    def run(self, df, global_params=None):
        """Build hourly ticket averages for workdays and holidays.

        Args:
            df: DataFrame with the columns ``order_status``, ``ticket_num``,
                ``entry_date`` (datetime-like), ``entry_station`` and
                ``exit_station``.
            global_params: optional dict shared between modules.  ``city``
                is read from it (default '广州'); the four
                ``*_seq_maxhour`` / ``*_seq_maxfluency`` values are written
                to it.

        Raises:
            KeyError: if the city has no entry in the station table.
        """
        if global_params is None:
            global_params = {}

        df_suc = df[df.order_status == 5][[
            'ticket_num', 'entry_date', 'entry_station', 'exit_station'
        ]].copy()
        df_suc['weekday'] = df_suc.entry_date.map(lambda x: x.weekday())
        df_suc['hour'] = df_suc.entry_date.map(lambda x: x.hour)

        # Hourly sums are divided by the time span of the data in days.
        span = df_suc.entry_date.max() - df_suc.entry_date.min()
        days = span.round('D').days
        # Under a day of data (0) or no data at all (NaN) must not be used
        # as a divisor; "not >=" also catches NaN.
        if not days >= 1:
            days = 1

        def hourly_average(frame):
            """Tickets per hour of day, averaged over ``days``."""
            totals = frame.groupby('hour').ticket_num.sum().sort_index()
            return (totals / days).map(lambda x: round(x, 3))

        is_workday = df_suc['weekday'] < 5
        workday_seq = hourly_average(df_suc[is_workday])
        holiday_seq = hourly_average(df_suc[~is_workday])

        (self.__params['workday_seq_maxhour'],
         self.__params['workday_seq_maxfluency']) = self._peak(workday_seq)
        (self.__params['holiday_seq_maxhour'],
         self.__params['holiday_seq_maxfluency']) = self._peak(holiday_seq)

        stations = self.__city_dict[global_params.get('city', '广州')]
        workday_seqs = {}
        holiday_seqs = {}
        for station in stations:
            from_station = df_suc['entry_station'] == station
            workday_seqs[station] = hourly_average(
                df_suc[from_station & is_workday])
            holiday_seqs[station] = hourly_average(
                df_suc[from_station & ~is_workday])

        # Fill the chart data.
        self.__data['workday_full_seq'] = self._as_pair(workday_seq)
        self.__data['holiday_full_seq'] = self._as_pair(holiday_seq)
        self.__data['stations'] = stations
        self.__data['station_workday_seqs'] = {
            station: self._as_pair(workday_seqs[station])
            for station in stations}
        self.__data['station_holiday_seqs'] = {
            station: self._as_pair(holiday_seqs[station])
            for station in stations}

        for key in ('workday_seq_maxhour', 'workday_seq_maxfluency',
                    'holiday_seq_maxhour', 'holiday_seq_maxfluency'):
            global_params[key] = self.__params[key]

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module about payment types and order sources.

    The shares are computed over successful orders that start at one of the
    ten busiest entry stations.
    """

    # Display names of the payment_type codes.
    _PAYMENT_TYPES = {
        0: '支付宝',
        1: '中移动',
        2: '支付宝网上购票',
        3: '微信支付',
        4: '微信扫码支付',
        5: '翼支付',
        6: '支付宝网页支付',
        7: '微信公众号支付',
        8: '首信易支付',
        9: '中移动WAP支付',
        10: '银联支付',
        11: '银联支付',
        12: '微信小程序支付',
    }
    # Display names of the source codes.
    _SOURCES = {
        1: '盘缠ios',
        2: '盘缠android',
        3: '插件ios',
        4: '插件android',
        5: 'h5公众号或扫码支付',
        6: '非闪客蜂公众号',
        7: '咖啡',
        8: '长沙ios',
        9: '长沙android',
    }

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_ticketway.txt')
        self.__params = {}
        # Initialised here so that makedata() is safe to call before run().
        self.__data = {}
        self.name = "module_ticketway"

    def run(self, df, global_params=None):
        """Compute the share of each payment type and order source.

        Args:
            df: DataFrame with the columns ``order_status``,
                ``entry_station``, ``ticket_num``, ``payment_type`` and
                ``source``.  A missing payment_type counts as code 0, a
                missing source as code 5.
            global_params: optional dict shared between modules; receives
                ``M4_stations`` and the first two entries (as far as they
                exist) of every ranking.
        """
        if global_params is None:
            global_params = {}

        # An order with order_status == 5 is a successful transaction.
        df_suc = df[df['order_status'] == 5].copy()
        starts = df_suc.groupby('entry_station').ticket_num.sum().sort_values(
            ascending=False)
        top_stations = starts.head(10).index.tolist()

        df_suc['payment_type'] = df_suc['payment_type'].fillna(0).astype(
            int).map(self._PAYMENT_TYPES)
        df_suc['source'] = df_suc['source'].fillna(5).astype(int).map(
            self._SOURCES)
        df_suc = df_suc[df_suc.entry_station.isin(top_stations)]

        total = df_suc.shape[0]
        payment_share = df_suc.groupby(
            ['payment_type']).ticket_num.count().sort_values(
                ascending=False) / total
        source_share = df_suc.groupby(
            ['source']).ticket_num.count().sort_values(
                ascending=False) / total

        # NOTE(review): 'M4_top_methods' carries the order *sources* and
        # 'M4_top_source' the *payment types*.  This looks swapped, but the
        # templates may rely on it, so the mapping is left unchanged.
        self.__params['M4_stations'] = top_stations
        self.__params['M4_top_methods'] = source_share.index.tolist()
        self.__params['M4_top_source'] = payment_share.index.tolist()
        self.__params['M4_top_perc'] = source_share.tolist()
        self.__params['M4_top_perc_source'] = payment_share.tolist()

        data_keys = ('M4_top_methods', 'M4_top_perc', 'M4_top_source',
                     'M4_top_perc_source')
        self.__data = {key: self.__params[key] for key in data_keys}

        global_params['M4_stations'] = self.__params['M4_stations']
        # Publish the first two entries of each ranking; rankings with fewer
        # entries simply publish fewer values.
        for names_key, perc_key in (
                ('M4_top_methods', 'M4_top_perc'),
                ('M4_top_source', 'M4_top_perc_source')):
            names = self.__params[names_key]
            percs = self.__params[perc_key]
            for rank in range(min(2, len(names))):
                global_params['%s[%d]' % (names_key, rank)] = names[rank]
                global_params['%s[%d]' % (perc_key, rank)] = '%.2f' % (
                    percs[rank] * 100)

    def maketext(self, global_params=None):
        """Render the template; local values win over ``global_params``."""
        # Global variables are accepted, but local ones take priority.
        if global_params and isinstance(global_params, dict):
            for key, value in global_params.items():
                self.__params.setdefault(key, value)
        # Any placeholder that is still missing is filled with ''.
        for key in self.__templete.get_params():
            self.__params.setdefault(key, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the chart data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False,
                          cls=MyEncoder)
class Module(object):
    """Report module: per-user usage statistics over successful orders.

    ``run`` derives retention, usage-day and user-model distributions,
    ``maketext`` renders the template and ``makedata`` serialises the data.
    """

    # Inclusive (low, high) ranges passed to day_ratio for the longest
    # consecutive-use streak; they line up with the 'D_cont_day' labels.
    _CONT_DAY_BUCKETS = ((1, 1), (2, 3), (4, 7), (8, 20), (21, 999))

    # Inclusive (low, high) ranges for the first-to-last-use span; they line
    # up with the 'D_total_day' labels. The last range starts at 121 so that
    # day 120 is not counted in two buckets.
    _TOTAL_DAY_BUCKETS = ((1, 1), (2, 5), (6, 20), (21, 30), (31, 60),
                          (61, 90), (91, 120), (121, 999))

    # (upper bound, counter key) pairs for the usage-frequency histogram.
    _RATIO_BOUNDS = ((0.1, 10), (0.2, 20), (0.3, 30), (0.4, 40), (0.5, 50),
                     (0.6, 60), (0.7, 70), (0.8, 80), (0.9, 90))

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_details.txt')
        self.__params = {}
        # Initialised here so makedata() is safe to call before run().
        self.__data = {}
        self.name = "module_details"

    @staticmethod
    def __tally_frame(values, column):
        """Count occurrences of each value; return a ``column``/'num' frame."""
        tally = {}
        for value in values:
            tally[value] = tally.get(value, 0) + 1
        return pd.DataFrame({
            column: list(tally.keys()),
            'num': list(tally.values())
        })

    def run(self, df, global_params=None):
        """Compute the usage distributions from successful orders.

        Only rows with ``order_status == 5`` and a non-null ``reg_date`` are
        used. ``reg_date`` values must expose ``year``/``month``/``day``/
        ``hour`` attributes and stringify as ``%Y-%m-%d %H:%M:%S``.

        Args:
            df: order DataFrame; this method reads ``order_status``,
                ``reg_date`` and ``owner_id`` (the helper functions it calls
                may read further columns).
            global_params: optional dict shared between modules; receives
                ``day_num`` and ``month_num``. When omitted a throw-away
                dict is used.
        """
        if global_params is None:
            global_params = {}

        df_suc = df[df['order_status'] == 5].copy()
        df_suc = df_suc[~pd.isnull(df_suc.reg_date)]

        # Calendar columns are derived while reg_date is still datetime-like;
        # afterwards reg_date is turned into its string form.
        df_suc['datetime'] = pd.to_datetime(df_suc['reg_date'],
                                            format='%Y-%m-%d %H:%M:%S')
        df_suc['year'] = df_suc['reg_date'].map(lambda x: x.year)
        df_suc['month'] = df_suc['reg_date'].map(lambda x: x.month)
        df_suc['day'] = df_suc['reg_date'].map(lambda x: x.day)
        df_suc['hour'] = df_suc['reg_date'].map(lambda x: x.hour)
        df_suc['reg_date'] = df_suc['reg_date'].astype(str)
        df_suc['reg_month'] = df_suc['reg_date'].map(lambda x: x[0:7])
        df_suc['reg_day'] = df_suc['reg_date'].map(lambda x: int(x[8:10]))
        df_suc['dayofweek'] = df_suc['datetime'].apply(lambda x: x.dayofweek)

        # Attach every user's first and last order time to each of their rows.
        first_seen = df_suc.groupby(['owner_id'])['reg_date'].min().reset_index()
        first_seen = first_seen.rename(columns={'reg_date': 'first_time'})
        df_suc = df_suc.merge(first_seen, on=['owner_id'], how='left')
        df_suc = df_suc[~pd.isnull(df_suc.first_time)]
        df_suc['first_time'] = df_suc['first_time'].astype(str)

        last_seen = df_suc.groupby('owner_id')['reg_date'].max().reset_index()
        last_seen = last_seen.rename(columns={'reg_date': 'last_time'})
        df_suc = df_suc.merge(last_seen, on=['owner_id'], how='left')
        df_suc = df_suc[~pd.isnull(df_suc.last_time)]
        df_suc['last_time'] = df_suc['last_time'].astype(str)

        df_suc['first_month'] = df_suc['first_time'].map(lambda x: x[0:7])

        # day_interval: whole days between a row's order date and the user's
        # first order date (0 on the first day).
        def _to_date(text):
            return datetime.strptime(text, '%Y-%m-%d %H:%M:%S').date()

        first_dates = df_suc['first_time'].map(_to_date)
        order_dates = df_suc['reg_date'].map(_to_date)
        df_suc['day_interval'] = (order_dates - first_dates).map(
            lambda delta: delta.days)

        params = {'D_new_cont_month_fm': {}}

        # NOTE(review): the result is unused; the call is kept in case the
        # helper has side effects - confirm and drop if it is pure.
        used_month_count(df_suc)

        # Retention of the users of the earliest month ("YYYY-MM") over the
        # following months.
        fm = df_suc.reg_date.min()[0:7]
        D_new_cont_month = []
        D_new_cont_month_people = []
        total_people = 0
        for offset in range(1, month_diff(fm)):
            D_new_cont_month.append(offset)
            people, total_people = used_next_month_count(
                df_suc, fm, offset, has_used_in_all_next_months)
            D_new_cont_month_people.append(people)
        # NOTE(review): the +1 in the denominator presumably guards against
        # division by zero; it slightly lowers every ratio - confirm intent.
        D_new_cont_month_ratio = list(
            np.array(D_new_cont_month_people) / (total_people + 1))
        params['D_new_cont_month_fm'][fm] = dict(
            zip(D_new_cont_month, D_new_cont_month_ratio))

        # Number of users by how many distinct months they were active in.
        months_per_user = df_suc.groupby(
            ['owner_id']).reg_month.unique().reset_index()
        months_per_user['unique_month'] = months_per_user['reg_month'].apply(
            len)
        month_counts = months_per_user.groupby(
            ['unique_month']).owner_id.count()
        # Fill month counts nobody has with 0 so that position i always
        # means "i + 1 months"; without this a gap shifts every later count
        # onto the wrong label.
        month_counts = month_counts.reindex(
            range(1, int(month_counts.index.max()) + 1), fill_value=0)
        D_total_month_people = month_counts.tolist()
        D_total_month = list(range(1,
                                   month_diff(fm)))[:len(D_total_month_people)]
        D_total_month_ratio = list(
            np.array(D_total_month_people) / sum(D_total_month_people))

        user_total = len(df_suc.owner_id.unique())

        # user_used_days_qr: per user, the day offsets on which they ordered
        # (the first use is day 0).
        # user_continue_day_count_qr: per user, the longest run of
        # consecutive usage days.
        user_used_days_qr, user_continue_day_count_qr = longest_used_days(
            df_suc[['owner_id', 'day_interval']])

        # Users per longest-streak bucket.
        continue_day_count_df_qr = self.__tally_frame(
            [user_continue_day_count_qr[k]
             for k in user_continue_day_count_qr.keys()],
            'continue_day_count')
        D_cont_day_people = [
            day_ratio(continue_day_count_df_qr, low, high,
                      'continue_day_count')
            for low, high in self._CONT_DAY_BUCKETS
        ]
        D_cont_day_ratio = list(np.array(D_cont_day_people) / user_total)

        used_days = [user_used_days_qr[k] for k in user_used_days_qr.keys()]

        # Users per span bucket, span = last usage day offset + 1.
        day_count_df_qr = self.__tally_frame(
            [days[-1] + 1 for days in used_days], 'day_count')
        D_total_day_people = [
            day_ratio(day_count_df_qr, low, high, 'day_count')
            for low, high in self._TOTAL_DAY_BUCKETS
        ]
        D_total_day_ratio = list(np.array(D_total_day_people) / user_total)

        # Usage frequency = number of recorded usage days / span. Key 100
        # collects ratios above 0.9 for users with more than one recorded
        # day, key 101 collects the remaining users (a single recorded day).
        user_ratio_num_qr = {key: 0 for _, key in self._RATIO_BOUNDS}
        user_ratio_num_qr[100] = 0
        user_ratio_num_qr[101] = 0
        for days in used_days:
            n = len(days)
            ratio = n / (days[-1] + 1)
            for bound, key in self._RATIO_BOUNDS:
                if ratio <= bound:
                    break
            else:
                key = 100 if n > 1 else 101
            user_ratio_num_qr[key] += 1
        # Single-use users come first to match the 'D_user_pre' labels.
        bucket_counts = list(user_ratio_num_qr.values())
        D_user_ratio_people = [bucket_counts[-1]] + bucket_counts[:-1]
        D_user_ratio_ratio = list(np.array(D_user_ratio_people) / user_total)

        # Classify every user into a usage model.
        user_used_month, user_used_mcount = user_months_and_count(df_suc)
        user_mw_dict = user_month_week_count(df_suc)
        user_ft_lt_dict = user_ft_lt(df_suc)
        user_ww_dict = user_work_week_num(df_suc)
        u_ids = []
        u_mds = []
        for user_id in df_suc['owner_id'].unique():
            u_ids.append(user_id)
            u_mds.append(
                user_model(df_suc, user_id, user_used_month, user_used_mcount,
                           user_mw_dict, user_ft_lt_dict, user_ww_dict))
        user_model_df = pd.DataFrame({'owner_id': u_ids, "model": u_mds})
        D_model_people, D_model_ratio = user_model_count(user_model_df)

        self.__params['D_new_cont_month'] = D_new_cont_month
        self.__params['D_new_cont_month_people'] = D_new_cont_month_people
        self.__params['D_new_cont_month_ratio'] = D_new_cont_month_ratio
        self.__params['D_total_month'] = D_total_month
        self.__params['D_total_month_people'] = D_total_month_people
        self.__params['D_total_month_ratio'] = D_total_month_ratio
        self.__params['D_cont_day'] = [
            '连续使用1天', '连续使用2~3天', '连续使用4~7天', '连续使用8~20天', '连续使用21天以上'
        ]
        self.__params['D_cont_day_people'] = D_cont_day_people
        self.__params['D_cont_day_ratio'] = D_cont_day_ratio
        self.__params['D_total_day'] = [
            '总使用1天', '总使用2~5天', '总使用6~20天', '总使用21~30天', '总使用31~60天',
            '总使用61~90天', '总使用91~120天', '总使用121天以上'
        ]
        self.__params['D_total_day_people'] = D_total_day_people
        self.__params['D_total_day_ratio'] = D_total_day_ratio
        self.__params['D_user_pre'] = [
            '只使用1次', '0~0.1', '0.1~0.2', '0.2~0.3', '0.3~0.4', '0.4~0.5',
            '0.5~0.6', '0.6~0.7', '0.7~0.8', '0.8~0.9', '0.9~1'
        ]
        self.__params['D_user_pre_people'] = D_user_ratio_people
        self.__params['D_user_pre_ratio'] = D_user_ratio_ratio
        self.__params['D_model'] = [
            '稳定高频周末型', '稳定高频工作型', '稳定高频常用型', '稳定低频周末型', '稳定低频工作型',
            '稳定低频常用型', '一段时间之后不用型', '突发周末型', '突发工作型', '突发常用型',
            '本月新用户周末型', '本月新用户工作型', '本月新用户常用型'
        ]
        self.__params['D_model_people'] = D_model_people
        self.__params['D_model_ratio'] = D_model_ratio

        # For every label list (a key without 'people'/'ratio' in its name)
        # export {label: ratio}. Keys without a matching '<key>_ratio' entry
        # are skipped: they are values merged in by an earlier maketext()
        # call, not label lists.
        for key, labels in self.__params.items():
            if 'people' in key or 'ratio' in key:
                continue
            ratio_key = key + '_ratio'
            if ratio_key not in self.__params:
                continue
            ratios = self.__params[ratio_key]
            params[key] = {}
            for position, label in enumerate(labels):
                params[key][label] = ratios[position]
        params['D_model_people'] = dict(
            zip(self.__params['D_model'], self.__params['D_model_people']))

        self.__data = params
        global_params['day_num'] = 30
        global_params['month_num'] = 3

    def maketext(self, global_params=None):
        """Render the template; local parameters win over ``global_params``."""
        if isinstance(global_params, dict):
            for name, value in global_params.items():
                self.__params.setdefault(name, value)
        # Template parameters that are still missing render as empty strings.
        for name in self.__templete.get_params():
            self.__params.setdefault(name, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False, cls=MyEncoder)
class Module(object):
    """Report module: headline order and ticket counts.

    ``run`` computes the counts, ``maketext`` renders the template and
    ``makedata`` serialises the same counts as JSON.
    """

    def __init__(self):
        from common.TempletLoader import TempletLoader
        self.__templete = TempletLoader('templets/module_dataanalize.txt')
        self.name = "module_dataanalize"
        self.__params = {}
        self.__data = {}

    def run(self, df, global_params=None):
        """Count orders and tickets per ``order_status``.

        Args:
            df: order DataFrame; reads the columns ``order_status`` and
                ``ticket_num``.
            global_params: accepted for interface parity with the other
                modules; this module neither reads nor writes it.
        """
        # Orders and ticket totals keyed by order_status code.
        orders_by_status = df.order_status.value_counts().to_dict()
        tickets_by_status = df.groupby(
            'order_status').ticket_num.sum().to_dict()

        # Status 5 marks a successful (ticket issued) order; the variable
        # names in the original code suggest 1 = not paid, 2 = paid but not
        # used and 3/6/7 = cancelled - TODO confirm against the order schema.
        summary = {
            'order_num': df.shape[0],
            'order_use_num': orders_by_status.get(5, 0),
            'order_notpay': orders_by_status.get(1, 0),
            'order_pay_not_use_num': orders_by_status.get(2, 0),
            'order_fail_num': sum(
                orders_by_status.get(code, 0) for code in (3, 6, 7)),
            # Tickets over all orders vs. tickets of successful orders.
            'ticket_num': sum(tickets_by_status.values()),
            'ticket_use_num': tickets_by_status.get(5, 0),
        }

        # The same figures feed both the template and the JSON payload.
        self.__params.update(summary)
        self.__data.update(summary)

    def maketext(self, global_params=None):
        """Render the template; local parameters win over ``global_params``."""
        if isinstance(global_params, dict):
            for name, value in global_params.items():
                self.__params.setdefault(name, value)
        # Template parameters that are still missing render as empty strings.
        for name in self.__templete.get_params():
            self.__params.setdefault(name, '')
        return self.__templete.format_templet(self.__params)

    def makedata(self):
        """Return the data collected by ``run`` as a JSON string."""
        import json
        from common.MyEncoder import MyEncoder
        return json.dumps(dict(self.__data), ensure_ascii=False, cls=MyEncoder)