# NOTE(review): fragment of a larger routine — `collection_temp`, `target_user_set`,
# `start_date`, `end_date` and `date_str` are defined outside this span; presumably
# this loop body runs once per date with `date_str` bound to that date — confirm
# against the enclosing code.

# Track first/last seen date per user (only users in the target set).
for user_id in collection_temp:
    if user_id not in target_user_set:
        continue
    # first sighting: record the start date once
    if not start_date.has_key(user_id):
        start_date[user_id]=date_str
    # last sighting: keep the max date seen so far
    if not end_date.has_key(user_id):
        end_date[user_id]=date_str
    elif end_date[user_id]<date_str:
        end_date[user_id]=date_str

#calculate life length
# Life length = inclusive day span between first and last sighting.
user_life_length={}
for user_id in start_date.keys():
    user_life_length[user_id]=helper_regex.get_day_diff_from_date_str(end_date[user_id],start_date[user_id])+1

#calculate dispersion
# Histogram: life length (days) -> number of users with that life length.
user_life_length_dispersion={}
for user_id,length in user_life_length.iteritems():
    if not user_life_length_dispersion.has_key(length):
        user_life_length_dispersion[length]=0
    user_life_length_dispersion[length]+=1

#calculate churn rate
# Buckets are day thresholds; values initialised to 0 and presumably filled
# below this visible span — the computation continues past this fragment.
churn_rate_result={4:0,7:0,14:0,21:0,28:0,35:0,42:0,49:0,56:0}
day_levels=churn_rate_result.keys()
day_levels.sort()
def calculate_count_distinct_named_collection(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data',allow_collection_empty=False): #date_unit accepts 1,2,3,...,'weekly','monthly' #for weekly, it produces result only when date is Sunday, else 0 #for monthly, it produces result only when date is the last day of a week, else 0 #for all cases, it doesn't produce value when required collections are not all ready unique=0 total=0 average=0 if not date: return unique,total,average if date_unit=='weekly': if helper_regex.get_weekday_from_date_str(date)!=7: return unique,total,average date_unit=7 elif date_unit=='monthly': if helper_regex.extract(helper_regex.date_add(date,1),r'\d+\-\d+\-(\d+)')!='01': return unique,total,average first_date=helper_regex.extract(date,r'(\d+\-\d+\-)\d+')+'01' date_unit=helper_regex.get_day_diff_from_date_str(date,first_date)+1 if date_unit<1: date_unit=1 key=key.replace('_collection_id','') sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name) collection_id_dict=helper_mysql.fetch_dict(sql) key_temp=collection_id_dict.keys() key_temp.sort(reverse=True) sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_base',sub_key,table_name) #print sql collection_base_dict=helper_mysql.fetch_dict(sql) #print collection_base_dict """ print 'existing collection list:' for i in key_temp[0:65]: print i+': '+str(collection_id_dict[i]) """ col_1=set([]) base_total=0 for i in range(0,date_unit): date_temp=helper_regex.date_add(date,-i) col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0 #col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp) col_temp=helper_collection.get_named_collection(table_name=table_name,oem_name=oem_name,category=category, \ key=key,sub_key=sub_key,date=date_temp) col_1 |= col_temp base_total+=int(collection_base_dict[date_temp]) if collection_base_dict.has_key(date_temp) else 0 if col_id_temp==0: #force return null when data not complete 
if allow_collection_empty: print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! passed.' else: print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! exit.' return unique,total,average unique=len(col_1) total=base_total average=base_total*1.0/unique if unique>0 else 0 return unique,total,average
def _do_calculation(self):
    """Build the new-user evolution matrices and export them as a report view.

    Pipeline (each pass iterates day by day via helper_regex.date_add):
      1. collect users first seen in [begin_date, end_date]
      2. record each such user's active dates within their first
         observation_day_length days
      3. record each day's per-user action counts, restricted to those users
      4. bucket users by how many days they were active
      5. accumulate action/online-day matrices per (period, group) cell
      6. write raw data rows via helper_mysql and (re)create the MySQL view
    Side effects: sets self.temp_new_user_active_history_dict, writes rows
    through helper_mysql.put_raw_data, replaces/grants a view via helper_view,
    and prints debug output throughout.
    """
    #generate new user set
    # user_id -> the date the user was first created, for users created
    # within [begin_date, end_date]
    temp_new_user_dict={}
    temp_date=self.begin_date
    temp_end_date=self.end_date
    while True:
        if temp_date>temp_end_date:
            break
        temp_daily_new_user_set=self._get_daily_created_user_dict(temp_date)
        for user_id in temp_daily_new_user_set:
            temp_new_user_dict[user_id]=temp_date
        temp_date=helper_regex.date_add(temp_date,1)

    print len(temp_new_user_dict)
    print temp_new_user_dict
    #exit()

    #generate user active history
    # user_id -> ordered list of dates the user was active, limited to the
    # user's first observation_day_length days after creation; the scan runs
    # observation_day_length-1 days past end_date so the last cohort gets a
    # full observation window
    self.temp_new_user_active_history_dict={}
    temp_date=self.begin_date
    temp_end_date=helper_regex.date_add(self.end_date,self.observation_day_length-1)
    while True:
        if temp_date>temp_end_date:
            break
        temp_active_user_set=self._get_daily_active_user_set(temp_date)
        for user_id in temp_active_user_set:
            if temp_new_user_dict.has_key(user_id):
                self.temp_new_user_active_history_dict.setdefault(user_id,[])
                # only count activity inside the user's observation window
                if temp_date<=helper_regex.date_add(temp_new_user_dict[user_id],self.observation_day_length-1):
                    self.temp_new_user_active_history_dict[user_id].append(temp_date)
        temp_date=helper_regex.date_add(temp_date,1)

    print len(self.temp_new_user_active_history_dict)
    print self.temp_new_user_active_history_dict
    #exit()

    #generate user action history
    # date -> {user_id: action count}, restricted to the observed new users
    temp_new_user_action_history_dict={}
    temp_date=self.begin_date
    temp_end_date=helper_regex.date_add(self.end_date,self.observation_day_length-1)
    while True:
        if temp_date>temp_end_date:
            break
        temp_user_action_dict=self._get_daily_user_action(temp_date)
        temp_user_action_dict=dict((k,v) for k,v in temp_user_action_dict.iteritems() if k in temp_new_user_dict)
        temp_new_user_action_history_dict[temp_date]=temp_user_action_dict
        temp_date=helper_regex.date_add(temp_date,1)

    print len(temp_new_user_action_history_dict)
    print temp_new_user_action_history_dict
    #exit()

    #generate user group
    # group start g -> set of users active between g and g+step-1 days total
    temp_user_group={}
    for user_group_by_max_active_time in range(1,self.observation_day_length+1,self.observation_day_step):
        temp_user_group[user_group_by_max_active_time]=set([user_id for user_id,history \
            in self.temp_new_user_active_history_dict.iteritems() \
            if len(history)>=user_group_by_max_active_time \
            and len(history)<user_group_by_max_active_time \
            +self.observation_day_step])

    print len(temp_user_group)
    print temp_user_group
    #exit()

    #generate evolution matrix
    temp_matrix_of_user_action={} # Dimension-1:time Dimension-2:user groups
    temp_matrix_of_user_online_day={}

    for active_time_period in range(1,self.observation_day_length+1,self.observation_day_step):
        for user_group_by_max_active_time in range(1,self.observation_day_length+1,self.observation_day_step):
            temp_matrix_of_user_action.setdefault(active_time_period,{})
            temp_matrix_of_user_action[active_time_period].setdefault(user_group_by_max_active_time,0)
            temp_matrix_of_user_online_day.setdefault(active_time_period,{})
            temp_matrix_of_user_online_day[active_time_period].setdefault(user_group_by_max_active_time,0)

            total_action=0
            total_online_day=0
            temp_user_set=temp_user_group[user_group_by_max_active_time]

            for user_id in temp_user_set:
                # earlier (disabled) variant: counted actions on active days only
                """
                dates=self.temp_new_user_active_history_dict[user_id][active_time_period-1:min(active_time_period+self.observation_day_step-1,len(self.temp_new_user_active_history_dict[user_id]))]
                total_online_day+=len(dates)
                for d in dates:
                    if temp_new_user_action_history_dict[d].has_key(user_id):
                        total_action+=temp_new_user_action_history_dict[d][user_id]
                """
                # skip users whose active history is shorter than this period
                if active_time_period>len(self.temp_new_user_active_history_dict[user_id]):
                    continue

                dates=self.temp_new_user_active_history_dict[user_id]
                # period 1 starts at the creation date; later periods start the
                # day after the previous period's last active day
                temp_begin_date=temp_new_user_dict[user_id] \
                    if active_time_period==1 \
                    else helper_regex.date_add(dates[active_time_period-1-1],1)
                # include those actions happened when user is offline
                temp_end_date=dates[-1] \
                    if active_time_period+self.observation_day_step-1>len(self.temp_new_user_active_history_dict[user_id]) \
                    else dates[active_time_period+self.observation_day_step-1-1]

                for temp_d in helper_regex.date_iterator(temp_begin_date,temp_end_date):
                    if temp_new_user_action_history_dict[temp_d].has_key(user_id):
                        total_action+=temp_new_user_action_history_dict[temp_d][user_id]

                # online days this period, clipped to the user's history length
                total_online_day+=min(active_time_period+self.observation_day_step-1,len(self.temp_new_user_active_history_dict[user_id])) \
                    -(active_time_period-1)

            temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time]=total_action
            temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]=total_online_day

    print temp_matrix_of_user_action
    print temp_matrix_of_user_online_day

    #export result
    analysis_factor_name=self.analysis_factor_name #'Mutual Friend Relation'
    table_name=self.table_name #'raw_data_test'
    oem_name=self.oem_name #'Shabik_360'
    category=self.category #'evolution_analysis'
    # key prefix encodes observation length, cohort size in days, and step
    key_prefix=analysis_factor_name.lower().replace(' ','_')+'_ol%s,n%s,s%s_evolution_' % (self.observation_day_length,1+helper_regex.get_day_diff_from_date_str(self.end_date,self.begin_date),self.observation_day_step)
    date=self.begin_date

    view_name='Report %s Evolution Analysis %s (%s Days Step, %s Days)' % (oem_name,analysis_factor_name,self.observation_day_step, self.observation_day_length)
    view_description=r'''
    Date Range of Observed User: %s to %s
    Total Observed New User: %s
    Observing Users' First %s Days
    ''' % (self.begin_date,self.end_date,len(self.temp_new_user_active_history_dict),self.observation_day_length)

    # one row per (group, period) cell; sub_key encodes both coordinates
    for active_time_period,v in temp_matrix_of_user_action.iteritems():
        for user_group_by_max_active_time,total_action in v.iteritems():

            helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique_base', \
                sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                value=temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time], \
                date=date,table_name=table_name)

            helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique', \
                sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                value=len(temp_user_group[user_group_by_max_active_time]), \
                date=date,table_name=table_name)

            # per-online-day action rate scaled to a full step; 0 when the
            # cell has no online days (avoids division by zero)
            adjusted_base=0
            if temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]>0:
                adjusted_base=1.0*self.observation_day_step \
                    *temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time] \
                    /temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]

            helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique_base_adjusted', \
                sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                value=adjusted_base,date=date,table_name=table_name)

            # NOTE(review): 'total' stores the overall observed-user count in
            # every cell — presumably the denominator for Group Proportion in
            # the view SQL below; confirm this repetition is intended
            helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'total', \
                sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                value=len(self.temp_new_user_active_history_dict), \
                date=date,table_name=table_name)

    # generate view sql
    # %%(column_sql)s survives this substitution pass and is filled in later
    sql_template=r'''
SELECT
concat( 'Online for '
,replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g',''),
'd-'
,lpad(replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g','')+%(observation_day_step)s-1,2,'0')
,'d') as `Group Name`

,max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0)) as `Group Size`

,concat(format(100.0
*max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0))
/max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal',`value`,0))
,2),'%%%%') as `Group Proportion`

%%(column_sql)s

FROM `%(table_name)s`
WHERE (
`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique'
or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base'
or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted'
or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal'
)and date='%(date)s'
GROUP BY `Group Name`
ORDER BY `Group Name` DESC
''' % {
        'observation_day_step':self.observation_day_step,
        'oem_name':oem_name,
        'category':category,
        'key_prefix':key_prefix,
        'table_name':table_name,
        'date':date,
    }

    # one output column per active-time period; shows '-' for empty cells
    sql_column_template=r'''
,case when max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base'
and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0))>0
then max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted'
and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0))
else '-'
end as `Avg %(analysis_factor_name)s Created in [%(active_time_period)sd,%(active_time_period_end)sd]`
'''

    sql_columns=[]
    for active_time_period in range(1,self.observation_day_length+1,self.observation_day_step):
        temp_sql_column=sql_column_template % {
            'observation_day_step':self.observation_day_step,
            'oem_name':oem_name,
            'category':category,
            'key_prefix':key_prefix,
            'table_name':table_name,
            'date':date,
            'active_time_period':str(active_time_period).zfill(2),
            'active_time_period_end':str(active_time_period+self.observation_day_step-1).zfill(2),
            'analysis_factor_name':analysis_factor_name,
        }
        sql_columns.append(temp_sql_column)

    view_sql=sql_template % {
        'column_sql':'\n'.join(sql_columns)
    }

    print view_sql

    import helper_view
    helper_view.replace_view(view_name=view_name,view_sql=view_sql,view_description=view_description,charting_javascript=r'''
    add_highcharts_basic_line_chart({
        'tab_name':'Trend Comparison',
        'column_names_pattern':/Online /ig,
        'marginRight':300,
        'reverse_key_column':false,
        'reverse_table':true,
        'exclude_rows':['Group Size','Group Proportion'],
        'reverse_column':true
    });
    ''')
    # grant the view to user/group ids '5' and '17' — meaning of the ids is
    # defined by helper_view; not visible from here
    helper_view.grant_view(view_name,'5')
    helper_view.grant_view(view_name,'17')

    pass