# Record, for every targeted user found in collection_temp, the earliest
# (start_date) and latest (end_date) date_str on which the user was seen.
# Both bounds are kept as a true minimum / maximum, so the result does not
# depend on the order in which dates are scanned.
# NOTE(review): relies on date_str values comparing correctly as strings
# (e.g. zero-padded YYYY-MM-DD) -- confirm against the caller.
for user_id in collection_temp:
        if user_id not in target_user_set:
            continue
        if user_id not in start_date or date_str<start_date[user_id]:
            start_date[user_id]=date_str
        if user_id not in end_date or end_date[user_id]<date_str:
            end_date[user_id]=date_str

#calculate life length
#(day difference between a user's last and first date, plus one)

user_life_length={}

for user_id in start_date:
    user_life_length[user_id]=helper_regex.get_day_diff_from_date_str(end_date[user_id],start_date[user_id])+1

#calculate dispersion
#(life length -> number of users having exactly that life length)

user_life_length_dispersion={}

for user_id,length in user_life_length.items():
    user_life_length_dispersion[length]=user_life_length_dispersion.get(length,0)+1

#calculate churn rate
#(keys of churn_rate_result are the day levels, kept in ascending order)

churn_rate_result={4:0,7:0,14:0,21:0,28:0,35:0,42:0,49:0,56:0}
day_levels=sorted(churn_rate_result)
# Example #2
# 0
def calculate_count_distinct_named_collection(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data',allow_collection_empty=False):
    """Return (unique, total, average) for a named collection over a trailing window of days.

    The window ends on `date` and covers `date_unit` days.  The daily named
    collections inside the window are unioned and the daily base values
    (stored under key+'_base') are summed.

    Args:
        date_unit: number of days (1, 2, 3, ...) or 'weekly' / 'monthly'.
            'weekly' only produces a result when `date` is a Sunday (7-day
            window).  'monthly' only produces a result when `date` is the
            last day of its month (window = the whole month).  Values below
            1 are treated as 1.
        oem_name, category, sub_key, table_name: identify the stored rows.
        key: collection key; any '_collection_id' substring is removed first.
        date: last day of the window; a falsy value yields (0, 0, 0).
        allow_collection_empty: when False, a day without a collection id
            aborts the calculation and (0, 0, 0) is returned; when True the
            day is reported on stdout and the calculation continues.

    Returns:
        Tuple (unique, total, average): number of distinct members in the
        window, sum of the daily base values, and total/unique as a float
        (0 when unique is 0).
    """

    unique=0
    total=0
    average=0

    if not date:
        return unique,total,average

    if date_unit=='weekly':
        # weekday 7 is Sunday; any other day produces no weekly figure
        if helper_regex.get_weekday_from_date_str(date)!=7:
            return unique,total,average
        date_unit=7

    elif date_unit=='monthly':
        # last day of the month <=> the following day is the 1st
        if helper_regex.extract(helper_regex.date_add(date,1),r'\d+\-\d+\-(\d+)')!='01':
            return unique,total,average

        first_date=helper_regex.extract(date,r'(\d+\-\d+\-)\d+')+'01'
        date_unit=helper_regex.get_day_diff_from_date_str(date,first_date)+1

    if date_unit<1:
        date_unit=1

    key=key.replace('_collection_id','')

    # date -> collection id of the daily collection
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name)
    collection_id_dict=helper_mysql.fetch_dict(sql)

    # date -> daily base value stored next to the collection
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_base',sub_key,table_name)
    collection_base_dict=helper_mysql.fetch_dict(sql)

    col_1=set()
    base_total=0
    for i in range(0,date_unit):
        date_temp=helper_regex.date_add(date,-i)

        col_id_temp=collection_id_dict.get(date_temp,0)

        if col_id_temp==0: #force return null when data not complete
            # Checked before fetching so an incomplete window does not cost
            # an extra collection read.
            context='%s %s %s %s %s %s %s' % (date_temp,table_name,oem_name,category,key,sub_key,date_temp)
            if not allow_collection_empty:
                print('%s collection empty error! exit.' % context)
                return unique,total,average
            print('%s collection empty error! passed.' % context)

        col_temp=helper_collection.get_named_collection(table_name=table_name,oem_name=oem_name,category=category, \
                                                        key=key,sub_key=sub_key,date=date_temp)
        col_1 |= col_temp

        if date_temp in collection_base_dict:
            base_total+=int(collection_base_dict[date_temp])

    unique=len(col_1)
    total=base_total
    average=base_total*1.0/unique if unique>0 else 0

    return unique,total,average
    def _do_calculation(self):
        """Build the new-user evolution matrix, store it and publish a report view.

        Steps performed, in order:

        1. Map every user returned by self._get_daily_created_user_dict for
           each day in [self.begin_date, self.end_date] to that day.
        2. Record, per new user, the days on which the user appears in
           self._get_daily_active_user_set; the result is kept on
           self.temp_new_user_active_history_dict.
        3. Collect the per-day values from self._get_daily_user_action,
           restricted to the new users.
        4. Bucket users by their number of recorded active days, in buckets
           of self.observation_day_step days.
        5. For every (active-day period, user group) cell accumulate the
           total action value and the number of online days.
        6. Write four rows per cell with helper_mysql.put_raw_data and
           (re)create the report view through helper_view.

        Reads self.begin_date, self.end_date, self.observation_day_length,
        self.observation_day_step, self.analysis_factor_name,
        self.table_name, self.oem_name and self.category.

        Intermediate structures are printed to stdout.  Returns None.

        NOTE(review): dates are compared with > / <= and used as dict keys;
        presumably they are sortable date strings -- confirm.
        """
        
        #generate new user set

        # user_id -> day on which the user was reported as created; a user
        # reported on several days keeps the latest one (later days overwrite)
        temp_new_user_dict={}

        temp_date=self.begin_date
        temp_end_date=self.end_date

        while True:
            if temp_date>temp_end_date:
                break
            temp_daily_new_user_set=self._get_daily_created_user_dict(temp_date)

            for user_id in temp_daily_new_user_set:
                temp_new_user_dict[user_id]=temp_date
            
            temp_date=helper_regex.date_add(temp_date,1)
            
        print len(temp_new_user_dict)
        print temp_new_user_dict
        #exit()

        #generate user active history

        # user_id -> list of active days, appended in scan order (oldest first)
        self.temp_new_user_active_history_dict={}

        # the scan runs observation_day_length-1 days past end_date
        temp_date=self.begin_date
        temp_end_date=helper_regex.date_add(self.end_date,self.observation_day_length-1)

        while True:
            if temp_date>temp_end_date:
                break
            temp_active_user_set=self._get_daily_active_user_set(temp_date)
            for user_id in temp_active_user_set:
                if temp_new_user_dict.has_key(user_id):
                    # the entry is created even when the day is not recorded below
                    self.temp_new_user_active_history_dict.setdefault(user_id,[])
                    # only days up to creation day + observation_day_length-1 are recorded
                    if temp_date<=helper_regex.date_add(temp_new_user_dict[user_id],self.observation_day_length-1):
                        self.temp_new_user_active_history_dict[user_id].append(temp_date)
            
            temp_date=helper_regex.date_add(temp_date,1)
        
        print len(self.temp_new_user_active_history_dict)
        print self.temp_new_user_active_history_dict
        #exit()

        #generate user action history

        # day -> {user_id: value} as returned by _get_daily_user_action,
        # filtered down to the new users
        temp_new_user_action_history_dict={}

        temp_date=self.begin_date
        temp_end_date=helper_regex.date_add(self.end_date,self.observation_day_length-1)

        while True:
            if temp_date>temp_end_date:
                break
            temp_user_action_dict=self._get_daily_user_action(temp_date)
            temp_user_action_dict=dict((k,v) for k,v in temp_user_action_dict.iteritems() if k in temp_new_user_dict)
            temp_new_user_action_history_dict[temp_date]=temp_user_action_dict
            
            temp_date=helper_regex.date_add(temp_date,1)
        
        print len(temp_new_user_action_history_dict)
        print temp_new_user_action_history_dict
        #exit()

        #generate user group

        # group key g (1, 1+step, 1+2*step, ...) -> set of users whose number
        # of recorded active days lies in [g, g+observation_day_step)
        temp_user_group={}
        for user_group_by_max_active_time in range(1,self.observation_day_length+1,self.observation_day_step):
            temp_user_group[user_group_by_max_active_time]=set([user_id for user_id,history \
                                                            in self.temp_new_user_active_history_dict.iteritems() \
                                                            if len(history)>=user_group_by_max_active_time \
                                                            and len(history)<user_group_by_max_active_time \
                                                            +self.observation_day_step])

        print len(temp_user_group)
        print temp_user_group
        #exit()
        
        #generate evolution matrix

        # both matrices are indexed [active_time_period][user_group_by_max_active_time]
        temp_matrix_of_user_action={}                              #   Dimension-1:time    Dimension-2:user groups
        temp_matrix_of_user_online_day={}

        for active_time_period in range(1,self.observation_day_length+1,self.observation_day_step):
            for user_group_by_max_active_time in range(1,self.observation_day_length+1,self.observation_day_step):
                
                temp_matrix_of_user_action.setdefault(active_time_period,{})
                temp_matrix_of_user_action[active_time_period].setdefault(user_group_by_max_active_time,0)
                
                temp_matrix_of_user_online_day.setdefault(active_time_period,{})
                temp_matrix_of_user_online_day[active_time_period].setdefault(user_group_by_max_active_time,0)

                total_action=0
                total_online_day=0

                temp_user_set=temp_user_group[user_group_by_max_active_time]

                for user_id in temp_user_set:
                    # the string literal below is an earlier, disabled implementation
                    """
                    dates=self.temp_new_user_active_history_dict[user_id][active_time_period-1:min(active_time_period+self.observation_day_step-1,len(self.temp_new_user_active_history_dict[user_id]))]
                    total_online_day+=len(dates)
                    
                    for d in dates:
                        if temp_new_user_action_history_dict[d].has_key(user_id):
                            total_action+=temp_new_user_action_history_dict[d][user_id]
                    """

                    # users with fewer active days than this period starts at contribute nothing
                    if active_time_period>len(self.temp_new_user_active_history_dict[user_id]):
                        continue

                    dates=self.temp_new_user_active_history_dict[user_id]
                    
                    # window start: the creation day for the first period, otherwise
                    # the day after the user's (active_time_period-1)-th active day
                    temp_begin_date=temp_new_user_dict[user_id] \
                                    if active_time_period==1 \
                                    else helper_regex.date_add(dates[active_time_period-1-1],1) # include those actions happened when user is offline

                    # window end: the (active_time_period+step-1)-th active day, or the
                    # last active day when the user has fewer active days than that
                    temp_end_date=dates[-1] \
                                  if active_time_period+self.observation_day_step-1>len(self.temp_new_user_active_history_dict[user_id]) \
                                  else dates[active_time_period+self.observation_day_step-1-1]
                    
                    # NOTE(review): date_iterator presumably yields both end points -- confirm
                    for temp_d in helper_regex.date_iterator(temp_begin_date,temp_end_date):
                        if temp_new_user_action_history_dict[temp_d].has_key(user_id):
                            total_action+=temp_new_user_action_history_dict[temp_d][user_id]                        

                    # number of the user's active days that fall into this period
                    total_online_day+=min(active_time_period+self.observation_day_step-1,len(self.temp_new_user_active_history_dict[user_id])) \
                                      -(active_time_period-1)

                temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time]=total_action    
                temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]=total_online_day    

        print temp_matrix_of_user_action
        print temp_matrix_of_user_online_day
            
        #export result

        analysis_factor_name=self.analysis_factor_name      #'Mutual Friend Relation'
        table_name=self.table_name                          #'raw_data_test'
        oem_name=self.oem_name                              #'Shabik_360'
        category=self.category                              #'evolution_analysis'

        # the key prefix encodes the observation length (ol), the number of days
        # in [begin_date, end_date] (n) and the step (s)
        key_prefix=analysis_factor_name.lower().replace(' ','_')+'_ol%s,n%s,s%s_evolution_' % (self.observation_day_length,1+helper_regex.get_day_diff_from_date_str(self.end_date,self.begin_date),self.observation_day_step)
        # every exported row is stored under begin_date
        date=self.begin_date
        view_name='Report %s Evolution Analysis %s (%s Days Step, %s Days)' % (oem_name,analysis_factor_name,self.observation_day_step, self.observation_day_length)
        view_description=r'''
        
        Date Range of Observed User: %s to %s
        Total Observed New User: %s
        Observing Users' First %s Days

        ''' % (self.begin_date,self.end_date,len(self.temp_new_user_active_history_dict),self.observation_day_length)

        # four rows per matrix cell, all sharing the sub_key 'gXX_aYY'
        # (group key and active-time period, zero-padded to two digits)
        for active_time_period,v in temp_matrix_of_user_action.iteritems():
            for user_group_by_max_active_time,total_action in v.iteritems():
                
                # 'unique_base': total action value of the cell
                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique_base', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time], \
                                          date=date,table_name=table_name)

                # 'unique': number of users in the group
                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=len(temp_user_group[user_group_by_max_active_time]), \
                                          date=date,table_name=table_name)

                # 'unique_base_adjusted': observation_day_step * total action / online days
                # (0 when the cell has no online days)
                adjusted_base=0
                if temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]>0:
                    adjusted_base=1.0*self.observation_day_step \
                                        *temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time] \
                                        /temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]
                                        
                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique_base_adjusted', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=adjusted_base,date=date,table_name=table_name)

                # 'total': number of users that have an active-history entry
                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'total', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=len(self.temp_new_user_active_history_dict), \
                                          date=date,table_name=table_name)
        
        


        # generate view sql
        
        # The template is %-formatted twice: this first pass fills in the names,
        # turning '%%(column_sql)s' into '%(column_sql)s' and '%%%%' into '%%';
        # the second pass (further down) inserts the per-period columns and
        # leaves a single literal '%'.
        sql_template=r'''

        SELECT

        concat(
        'Online for '
        ,replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g',''),
        'd-'
        ,lpad(replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g','')+%(observation_day_step)s-1,2,'0')
        ,'d') as `Group Name`

        ,max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0)) as `Group Size`

        ,concat(format(100.0
        *max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0)) 
        /max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal',`value`,0)) 
        ,2),'%%%%') as `Group Proportion`

        %%(column_sql)s

        FROM `%(table_name)s` 

        WHERE (

        `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique'
        or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base'
        or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted'
        or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal'

        )and date='%(date)s'

        GROUP BY `Group Name`
        ORDER BY `Group Name` DESC

        ''' % {
            'observation_day_step':self.observation_day_step,
            'oem_name':oem_name,
            'category':category,
            'key_prefix':key_prefix,
            'table_name':table_name,
            'date':date,
        }


        # one output column per active-time period: the adjusted value when the
        # period's 'unique_base' value is positive, '-' otherwise
        sql_column_template=r'''
        
        ,case
        when max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base' and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0))>0 then
            max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted' and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0)) 
        else '-'
        end as `Avg %(analysis_factor_name)s Created in [%(active_time_period)sd,%(active_time_period_end)sd]`

        '''

        sql_columns=[]

        for active_time_period in range(1,self.observation_day_length+1,self.observation_day_step):
            temp_sql_column=sql_column_template % {
                'observation_day_step':self.observation_day_step,
                'oem_name':oem_name,
                'category':category,
                'key_prefix':key_prefix,
                'table_name':table_name,
                'date':date,
                'active_time_period':str(active_time_period).zfill(2),
                'active_time_period_end':str(active_time_period+self.observation_day_step-1).zfill(2),
                'analysis_factor_name':analysis_factor_name,
            }
            sql_columns.append(temp_sql_column)
            
        
        # second formatting pass: insert the generated columns
        view_sql=sql_template % {
            'column_sql':'\n'.join(sql_columns)
        }

        print view_sql

        import helper_view
        helper_view.replace_view(view_name=view_name,view_sql=view_sql,view_description=view_description,charting_javascript=r'''
        
        add_highcharts_basic_line_chart({
            'tab_name':'Trend Comparison',
            'column_names_pattern':/Online /ig,
            'marginRight':300,
            'reverse_key_column':false,
            'reverse_table':true,
            'exclude_rows':['Group Size','Group Proportion'],
            'reverse_column':true
        });

        ''')

        # NOTE(review): '5' and '17' are presumably ids of the grantees -- confirm
        helper_view.grant_view(view_name,'5')
        helper_view.grant_view(view_name,'17')

        pass