Example #1
def duplicate_record(item,target_date):

    step=10000
    current_idx=0
    helper_mysql.quick_insert=True
    helper_mysql.print_log=False

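    # item = (table_name, oem_name, category, key, source_db_conn, target_db_conn)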
    sql=r"select * from %s where `oem_name`='%s' and `category`='%s'" % (item[0],item[1],item[2])
    if item[3]:
        sql+=r" and `key`='%s'" % (item[3],)
    sql+=r" and `date`='%s'" % (target_date,)

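    # page through the source rows, step rows per query (at most 1000 pages)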
    for i in range(1000):
        counter=0

        sql_limit=r" limit %s,%s" % (current_idx,step)

        #print sql+sql_limit

        rows=helper_mysql.fetch_rows(sql+sql_limit,db_conn=item[4])
        if not rows:
            print 'end.'
            break
        
        print target_date,current_idx,len(rows),item
        
        #do copy
        
        for row in rows:
            helper_mysql.put_raw_data(
                oem_name=row['oem_name'],
                category=row['category'],
                key=row['key'],
                sub_key=row['sub_key'],
                value=row['value'],
                table_name=item[0],
                date=row['date'],
                created_on=row['created_on'],
                db_conn=item[5])

            counter+=1
            #print counter
            
        current_idx+=step
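
# A hypothetical call, assuming item packs (table_name, oem_name, category, key,
# source db_conn, target db_conn) the way duplicate_record unpacks it:
#duplicate_record(('raw_data','Vodafone','sub','daily_fresh_subscriber_include_unsub','db_src','db_dst'),'2011-11-04')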
Example #2
import _mysql
import helper_mysql


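# one raw_data row is keyed by (oem_name, category, key, sub_key, date)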
db_name='raw_data'
oem_name='Vodafone'
category='sub'
key='daily_fresh_subscriber_include_unsub'
sub_key=''
date='2011-11-04'
value='1024'

helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key,sub_key=sub_key, \
value=value,table_name=db_name,date=date,created_on=None,db_conn=None)

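# stop here; everything below is kept for reference and never runs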
exit()

helper_mysql.db.query(r'''
delete from `%s` where oem_name='%s' and category='%s' and `key`='%s' and `sub_key`='%s' and `date`='%s'
''' % (db_name,oem_name,category,key,sub_key,date))

print 'matched rows:'+str(helper_mysql.get_one_value_int(r'''
select count(*) from `%s` where oem_name='%s' and category='%s' and `key`='%s' and `date`='%s'
''' % (db_name,oem_name,category,key,date)))

print 'first value:'+str(helper_mysql.get_one_value_int(r'''
select `value` from `%s` where oem_name='%s' and category='%s' and `key`='%s' and `date`='%s'
''' % (db_name,oem_name,category,key,date)))

Example #3
        if not source_rows:
            break

        print 'length of source:',len(source_rows)
        print helper_regex.get_time_str_now()

        sql_temp=[]

        for row in source_rows:
            sql_temp.append("('%s','%s','%s','%s','%s')" % \
            (row['id'],row['created_on'],row['element_count'],row['element_string_md5'],helper_mysql.escape_string(row['element_string']),))
            current_start_id=max(current_start_id,int(row['id']))

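        # flush the collected value tuples in batches of target_step rows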
        for i in range(0,100000001,target_step):
            if i>len(sql_temp) or not sql_temp[i:min(i+target_step,len(sql_temp)+1)]:
                break        
            sql='replace into '+table_name+' (id,created_on,element_count,element_string_md5,element_string) values '+(','.join(sql_temp[i:min(i+target_step,len(sql_temp)+1)]))

            print helper_regex.get_time_str_now()+' slice: ',i,', affected:',helper_mysql.execute(sql,db_conn=target_conn)

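        # record the migration checkpoint so a later run can resume past current_start_id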
        helper_mysql.put_raw_data(oem_name='Stat_Portal',category='data_migrate',key='max_transfered_id',sub_key=table_name,value=current_start_id,table_name='raw_data_debug',date='',db_conn=None)
        
        print '===saved max_id:',current_start_id


print helper_regex.get_time_str_now()

exit()

Example #4

def calculate_ndays_unique(key_space,db_name,date_units):

    # calculate n days' unique

    min_date=helper_mysql.get_one_value_string(r'''
    select min(`date`)
    from `%s`
    where `oem_name`='%s'
    and `category`='%s'
    and `key`='%s_collection_id'
    and `sub_key`='%s'
    and `date`>='2011-12-16'
    ''' % (db_name,key_space['oem_name'],key_space['category'],key_space['key'],key_space['sub_key']))


    max_date=helper_mysql.get_one_value_string(r'''
    select max(`date`)
    from `%s`
    where `oem_name`='%s'
    and `category`='%s'
    and `key`='%s_collection_id'
    and `sub_key`='%s'
    and `date`>='2011-12-16'
    ''' % (db_name,key_space['oem_name'],key_space['category'],key_space['key'],key_space['sub_key']))


    if not min_date or not max_date:
        print 'date error.'
        return

    date_temp=min_date

    #print date_temp
    #exit()
    
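    # walk day by day from min_date up to (but not including) max_date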
    while True:

        if date_temp>=max_date:
            break

        for date_unit in date_units:

            unique,total,average=helper_math.calculate_count_distinct(date_unit=date_unit,oem_name=key_space['oem_name'],category=key_space['category'],key=key_space['key'],sub_key=key_space['sub_key'],date=date_temp,table_name=db_name,allow_collection_empty=True)

            print 'distinct collection calc '+date_temp+': date_unit '+str(date_unit)+' unique '+str(unique)+' total '+str(total)+' average '+str(average)
            #exit()

            key_prefix=helper_regex.regex_replace('_unique$','',key_space['key'])

            if unique>0:
                suffix=str(date_unit)
                if isinstance(date_unit, (int, long)):
                    suffix+='_days'

                helper_mysql.put_raw_data(oem_name=key_space['oem_name'],category=key_space['category'],key=key_prefix+'_'+suffix+'_unique',sub_key=key_space['sub_key'],value=unique,date=date_temp,table_name=db_name)
                helper_mysql.put_raw_data(oem_name=key_space['oem_name'],category=key_space['category'],key=key_prefix+'_'+suffix+'_unique_base',sub_key=key_space['sub_key'],value=total,date=date_temp,table_name=db_name)
                helper_mysql.put_raw_data(oem_name=key_space['oem_name'],category=key_space['category'],key=key_prefix+'_'+suffix+'_unique_average',sub_key=key_space['sub_key'],value=average,date=date_temp,table_name=db_name)
        
        date_temp=helper_regex.date_add(date_temp,1)

Example #5
    def _do_calculation(self):
        
        #generate new user set

        temp_new_user_dict={}

        temp_date=self.begin_date
        temp_end_date=self.end_date

        while True:
            if temp_date>temp_end_date:
                break
            temp_daily_new_user_set=self._get_daily_created_user_dict(temp_date)

            for user_id in temp_daily_new_user_set:
                temp_new_user_dict[user_id]=temp_date
            
            temp_date=helper_regex.date_add(temp_date,1)
            
        print len(temp_new_user_dict)
        print temp_new_user_dict
        #exit()

        #generate user active history

        self.temp_new_user_active_history_dict={}

        temp_date=self.begin_date
        temp_end_date=helper_regex.date_add(self.end_date,self.observation_day_length-1)
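        # extend the scan past end_date so the last cohort still gets a full observation window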

        while True:
            if temp_date>temp_end_date:
                break
            temp_active_user_set=self._get_daily_active_user_set(temp_date)
            for user_id in temp_active_user_set:
                if temp_new_user_dict.has_key(user_id):
                    self.temp_new_user_active_history_dict.setdefault(user_id,[])
                    if temp_date<=helper_regex.date_add(temp_new_user_dict[user_id],self.observation_day_length-1):
                        self.temp_new_user_active_history_dict[user_id].append(temp_date)
            
            temp_date=helper_regex.date_add(temp_date,1)
        
        print len(self.temp_new_user_active_history_dict)
        print self.temp_new_user_active_history_dict
        #exit()

        #generate user action history

        temp_new_user_action_history_dict={}

        temp_date=self.begin_date
        temp_end_date=helper_regex.date_add(self.end_date,self.observation_day_length-1)

        while True:
            if temp_date>temp_end_date:
                break
            temp_user_action_dict=self._get_daily_user_action(temp_date)
            temp_user_action_dict=dict((k,v) for k,v in temp_user_action_dict.iteritems() if k in temp_new_user_dict)
            temp_new_user_action_history_dict[temp_date]=temp_user_action_dict
            
            temp_date=helper_regex.date_add(temp_date,1)
        
        print len(temp_new_user_action_history_dict)
        print temp_new_user_action_history_dict
        #exit()

        #generate user group
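        # bucket users by how many days they were active; each bucket spans observation_day_step days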

        temp_user_group={}
        for user_group_by_max_active_time in range(1,self.observation_day_length+1,self.observation_day_step):
            temp_user_group[user_group_by_max_active_time]=set([user_id for user_id,history \
                                                            in self.temp_new_user_active_history_dict.iteritems() \
                                                            if len(history)>=user_group_by_max_active_time \
                                                            and len(history)<user_group_by_max_active_time \
                                                            +self.observation_day_step])

        print len(temp_user_group)
        print temp_user_group
        #exit()
        
        #generate evolution matrix

        temp_matrix_of_user_action={}                              #   Dimension-1:time    Dimension-2:user groups
        temp_matrix_of_user_online_day={}

        for active_time_period in range(1,self.observation_day_length+1,self.observation_day_step):
            for user_group_by_max_active_time in range(1,self.observation_day_length+1,self.observation_day_step):
                
                temp_matrix_of_user_action.setdefault(active_time_period,{})
                temp_matrix_of_user_action[active_time_period].setdefault(user_group_by_max_active_time,0)
                
                temp_matrix_of_user_online_day.setdefault(active_time_period,{})
                temp_matrix_of_user_online_day[active_time_period].setdefault(user_group_by_max_active_time,0)

                total_action=0
                total_online_day=0

                temp_user_set=temp_user_group[user_group_by_max_active_time]

                for user_id in temp_user_set:
                    """
                    dates=self.temp_new_user_active_history_dict[user_id][active_time_period-1:min(active_time_period+self.observation_day_step-1,len(self.temp_new_user_active_history_dict[user_id]))]
                    total_online_day+=len(dates)
                    
                    for d in dates:
                        if temp_new_user_action_history_dict[d].has_key(user_id):
                            total_action+=temp_new_user_action_history_dict[d][user_id]
                    """

                    if active_time_period>len(self.temp_new_user_active_history_dict[user_id]):
                        continue

                    dates=self.temp_new_user_active_history_dict[user_id]
                    
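                    # window: from signup day (period 1) or the day after the previous
                    # period's last active day, through this period's last active day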
                    temp_begin_date=temp_new_user_dict[user_id] \
                                    if active_time_period==1 \
                                    else helper_regex.date_add(dates[active_time_period-1-1],1) # include actions that happened while the user was offline

                    temp_end_date=dates[-1] \
                                  if active_time_period+self.observation_day_step-1>len(self.temp_new_user_active_history_dict[user_id]) \
                                  else dates[active_time_period+self.observation_day_step-1-1]
                    
                    for temp_d in helper_regex.date_iterator(temp_begin_date,temp_end_date):
                        if temp_new_user_action_history_dict[temp_d].has_key(user_id):
                            total_action+=temp_new_user_action_history_dict[temp_d][user_id]                        

                    total_online_day+=min(active_time_period+self.observation_day_step-1,len(self.temp_new_user_active_history_dict[user_id])) \
                                      -(active_time_period-1)

                temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time]=total_action    
                temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]=total_online_day    

        print temp_matrix_of_user_action
        print temp_matrix_of_user_online_day
            
        #export result

        analysis_factor_name=self.analysis_factor_name      #'Mutual Friend Relation'
        table_name=self.table_name                          #'raw_data_test'
        oem_name=self.oem_name                              #'Shabik_360'
        category=self.category                              #'evolution_analysis'

        key_prefix=analysis_factor_name.lower().replace(' ','_')+'_ol%s,n%s,s%s_evolution_' % (self.observation_day_length,1+helper_regex.get_day_diff_from_date_str(self.end_date,self.begin_date),self.observation_day_step)
        date=self.begin_date
        view_name='Report %s Evolution Analysis %s (%s Days Step, %s Days)' % (oem_name,analysis_factor_name,self.observation_day_step, self.observation_day_length)
        view_description=r'''
        
        Date Range of Observed User: %s to %s
        Total Observed New User: %s
        Observing Users' First %s Days

        ''' % (self.begin_date,self.end_date,len(self.temp_new_user_active_history_dict),self.observation_day_length)

        for active_time_period,v in temp_matrix_of_user_action.iteritems():
            for user_group_by_max_active_time,total_action in v.iteritems():
                
                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique_base', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time], \
                                          date=date,table_name=table_name)

                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=len(temp_user_group[user_group_by_max_active_time]), \
                                          date=date,table_name=table_name)

                adjusted_base=0
                if temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]>0:
                    adjusted_base=1.0*self.observation_day_step \
                                        *temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time] \
                                        /temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]
                                        
                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'unique_base_adjusted', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=adjusted_base,date=date,table_name=table_name)

                helper_mysql.put_raw_data(oem_name=oem_name,category=category,key=key_prefix+'total', \
                                          sub_key='g'+str(user_group_by_max_active_time).zfill(2)+'_a'+str(active_time_period).zfill(2), \
                                          value=len(self.temp_new_user_active_history_dict), \
                                          date=date,table_name=table_name)
        
        


        # generate view sql
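        # the template is %-formatted twice: '%%%%' collapses to a literal '%', and
        # '%%(column_sql)s' survives the first pass to be filled in at the second pass below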
        
        sql_template=r'''

        SELECT

        concat(
        'Online for '
        ,replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g',''),
        'd-'
        ,lpad(replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g','')+%(observation_day_step)s-1,2,'0')
        ,'d') as `Group Name`

        ,max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0)) as `Group Size`

        ,concat(format(100.0
        *max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0)) 
        /max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal',`value`,0)) 
        ,2),'%%%%') as `Group Proportion`

        %%(column_sql)s

        FROM `%(table_name)s` 

        WHERE (

        `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique'
        or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base'
        or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted'
        or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal'

        )and date='%(date)s'

        GROUP BY `Group Name`
        ORDER BY `Group Name` DESC

        ''' % {
            'observation_day_step':self.observation_day_step,
            'oem_name':oem_name,
            'category':category,
            'key_prefix':key_prefix,
            'table_name':table_name,
            'date':date,
        }


        sql_column_template=r'''
        
        ,case
        when max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base' and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0))>0 then
            max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted' and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0)) 
        else '-'
        end as `Avg %(analysis_factor_name)s Created in [%(active_time_period)sd,%(active_time_period_end)sd]`

        '''

        sql_columns=[]

        for active_time_period in range(1,self.observation_day_length+1,self.observation_day_step):
            temp_sql_column=sql_column_template % {
                'observation_day_step':self.observation_day_step,
                'oem_name':oem_name,
                'category':category,
                'key_prefix':key_prefix,
                'table_name':table_name,
                'date':date,
                'active_time_period':str(active_time_period).zfill(2),
                'active_time_period_end':str(active_time_period+self.observation_day_step-1).zfill(2),
                'analysis_factor_name':analysis_factor_name,
            }
            sql_columns.append(temp_sql_column)
            
        
        view_sql=sql_template % {
            'column_sql':'\n'.join(sql_columns)
        }

        print view_sql

        import helper_view
        helper_view.replace_view(view_name=view_name,view_sql=view_sql,view_description=view_description,charting_javascript=r'''
        
        add_highcharts_basic_line_chart({
            'tab_name':'Trend Comparison',
            'column_names_pattern':/Online /ig,
            'marginRight':300,
            'reverse_key_column':false,
            'reverse_table':true,
            'exclude_rows':['Group Size','Group Proportion'],
            'reverse_column':true
        });

        ''')

        helper_view.grant_view(view_name,'5')
        helper_view.grant_view(view_name,'17')

        pass

Example #6
def stat_login():
    global date_min,date_max,base_user_sets

    oem_name='All'
    stat_category='daily_active_user_retain'
    db_name='raw_data_login_trend'

    # observation window: the last 30 days, ending yesterday (adjust as needed)
    date_max=helper_regex.date_add(helper_regex.get_date_str_now(),-1)
    date_min=helper_regex.date_add(date_max,-30)

    for i in range(1,10000):

        current_date=helper_regex.date_add(date_min,i)
        print 'current date',current_date
        
        if current_date>date_max:
            break
        
        # new user set from db (overall daily active user)
        new_user_set=gumi_helper_user.get_user_ids_created_by_date(current_date)
        # daily active user SG
        active_user_sg = helper_mysql.get_raw_collection_from_key(oem_name='Gumi_puzzle', \
                        category='user',key='live_log_by_country_daily_uid_unique_collection_id',sub_key='SG', \
                        date=current_date, \
                        table_name='raw_data',db_conn=None)
        # daily active user US
        active_user_us = helper_mysql.get_raw_collection_from_key(oem_name='Gumi_puzzle', \
                        category='user',key='live_log_by_country_daily_uid_unique_collection_id',sub_key='US', \
                        date=current_date, \
                        table_name='raw_data',db_conn=None)
        # daily active user PL
        active_user_pl = helper_mysql.get_raw_collection_from_key(oem_name='Gumi_puzzle', \
                        category='user',key='live_log_by_country_daily_uid_unique_collection_id',sub_key='PL', \
                        date=current_date, \
                        table_name='raw_data',db_conn=None)
        # daily active user, unknown IP (ZZ)
        active_user_zz = helper_mysql.get_raw_collection_from_key(oem_name='Gumi_puzzle', \
                        category='user',key='live_log_by_country_daily_uid_unique_collection_id',sub_key='ZZ', \
                        date=current_date, \
                        table_name='raw_data',db_conn=None)

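        # base cohorts: the day's new users, overall and intersected with each country's active set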
        base_user_sets={
            'pt-new-user-':new_user_set,
            'pt-new-user-SG':new_user_set & active_user_sg,
            'pt-new-user-US':new_user_set & active_user_us,
            'pt-new-user-PL':new_user_set & active_user_pl,
            'pt-new-user-ZZ':new_user_set & active_user_zz
        }
        for k,user_set in base_user_sets.iteritems():
            k=k.replace('*','')
            # calculate total
            print 'user base of',k,':',len(user_set)
            key='active_user_initial_%s_total_unique' % (k,)
            #sub_key = k[-2:]
            #if sub_key.find('-')>-1:
            #    sub_key=''
            helper_mysql.put_raw_data(oem_name,stat_category,key,'',len(user_set),db_name,current_date)
            helper_mysql.put_collection(collection=user_set,oem_name=oem_name,category=stat_category, \
                                    key=key,sub_key='',date=current_date,table_name=db_name)

        # retention windows as (start_day, end_day, step): daily over the first week,
        # weekly over 30 days, bi-weekly over 60 days
        ranges=[(1,8,1),(1,30,7),(1,60,14)]

        for r in ranges:
            start=r[0]
            end=r[1]
            step=r[2]

            accumulative_logined_user={
                'pt':set([]),
            }
                
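            # note: this i shadows the outer date loop's i; the outer for rebinds its own i
            # from the range iterator each pass, so the date iteration is unaffected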
            for i in range(start,end,step):
                print start
                print end
                logined_user={
                    'pt':set([]),
                }

                for day_delta in range(i,i+step):
                    target_date=helper_regex.date_add(current_date,day_delta)
                    collection = helper_mysql.get_raw_collection_from_key(oem_name='Gumi_puzzle', \
                        category='user',key='live_log_daily_uid_unique_collection_id',sub_key='', \
                        date=target_date, \
                        table_name='raw_data',db_conn=None) 
                    logined_user['pt'] = logined_user['pt'] | collection

                for k1,v1 in logined_user.iteritems():
                    accumulative_logined_user[k1] |= v1

                for k,user_set in base_user_sets.iteritems():
                    k=k.replace('*','')
                    
                    logined_user_temp=set([])

                    if k.find('pt')>-1:
                        logined_user_temp=logined_user['pt']
                        accumulative_logined_user_temp=accumulative_logined_user['pt']

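                    # retained: base users active in this window; the '- accumulative' set below
                    # gives base users never seen in any window so far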
                    base_user_logined_user= user_set & logined_user_temp
                    key='daily_active_user_'+str(step)+'_day_logined_%s_total_unique' % (k,)
                    helper_mysql.put_raw_data(oem_name,stat_category,key,i,len(base_user_logined_user),db_name,current_date)
                    
                    base_user_no_logined_user= user_set - accumulative_logined_user_temp 
                    key='daily_active_user_'+str(step)+'_day_no_logined_%s_total_unique' % (k,)
                    helper_mysql.put_raw_data(oem_name,stat_category,key,i,len(base_user_no_logined_user),db_name,current_date)

    return
Example #7
    def run(self):
        self.reset()
        
        for log_source in self.log_sources:
            if isinstance(log_source, list):
                logFiles=log_source[:]
            else:
                logFiles=glob.glob(log_source)
            
            print logFiles
            
            for log_file_name in logFiles:
                self.current_file=log_file_name
                
                #check reprocess error
                if log_file_name.lower() in self.processed_log_files:
                    raise Exception('reprocessing log: '+log_file_name)
                else:
                    self.processed_log_files.add(log_file_name.lower())

                #log file start
                helper_mysql.put_raw_data(oem_name=self.log_oem_name, \
                                          category=self.log_category, \
                                          key='original_file_size', \
                                          sub_key=log_file_name, \
                                          value=helper_file.get_file_size(log_file_name), \
                                          table_name=self.log_table_name)
                
                file_size=0
                line_count=0

                print 'load file: '+log_file_name

                #log_file=open(log_file_name,'r',1024*1024*128)
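                # utf-8 decode with the configured error handler; 128 MB read buffer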
                log_file=codecs.open(log_file_name,'r','utf-8',self.encode_exception_treatment,1024*1024*128)
                
                #pass the file path, name to stat sql's
                self.process_line('### Stat_Sql: File Path: '+log_file_name+' ###')

                for line in log_file:
                    #print line
                    line_count+=1
                    file_size+=len(line)

                    self.process_line(line)
                    if line_count % 100000==0:
                        print 'line:',line_count

                log_file.close()    
                
                print 'file size: ',file_size
                print 'line total: ',line_count

                self.line_processed+=line_count
                self.total_file_size+=file_size
        
                #log file end
                helper_mysql.put_raw_data(oem_name=self.log_oem_name, \
                                          category=self.log_category, \
                                          key='line_processed', \
                                          sub_key=log_file_name, \
                                          value=line_count, \
                                          table_name=self.log_table_name)

                helper_mysql.put_raw_data(oem_name=self.log_oem_name, \
                                          category=self.log_category, \
                                          key='file_size_processed', \
                                          sub_key=log_file_name, \
                                          value=file_size, \
                                          table_name=self.log_table_name)

        #dump processed logs
        print 'Dump processed log file list:'
        for i in sorted(list(self.processed_log_files)):
            print i
                    

        for url in self.url_sources:

            print 'url_sources:'+url
            
            self.current_file=url
            
            print 'load url: '+url
            log_file=urllib.urlopen(url)
                
            #pass the file path, name to stat sql's
            self.process_line('### Stat_Sql: Url Path: '+url+' ###')

            while True:
                line=log_file.readline()
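                # readline() returns '' only at EOF; blank lines come back as '\n' and stay truthy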
                if not line:
                    break
                #print line

                self.line_processed+=1
                self.process_line(line)
        
        for raw_content in self.raw_content_sources:

            print 'raw_content_sources:'+str(len(self.raw_content_sources))
            
            self.current_file=raw_content[0:10]
            
            print 'load raw content: '+raw_content

            if not raw_content:
                print 'raw content empty..'
                continue # one empty source should not abort the remaining sources

            for line in raw_content.replace('\r\n','\n').replace('\r','\n').split('\n'):
                if not line:
                    continue # '' here is just a blank line, not EOF; keep processing
                self.line_processed+=1
                self.process_line(line)

        self.do_calculation()