import helper_mysql


def duplicate_record(item, target_date):
    # Copy every raw_data row matching (table, oem_name, category[, key]) on
    # target_date from the source connection item[4] to the target connection
    # item[5], paging through the source table `step` rows at a time.
    step = 10000
    current_idx = 0
    helper_mysql.quick_insert = True
    helper_mysql.print_log = False

    sql = r"select * from %s where `oem_name`='%s' and `category`='%s'" % (item[0], item[1], item[2])
    if item[3]:
        sql += r" and `key`='%s'" % (item[3],)
    sql += r" and `date`='%s'" % (target_date,)

    for i in range(1000):
        counter = 1
        sql_limit = r" limit %s,%s" % (current_idx, step)
        #print sql+sql_limit
        rows = helper_mysql.fetch_rows(sql + sql_limit, db_conn=item[4])
        if not rows:
            print 'end.'
            break
        print target_date, current_idx, len(rows), item

        # copy each fetched row into the target connection
        for row in rows:
            helper_mysql.put_raw_data(
                oem_name=row['oem_name'],
                category=row['category'],
                key=row['key'],
                sub_key=row['sub_key'],
                value=row['value'],
                table_name=item[0],
                date=row['date'],
                created_on=row['created_on'],
                db_conn=item[5])
            counter += 1
            #print counter

        current_idx += step
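# Usage sketch for duplicate_record(). The `item` tuple layout follows the
# indexing above: (table_name, oem_name, category, key, source_db_conn,
# target_db_conn). The two connection variables are placeholders; as in the
# other scripts here, passing None falls back to the default connection, so
# real use would supply two distinct connections.
#
# item = ('raw_data', 'Vodafone', 'sub',
#         'daily_fresh_subscriber_include_unsub',
#         source_db_conn, target_db_conn)
# duplicate_record(item, '2011-11-04')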
import _mysql
import helper_mysql

# One-off fixture script: insert a single raw_data row, then stop.
# Everything after exit() is unreachable and kept only for manual debugging
# (delete the row again, then verify the matched count and first value).
db_name = 'raw_data'
oem_name = 'Vodafone'
category = 'sub'
key = 'daily_fresh_subscriber_include_unsub'
sub_key = ''
date = '2011-11-04'
value = '1024'

helper_mysql.put_raw_data(oem_name=oem_name, category=category, key=key, sub_key=sub_key,
                          value=value, table_name=db_name, date=date, created_on=None, db_conn=None)
exit()

helper_mysql.db.query(r'''
delete from `%s`
where oem_name='%s' and category='%s' and `key`='%s' and `sub_key`='%s' and `date`='%s'
''' % (db_name, oem_name, category, key, sub_key, date))

print 'matched rows:' + str(helper_mysql.get_one_value_int(r'''
select count(*) from `%s`
where oem_name='%s' and category='%s' and `key`='%s' and `date`='%s'
''' % (db_name, oem_name, category, key, date)))

print 'first value:' + str(helper_mysql.get_one_value_int(r'''
select `value` from `%s`
where oem_name='%s' and category='%s' and `key`='%s' and `date`='%s'
''' % (db_name, oem_name, category, key, date)))
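# The interpolated SQL above is safe only for trusted constants. For values
# that may contain quotes, the codebase's own helper_mysql.escape_string()
# (used by the migration snippet below) can be applied first. A minimal
# sketch, assuming escape_string() returns a string safe to splice into a
# single-quoted SQL literal:
def build_delete_sql(db_name, oem_name, category, key, sub_key, date):
    # escape each value before splicing it into the statement
    args = tuple(helper_mysql.escape_string(str(v))
                 for v in (oem_name, category, key, sub_key, date))
    return ("delete from `%s` " % db_name
            + "where oem_name='%s' and category='%s' and `key`='%s' "
              "and `sub_key`='%s' and `date`='%s'" % args)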
# Fragment of the element-collection migration loop. The enclosing fetch of
# `source_rows` (paged upward from current_start_id) is not part of this
# excerpt; the loop header below is reconstructed so the fragment parses.
while True:
    #source_rows=helper_mysql.fetch_rows(...)  # fetch the next page (elided)
    if not source_rows:
        break
    print 'length of source:', len(source_rows)
    print helper_regex.get_time_str_now()

    # build one values tuple per row and track the highest transferred id
    sql_temp = []
    for row in source_rows:
        sql_temp.append("('%s','%s','%s','%s','%s')" %
                        (row['id'], row['created_on'], row['element_count'],
                         row['element_string_md5'],
                         helper_mysql.escape_string(row['element_string']),))
        current_start_id = max(current_start_id, int(row['id']))

    # bulk-insert in slices of target_step rows (slicing clamps at the end)
    for i in range(0, 100000001, target_step):
        batch = sql_temp[i:i + target_step]
        if not batch:
            break
        sql = ('replace into ' + table_name
               + ' (id,created_on,element_count,element_string_md5,element_string) values '
               + ','.join(batch))
        print helper_regex.get_time_str_now() + ' slice: ', i, ', affected:', \
            helper_mysql.execute(sql, db_conn=target_conn)

    # persist the checkpoint so the migration can resume after max_transfered_id
    helper_mysql.put_raw_data(oem_name='Stat_Portal', category='data_migrate',
                              key='max_transfered_id', sub_key=table_name,
                              value=current_start_id, table_name='raw_data_debug',
                              date='', db_conn=None)
    print '===saved max_id:', current_start_id
    print helper_regex.get_time_str_now()

exit()
import helper_mysql
import helper_math
import helper_regex


def calculate_ndays_unique(key_space, db_name, date_units):
    # Calculate n-day unique counts from the stored `*_collection_id` rows,
    # walking day by day from the earliest to the latest available date.
    min_date = helper_mysql.get_one_value_string(r'''
    select min(`date`) from `%s`
    where `oem_name`='%s' and `category`='%s' and `key`='%s_collection_id'
    and `sub_key`='%s' and `date`>='2011-12-16'
    ''' % (db_name, key_space['oem_name'], key_space['category'],
           key_space['key'], key_space['sub_key']))

    max_date = helper_mysql.get_one_value_string(r'''
    select max(`date`) from `%s`
    where `oem_name`='%s' and `category`='%s' and `key`='%s_collection_id'
    and `sub_key`='%s' and `date`>='2011-12-16'
    ''' % (db_name, key_space['oem_name'], key_space['category'],
           key_space['key'], key_space['sub_key']))

    if not min_date or not max_date:
        print 'date error.'
        return

    date_temp = min_date
    #print date_temp
    #exit()

    while True:
        if date_temp >= max_date:
            break

        for date_unit in date_units:
            unique, total, average = helper_math.calculate_count_distinct(
                date_unit=date_unit, oem_name=key_space['oem_name'],
                category=key_space['category'], key=key_space['key'],
                sub_key=key_space['sub_key'], date=date_temp,
                table_name=db_name, allow_collection_empty=True)

            print 'distinct collection calc ' + date_temp + ': date_unit ' + str(date_unit) \
                + ' unique ' + str(unique) + ' total ' + str(total) + ' average ' + str(average)
            #exit()

            key_prefix = helper_regex.regex_replace('_unique$', '', key_space['key'])

            if unique > 0:
                # integer units are day counts; label the stored key accordingly
                suffix = str(date_unit)
                if isinstance(date_unit, (int, long)):
                    suffix += '_days'

                helper_mysql.put_raw_data(oem_name=key_space['oem_name'], category=key_space['category'],
                                          key=key_prefix + '_' + suffix + '_unique',
                                          sub_key=key_space['sub_key'], value=unique,
                                          date=date_temp, table_name=db_name)
                helper_mysql.put_raw_data(oem_name=key_space['oem_name'], category=key_space['category'],
                                          key=key_prefix + '_' + suffix + '_unique_base',
                                          sub_key=key_space['sub_key'], value=total,
                                          date=date_temp, table_name=db_name)
                helper_mysql.put_raw_data(oem_name=key_space['oem_name'], category=key_space['category'],
                                          key=key_prefix + '_' + suffix + '_unique_average',
                                          sub_key=key_space['sub_key'], value=average,
                                          date=date_temp, table_name=db_name)

        date_temp = helper_regex.date_add(date_temp, 1)
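# Usage sketch (hedged): key_space mirrors the fields read above. The key
# below is a hypothetical example ending in '_unique' so that key_prefix
# strips cleanly; integer date_units such as 7 and 30 produce stored keys
# like '<prefix>_7_days_unique'.
if __name__ == '__main__':
    key_space = {
        'oem_name': 'Vodafone',
        'category': 'sub',
        'key': 'daily_fresh_subscriber_unique',  # hypothetical key
        'sub_key': '',
    }
    calculate_ndays_unique(key_space, 'raw_data', [7, 30])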
def _do_calculation(self):
    # module-level imports assumed in the enclosing file: helper_regex,
    # helper_mysql (helper_view is imported locally below).

    # generate new user set: user_id -> creation date
    temp_new_user_dict = {}
    temp_date = self.begin_date
    temp_end_date = self.end_date
    while True:
        if temp_date > temp_end_date:
            break
        temp_daily_new_user_set = self._get_daily_created_user_dict(temp_date)
        for user_id in temp_daily_new_user_set:
            temp_new_user_dict[user_id] = temp_date
        temp_date = helper_regex.date_add(temp_date, 1)

    print len(temp_new_user_dict)
    print temp_new_user_dict
    #exit()

    # generate user active history
    self.temp_new_user_active_history_dict = {}
    temp_date = self.begin_date
    temp_end_date = helper_regex.date_add(self.end_date, self.observation_day_length - 1)
    while True:
        if temp_date > temp_end_date:
            break
        temp_active_user_set = self._get_daily_active_user_set(temp_date)
        for user_id in temp_active_user_set:
            if temp_new_user_dict.has_key(user_id):
                self.temp_new_user_active_history_dict.setdefault(user_id, [])
                # only record activity inside each user's observation window
                if temp_date <= helper_regex.date_add(temp_new_user_dict[user_id],
                                                      self.observation_day_length - 1):
                    self.temp_new_user_active_history_dict[user_id].append(temp_date)
        temp_date = helper_regex.date_add(temp_date, 1)

    print len(self.temp_new_user_active_history_dict)
    print self.temp_new_user_active_history_dict
    #exit()

    # generate user action history: date -> {user_id: action count}
    temp_new_user_action_history_dict = {}
    temp_date = self.begin_date
    temp_end_date = helper_regex.date_add(self.end_date, self.observation_day_length - 1)
    while True:
        if temp_date > temp_end_date:
            break
        temp_user_action_dict = self._get_daily_user_action(temp_date)
        temp_user_action_dict = dict((k, v) for k, v in temp_user_action_dict.iteritems()
                                     if k in temp_new_user_dict)
        temp_new_user_action_history_dict[temp_date] = temp_user_action_dict
        temp_date = helper_regex.date_add(temp_date, 1)

    print len(temp_new_user_action_history_dict)
    print temp_new_user_action_history_dict
    #exit()

    # generate user groups, bucketed by total number of active days
    temp_user_group = {}
    for user_group_by_max_active_time in range(1, self.observation_day_length + 1,
                                               self.observation_day_step):
        temp_user_group[user_group_by_max_active_time] = set(
            [user_id for user_id, history in self.temp_new_user_active_history_dict.iteritems()
             if len(history) >= user_group_by_max_active_time
             and len(history) < user_group_by_max_active_time + self.observation_day_step])

    print len(temp_user_group)
    print temp_user_group
    #exit()

    # generate evolution matrix; dimension 1: time, dimension 2: user groups
    temp_matrix_of_user_action = {}
    temp_matrix_of_user_online_day = {}

    for active_time_period in range(1, self.observation_day_length + 1,
                                    self.observation_day_step):
        for user_group_by_max_active_time in range(1, self.observation_day_length + 1,
                                                   self.observation_day_step):

            temp_matrix_of_user_action.setdefault(active_time_period, {})
            temp_matrix_of_user_action[active_time_period].setdefault(user_group_by_max_active_time, 0)
            temp_matrix_of_user_online_day.setdefault(active_time_period, {})
            temp_matrix_of_user_online_day[active_time_period].setdefault(user_group_by_max_active_time, 0)

            total_action = 0
            total_online_day = 0

            temp_user_set = temp_user_group[user_group_by_max_active_time]
            for user_id in temp_user_set:
                """
                dates=self.temp_new_user_active_history_dict[user_id][active_time_period-1:min(active_time_period+self.observation_day_step-1,len(self.temp_new_user_active_history_dict[user_id]))]
                total_online_day+=len(dates)
                for d in dates:
                    if temp_new_user_action_history_dict[d].has_key(user_id):
                        total_action+=temp_new_user_action_history_dict[d][user_id]
                """
                if active_time_period > len(self.temp_new_user_active_history_dict[user_id]):
                    continue

                dates = self.temp_new_user_active_history_dict[user_id]

                # include those actions that happened while the user was offline
                temp_begin_date = temp_new_user_dict[user_id] \
                    if active_time_period == 1 \
                    else helper_regex.date_add(dates[active_time_period - 1 - 1], 1)

                temp_end_date = dates[-1] \
                    if active_time_period + self.observation_day_step - 1 > len(self.temp_new_user_active_history_dict[user_id]) \
                    else dates[active_time_period + self.observation_day_step - 1 - 1]

                for temp_d in helper_regex.date_iterator(temp_begin_date, temp_end_date):
                    if temp_new_user_action_history_dict[temp_d].has_key(user_id):
                        total_action += temp_new_user_action_history_dict[temp_d][user_id]

                total_online_day += min(active_time_period + self.observation_day_step - 1,
                                        len(self.temp_new_user_active_history_dict[user_id])) \
                    - (active_time_period - 1)

            temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time] = total_action
            temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time] = total_online_day

    print temp_matrix_of_user_action
    print temp_matrix_of_user_online_day

    # export result
    analysis_factor_name = self.analysis_factor_name  # e.g. 'Mutual Friend Relation'
    table_name = self.table_name                      # e.g. 'raw_data_test'
    oem_name = self.oem_name                          # e.g. 'Shabik_360'
    category = self.category                          # e.g. 'evolution_analysis'
    key_prefix = analysis_factor_name.lower().replace(' ', '_') + '_ol%s,n%s,s%s_evolution_' % (
        self.observation_day_length,
        1 + helper_regex.get_day_diff_from_date_str(self.end_date, self.begin_date),
        self.observation_day_step)
    date = self.begin_date

    view_name = 'Report %s Evolution Analysis %s (%s Days Step, %s Days)' % (
        oem_name, analysis_factor_name, self.observation_day_step, self.observation_day_length)

    view_description = r'''
    Date Range of Observed User: %s to %s
    Total Observed New User: %s
    Observing Users' First %s Days
    ''' % (self.begin_date, self.end_date,
           len(self.temp_new_user_active_history_dict), self.observation_day_length)

    for active_time_period, v in temp_matrix_of_user_action.iteritems():
        for user_group_by_max_active_time, total_action in v.iteritems():

            sub_key = 'g' + str(user_group_by_max_active_time).zfill(2) \
                + '_a' + str(active_time_period).zfill(2)

            helper_mysql.put_raw_data(oem_name=oem_name, category=category,
                                      key=key_prefix + 'unique_base', sub_key=sub_key,
                                      value=temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time],
                                      date=date, table_name=table_name)

            helper_mysql.put_raw_data(oem_name=oem_name, category=category,
                                      key=key_prefix + 'unique', sub_key=sub_key,
                                      value=len(temp_user_group[user_group_by_max_active_time]),
                                      date=date, table_name=table_name)

            # normalize actions by online days to get a per-step adjusted base
            adjusted_base = 0
            if temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time] > 0:
                adjusted_base = 1.0 * self.observation_day_step \
                    * temp_matrix_of_user_action[active_time_period][user_group_by_max_active_time] \
                    / temp_matrix_of_user_online_day[active_time_period][user_group_by_max_active_time]

            helper_mysql.put_raw_data(oem_name=oem_name, category=category,
                                      key=key_prefix + 'unique_base_adjusted', sub_key=sub_key,
                                      value=adjusted_base, date=date, table_name=table_name)

            helper_mysql.put_raw_data(oem_name=oem_name, category=category,
                                      key=key_prefix + 'total', sub_key=sub_key,
                                      value=len(self.temp_new_user_active_history_dict),
                                      date=date, table_name=table_name)

    # generate view sql
    sql_template = r'''
SELECT
concat('Online for '
    ,replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g','')
    ,'d-'
    ,lpad(replace(SUBSTRING_INDEX(`sub_key`,'_',1),'g','')+%(observation_day_step)s-1,2,'0')
    ,'d') as `Group Name`
,max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0)) as `Group Size`
,concat(format(100.0
    *max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique',`value`,0))
    /max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal',`value`,0))
,2),'%%%%') as `Group Proportion`
%%(column_sql)s
FROM `%(table_name)s`
WHERE (
    `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique'
    or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base'
    or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted'
    or `oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)stotal'
) and date='%(date)s'
GROUP BY `Group Name`
ORDER BY `Group Name` DESC
''' % {
        'observation_day_step': self.observation_day_step,
        'oem_name': oem_name,
        'category': category,
        'key_prefix': key_prefix,
        'table_name': table_name,
        'date': date,
    }

    sql_column_template = r'''
,case when max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base'
        and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0))>0
    then max(if(`oem_name`='%(oem_name)s' and category='%(category)s' and `key` = '%(key_prefix)sunique_base_adjusted'
        and SUBSTRING_INDEX(`sub_key`,'_',-1)='a%(active_time_period)s',`value`,0))
    else '-' end
    as `Avg %(analysis_factor_name)s Created in [%(active_time_period)sd,%(active_time_period_end)sd]`
'''

    sql_columns = []
    for active_time_period in range(1, self.observation_day_length + 1,
                                    self.observation_day_step):
        temp_sql_column = sql_column_template % {
            'observation_day_step': self.observation_day_step,
            'oem_name': oem_name,
            'category': category,
            'key_prefix': key_prefix,
            'table_name': table_name,
            'date': date,
            'active_time_period': str(active_time_period).zfill(2),
            'active_time_period_end': str(active_time_period + self.observation_day_step - 1).zfill(2),
            'analysis_factor_name': analysis_factor_name,
        }
        sql_columns.append(temp_sql_column)

    view_sql = sql_template % {'column_sql': '\n'.join(sql_columns)}
    print view_sql

    import helper_view
    helper_view.replace_view(view_name=view_name, view_sql=view_sql,
                             view_description=view_description,
                             charting_javascript=r'''
    add_highcharts_basic_line_chart({
        'tab_name':'Trend Comparison',
        'column_names_pattern':/Online /ig,
        'marginRight':300,
        'reverse_key_column':false,
        'reverse_table':true,
        'exclude_rows':['Group Size','Group Proportion'],
        'reverse_column':true
    });
    ''')
    helper_view.grant_view(view_name, '5')
    helper_view.grant_view(view_name, '17')
    pass
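# Usage sketch (hedged): EvolutionAnalysis is a placeholder name for the class
# this _do_calculation() method belongs to; the original constructor is not
# shown in this excerpt. The attributes match what the method reads, and the
# sample values come from the inline comments above.
#
# analysis = EvolutionAnalysis()
# analysis.begin_date = '2011-12-16'
# analysis.end_date = '2011-12-31'
# analysis.observation_day_length = 30
# analysis.observation_day_step = 7
# analysis.analysis_factor_name = 'Mutual Friend Relation'
# analysis.table_name = 'raw_data_test'
# analysis.oem_name = 'Shabik_360'
# analysis.category = 'evolution_analysis'
# analysis._do_calculation()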
import helper_regex
import helper_mysql
import gumi_helper_user


def stat_login():

    global date_min, date_max, base_user_sets

    oem_name = 'All'
    stat_category = 'daily_active_user_retain'
    db_name = 'raw_data_login_trend'

    # the day range (30 days) can be changed here
    date_max = helper_regex.date_add(helper_regex.get_date_str_now(), -1)
    date_min = helper_regex.date_add(date_max, -30)

    for i in range(1, 10000):
        current_date = helper_regex.date_add(date_min, i)
        print 'current date', current_date
        if current_date > date_max:
            break

        # new user set from db (overall daily active user)
        new_user_set = gumi_helper_user.get_user_ids_created_by_date(current_date)

        # daily active users by country: SG, US, PL and ZZ (unknown IP)
        active_user_sg = helper_mysql.get_raw_collection_from_key(
            oem_name='Gumi_puzzle', category='user',
            key='live_log_by_country_daily_uid_unique_collection_id', sub_key='SG',
            date=current_date, table_name='raw_data', db_conn=None)

        active_user_us = helper_mysql.get_raw_collection_from_key(
            oem_name='Gumi_puzzle', category='user',
            key='live_log_by_country_daily_uid_unique_collection_id', sub_key='US',
            date=current_date, table_name='raw_data', db_conn=None)

        active_user_pl = helper_mysql.get_raw_collection_from_key(
            oem_name='Gumi_puzzle', category='user',
            key='live_log_by_country_daily_uid_unique_collection_id', sub_key='PL',
            date=current_date, table_name='raw_data', db_conn=None)

        active_user_zz = helper_mysql.get_raw_collection_from_key(
            oem_name='Gumi_puzzle', category='user',
            key='live_log_by_country_daily_uid_unique_collection_id', sub_key='ZZ',
            date=current_date, table_name='raw_data', db_conn=None)

        base_user_sets = {
            'pt-new-user-': new_user_set,
            'pt-new-user-SG': new_user_set & active_user_sg,
            'pt-new-user-US': new_user_set & active_user_us,
            'pt-new-user-PL': new_user_set & active_user_pl,
            'pt-new-user-ZZ': new_user_set & active_user_zz,
        }

        for k, user_set in base_user_sets.iteritems():
            k = k.replace('*', '')

            # calculate totals
            print 'user base of', k, ':', len(user_set)
            key = 'active_user_initial_%s_total_unique' % (k,)
            #sub_key = k[-2:]
            #if sub_key.find('-')>-1:
            #    sub_key=''
            helper_mysql.put_raw_data(oem_name, stat_category, key, '',
                                      len(user_set), db_name, current_date)
            helper_mysql.put_collection(collection=user_set, oem_name=oem_name,
                                        category=stat_category, key=key, sub_key='',
                                        date=current_date, table_name=db_name)

        # calculate retention over (start, end, step) windows
        ranges = [(1, 8, 1), (1, 30, 7), (1, 60, 14)]

        for r in ranges:
            start, end, step = r

            accumulative_logined_user = {
                'pt': set([]),
            }

            # note: this inner `i` shadows the outer day-loop variable, which
            # is harmless for a Python for loop but easy to misread
            for i in range(start, end, step):
                print start
                print end

                logined_user = {
                    'pt': set([]),
                }

                # union of the daily login collections for this window
                for day_delta in range(i, i + step):
                    target_date = helper_regex.date_add(current_date, day_delta)
                    collection = helper_mysql.get_raw_collection_from_key(
                        oem_name='Gumi_puzzle', category='user',
                        key='live_log_daily_uid_unique_collection_id', sub_key='',
                        date=target_date, table_name='raw_data', db_conn=None)
                    logined_user['pt'] = logined_user['pt'] | collection

                for k1, v1 in logined_user.iteritems():
                    accumulative_logined_user[k1] |= v1

                for k, user_set in base_user_sets.iteritems():
                    k = k.replace('*', '')

                    logined_user_temp = set([])
                    if k.find('pt') > -1:
                        logined_user_temp = logined_user['pt']
                        accumulative_logined_user_temp = accumulative_logined_user['pt']

                    # users of the base set who logged in during this window
                    base_user_logined_user = user_set & logined_user_temp
                    key = 'daily_active_user_' + str(step) + '_day_logined_%s_total_unique' % (k,)
                    helper_mysql.put_raw_data(oem_name, stat_category, key, i,
                                              len(base_user_logined_user), db_name, current_date)

                    # users of the base set who never logged in up to this window
                    base_user_no_logined_user = user_set - accumulative_logined_user_temp
                    key = 'daily_active_user_' + str(step) + '_day_no_logined_%s_total_unique' % (k,)
                    helper_mysql.put_raw_data(oem_name, stat_category, key, i,
                                              len(base_user_no_logined_user), db_name, current_date)

    return
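# stat_login() takes no arguments; the 30-day window is derived from
# yesterday's date at the top of the function, so a cron entry point is just:
if __name__ == '__main__':
    stat_login()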
def run(self):
    # module-level imports assumed in the enclosing file: glob, codecs,
    # urllib, helper_mysql, helper_file.
    self.reset()

    for log_source in self.log_sources:
        # each source is either an explicit list of paths or a glob pattern
        if isinstance(log_source, list):
            logFiles = log_source[:]
        else:
            logFiles = glob.glob(log_source)
        print logFiles

        for log_file_name in logFiles:
            self.current_file = log_file_name

            # guard against processing the same log twice
            if log_file_name.lower() in self.processed_log_files:
                raise Exception('reprocessing log: ', log_file_name)
            else:
                self.processed_log_files.add(log_file_name.lower())

            # log file start
            helper_mysql.put_raw_data(oem_name=self.log_oem_name,
                                      category=self.log_category,
                                      key='original_file_size',
                                      sub_key=log_file_name,
                                      value=helper_file.get_file_size(log_file_name),
                                      table_name=self.log_table_name)

            file_size = 0
            line_count = 0

            print 'load file: ' + log_file_name
            #log_file=open(log_file_name,'r',1024*1024*128)
            log_file = codecs.open(log_file_name, 'r', 'utf-8',
                                   self.encode_exception_treatment, 1024 * 1024 * 128)

            # pass the file path and name to the stat sql layer
            self.process_line('### Stat_Sql: File Path: ' + log_file_name + ' ###')

            for line in log_file:
                #print line
                line_count += 1
                file_size += len(line)
                self.process_line(line)
                if line_count % 100000 == 0:
                    print 'line:', line_count

            log_file.close()

            print 'file size: ', file_size
            print 'line total: ', line_count

            self.line_processed += line_count
            self.total_file_size += file_size

            # log file end
            helper_mysql.put_raw_data(oem_name=self.log_oem_name,
                                      category=self.log_category,
                                      key='line_processed',
                                      sub_key=log_file_name,
                                      value=line_count,
                                      table_name=self.log_table_name)

            helper_mysql.put_raw_data(oem_name=self.log_oem_name,
                                      category=self.log_category,
                                      key='file_size_processed',
                                      sub_key=log_file_name,
                                      value=file_size,
                                      table_name=self.log_table_name)

    # dump processed logs
    print 'Dump processed log file list:'
    for i in sorted(list(self.processed_log_files)):
        print i

    for url in self.url_sources:
        print 'url_sources:' + url
        self.current_file = url
        print 'load url: ' + url
        log_file = urllib.urlopen(url)

        # pass the url to the stat sql layer
        self.process_line('### Stat_Sql: Url Path: ' + url + ' ###')

        while True:
            line = log_file.readline()
            if not line:
                break
            #print line
            self.line_processed += 1
            self.process_line(line)

    for raw_content in self.raw_content_sources:
        print 'raw_content_sources:' + str(len(self.raw_content_sources))
        self.current_file = raw_content[0:10]
        print 'load raw content: ' + raw_content
        if not raw_content:
            print 'raw content empty..'
            break
        for line in raw_content.replace('\r\n', '\n').replace('\r', '\n').split('\n'):
            if not line:
                break  # as in the original: stops at the first empty line
            self.line_processed += 1
            self.process_line(line)

    self.do_calculation()
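# Usage sketch (hedged): LogProcessor is a placeholder name for the class this
# run() method belongs to; reset(), process_line() and do_calculation() are
# other methods of the same class not shown in this excerpt, and the
# attributes below match what run() reads.
#
# processor = LogProcessor()
# processor.log_sources = ['/var/log/app/access-*.log']  # glob patterns or explicit lists
# processor.url_sources = []
# processor.raw_content_sources = []
# processor.run()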