from datetime import datetime

def process(my_date):
    # accumulate each app's collection for the given day and count, per
    # element, on how many days it appears in each app
    global whole_collection, element_existance_counters, sql_collection_id

    current_date = datetime.fromtimestamp(my_date).strftime('%Y-%m-%d')
    collection_ids = helper_mysql.fetch_dict(sql_collection_id % (current_date,))

    for app, collection_id in collection_ids.iteritems():
        collection = helper_mysql.get_raw_collection_by_id(collection_id)
        if not collection_id or not collection:
            print 'No collection of', current_date, collection_id
            continue

        if app not in whole_collection:
            whole_collection[app] = set([])
        whole_collection[app] |= collection

        for i in collection:
            if i not in element_existance_counters:
                element_existance_counters[i] = {}
            if app not in element_existance_counters[i]:
                element_existance_counters[i][app] = 0
            element_existance_counters[i][app] += 1
def process(client_type):
    # tag every member of this client type's collections, then write the
    # type number back into user_figure_base
    global whole_collection, element_existance_counters, sql_collection_id, client_number_dict

    current_collection = {}
    type_number = client_number_dict[client_type]
    collection_ids = helper_mysql.fetch_dict(sql_collection_id % (client_type,))
    print collection_ids

    for date, collection_id in collection_ids.iteritems():
        collection = helper_mysql.get_raw_collection_by_id(collection_id)
        if not collection_id or not collection:
            print 'No collection of', collection_id
            continue
        whole_collection |= collection
        for i in collection:
            current_collection[i] = type_number

    print len(current_collection)

    field_name = 'client-type'
    sql = r'''
    update mozat_clustering.user_figure_base
    set `%s`='%s'
    where `oem_id`=7 and `user_id`='%s'
    limit 1
    '''
    for user_id, t in current_collection.iteritems():
        helper_mysql.execute(sql % (field_name, t, user_id))
def get_cached_sql_result_as_dict(sql, db_conn):
    if db_conn['db_type'] not in ('sql_server', 'mysql'):
        raise ValueError('db_type error')

    # cache key: md5 over the sorted connection settings plus the query text
    unique_key = helper_math.md5(','.join(sorted(db_conn.values())) + sql)

    cached_result = helper_mysql.get_dict_of_raw_collection_from_key(
        oem_name='', category='', key='', sub_key=unique_key, date='',
        table_name='raw_data_cache_sql_result', db_conn=None)
    if cached_result:
        print 'get from cache:', unique_key
        return cached_result

    if db_conn['db_type'] == 'sql_server':
        cached_result = helper_sql_server.fetch_dict(conn_config=db_conn, sql=sql)
    else:
        cached_result = helper_mysql.fetch_dict(db_conn=db_conn, sql=sql)

    helper_mysql.put_collection_with_value(
        collection=cached_result, oem_name='', category='', key='',
        sub_key=unique_key, table_name='raw_data_cache_sql_result',
        date='', created_on=None, db_conn=None)
    print 'push into cache:', unique_key
    return cached_result
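# A minimal usage sketch for the cache wrapper above. The connection-dict
# fields besides 'db_type' are assumptions (whatever helper_mysql /
# helper_sql_server actually expect), not confirmed by the snippet.
db_conn = {'db_type': 'mysql', 'host': '127.0.0.1', 'user': 'stat',
           'passwd': 'secret', 'db': 'mozat_clustering'}
result = get_cached_sql_result_as_dict(
    "select `user_id`,`value` from mozat_clustering.user_figure_base limit 10",
    db_conn)
# a second call with the identical db_conn and sql is served from the
# raw_data_cache_sql_result table instead of re-running the query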
def process(my_date):
    global sql_collection_id

    # current_date is computed here but not used below
    current_date = datetime.fromtimestamp(my_date).strftime('%Y-%m-%d')
    info_dict = helper_mysql.fetch_dict(sql_collection_id)
    print len(info_dict)

    sql = r'''
    update mozat_clustering.user_figure_base
    set `%s`='%s'
    where `oem_id`=7 and `user_id`='%s'
    limit 1
    '''
    field_name = 'client-type'
    for user_id, value in info_dict.iteritems():
        helper_mysql.execute(sql % (field_name, value, user_id))
    # (tail of an update statement whose beginning is elided)
    % (db_name, oem_name, category, key, sub_key, value, date))

print 'updated value:' + str(helper_mysql.get_one_value_int(r'''
select `value` from `%s`
where oem_name='%s' and category='%s' and `key`='%s' and `date`='%s'
''' % (db_name, oem_name, category, key, date)))

exit()

for i in range(1, 100, 1):
    sql = r'''
    select `sub_key`,`value`
    from raw_data_url_pattern
    where `oem_name`='STC'
    and `category`='moagent'
    and `key`='app_page_by_url_pattern_daily_visitor_unique'
    and `date`="%s"
    and `sub_key` like '%%jit%%'
    order by date desc;
    ''' % (helper_regex.date_add('2011-06-01', i),)
    #print sql
    # caution: the printed label is offset from '2011-07-01' while the query
    # uses '2011-06-01', so the label and the result may not match
    print helper_regex.date_add('2011-07-01', i)
    result = helper_mysql.fetch_dict(sql)
    for k, v in result.iteritems():
        print k, v

exit()
def calculate_date_range_retain_rate(date_unit, oem_name, category, key, sub_key, date, table_name='raw_data'):
    if date_unit < 1:
        date_unit = 1

    base_size, retain_rate, fresh_rate, lost_rate = 0, 0, 1, 1
    retained_base_size, lost_base_size, fresh_base_size = 0, 0, 0

    if not date:
        return base_size, retain_rate, fresh_rate, lost_rate, retained_base_size, lost_base_size, fresh_base_size

    key = key.replace('_collection_id', '') + '_collection_id'
    sql = _get_sql_select_collection_id_by_date(oem_name, category, key, sub_key, table_name)
    collection_id_dict = helper_mysql.fetch_dict(sql)

    # debug: list existing collections
    # for i in sorted(collection_id_dict.keys(), reverse=True)[0:65]:
    #     print i + ': ' + str(collection_id_dict[i])

    # current window: the date_unit days ending on `date`
    col_1 = set([])
    for i in range(0, date_unit):
        date_temp = helper_regex.date_add(date, -i)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_temp = helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_1 |= col_temp
        if col_id_temp == 0:
            # force return null when data not complete
            return base_size, retain_rate, fresh_rate, lost_rate, retained_base_size, lost_base_size, fresh_base_size

    base_size = len(col_1)

    # previous window: the date_unit days immediately before the current one
    col_2 = set([])
    for i in range(date_unit, date_unit + date_unit):
        date_temp = helper_regex.date_add(date, -i)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_temp = helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_2 |= col_temp

    retain = col_1 & col_2  # seen in both windows
    fresh = col_1 - col_2   # new in the current window
    lost = col_2 - col_1    # gone from the previous window

    retained_base_size, lost_base_size, fresh_base_size = len(retain), len(lost), len(fresh)

    if len(col_2) > 0:
        retain_rate = 1.0 * len(retain) / len(col_2)
        lost_rate = 1.0 * len(lost) / len(col_2)
    if len(col_1) > 0 and len(col_2) > 0:
        fresh_rate = 1.0 * len(fresh) / len(col_1)

    return base_size, retain_rate, fresh_rate, lost_rate, retained_base_size, lost_base_size, fresh_base_size
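# A hedged call sketch for the retain-rate helper; 'active_user' is a
# hypothetical key name, not taken from the code above.
(base_size, retain_rate, fresh_rate, lost_rate,
 retained_n, lost_n, fresh_n) = calculate_date_range_retain_rate(
    date_unit=7, oem_name='STC', category='moagent',
    key='active_user', sub_key='', date='2011-07-01')
# retain_rate: share of the previous 7-day window seen again in the current
# window; fresh_rate: share of the current window not seen before
print base_size, retain_rate, fresh_rate, lost_rate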
def calculate_count_distinct_named_collection(date_unit, oem_name, category, key, sub_key, date, table_name='raw_data', allow_collection_empty=False):
    # date_unit accepts 1,2,3,...,'weekly','monthly'
    # for 'weekly', it produces a result only when date is a Sunday, else 0
    # for 'monthly', it produces a result only when date is the last day of the month, else 0
    # in all cases, it produces no value unless all required collections are ready
    unique = 0
    total = 0
    average = 0

    if not date:
        return unique, total, average

    if date_unit == 'weekly':
        if helper_regex.get_weekday_from_date_str(date) != 7:
            return unique, total, average
        date_unit = 7
    elif date_unit == 'monthly':
        if helper_regex.extract(helper_regex.date_add(date, 1), r'\d+\-\d+\-(\d+)') != '01':
            return unique, total, average
        first_date = helper_regex.extract(date, r'(\d+\-\d+\-)\d+') + '01'
        date_unit = helper_regex.get_day_diff_from_date_str(date, first_date) + 1

    if date_unit < 1:
        date_unit = 1

    key = key.replace('_collection_id', '')

    sql = _get_sql_select_collection_id_by_date(oem_name, category, key, sub_key, table_name)
    collection_id_dict = helper_mysql.fetch_dict(sql)

    sql = _get_sql_select_collection_id_by_date(oem_name, category, key + '_base', sub_key, table_name)
    collection_base_dict = helper_mysql.fetch_dict(sql)

    # debug: list existing collections
    # for i in sorted(collection_id_dict.keys(), reverse=True)[0:65]:
    #     print i + ': ' + str(collection_id_dict[i])

    col_1 = set([])
    base_total = 0

    for i in range(0, date_unit):
        date_temp = helper_regex.date_add(date, -i)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        #col_temp = helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_temp = helper_collection.get_named_collection(table_name=table_name, oem_name=oem_name, category=category,
                                                          key=key, sub_key=sub_key, date=date_temp)
        col_1 |= col_temp
        base_total += int(collection_base_dict.get(date_temp, 0))
        if col_id_temp == 0:
            # force return null when data not complete
            if allow_collection_empty:
                print date_temp, table_name, oem_name, category, key, sub_key, 'collection empty error! passed.'
            else:
                print date_temp, table_name, oem_name, category, key, sub_key, 'collection empty error! exit.'
                return unique, total, average

    unique = len(col_1)
    total = base_total
    average = base_total * 1.0 / unique if unique > 0 else 0

    return unique, total, average
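# A hedged usage sketch: weekly unique visitors, total visits, and visits per
# visitor; the key name is hypothetical. 2011-07-03 is a Sunday, so 'weekly'
# actually produces a value here; any other weekday returns (0, 0, 0) by design.
unique, total, average = calculate_count_distinct_named_collection(
    date_unit='weekly', oem_name='STC', category='moagent',
    key='app_page_daily_visitor', sub_key='', date='2011-07-03')
print unique, total, average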
def calculate_date_range_average_life_cycle(date_unit, oem_name, category, key, sub_key, date, table_name='raw_data'):
    if date_unit < 1:
        date_unit = 1

    lost_col_average_life_cycle = 0
    retained_col_average_life_cycle = 0

    if not date:
        return lost_col_average_life_cycle, retained_col_average_life_cycle, {}, {}

    key = key.replace('_collection_id', '') + '_collection_id'
    sql = _get_sql_select_collection_id_by_date(oem_name, category, key, sub_key, table_name)
    collection_id_dict = helper_mysql.fetch_dict(sql)

    # current window
    col_1 = set([])
    for i in range(0, date_unit):
        date_temp = helper_regex.date_add(date, -i)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_temp = helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_1 |= col_temp
        if col_id_temp == 0:
            # force return null when data not complete
            return lost_col_average_life_cycle, retained_col_average_life_cycle, {}, {}

    # previous window
    col_2 = set([])
    for i in range(date_unit, date_unit + date_unit):
        date_temp = helper_regex.date_add(date, -i)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_temp = helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_2 |= col_temp

    lost_col = col_2 - col_1
    retained_col = col_2 & col_1

    lost_col_len = len(lost_col)
    retained_col_len = len(retained_col)

    # count, for each member, on how many past days it appears
    lost_col_dict = dict([(k, 0) for k in lost_col])
    retained_col_dict = dict([(k, 0) for k in retained_col])

    for i in range(0, 2000):
        date_temp = helper_regex.date_add(date, -i)
        if date_temp == '2010-01-01':
            break
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_temp = helper_mysql.get_raw_collection_by_id(col_id_temp)
        for member in col_temp:  # renamed from `i`, which shadowed the date loop variable
            if member in lost_col_dict:
                lost_col_dict[member] += 1
            if member in retained_col_dict:
                retained_col_dict[member] += 1

    if lost_col_len > 0:
        lost_col_average_life_cycle = sum(lost_col_dict.values()) * 1.0 / lost_col_len
    if retained_col_len > 0:
        retained_col_average_life_cycle = sum(retained_col_dict.values()) * 1.0 / retained_col_len

    return lost_col_average_life_cycle, retained_col_average_life_cycle, lost_col_dict, retained_col_dict
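# A hedged sketch comparing how long churned vs. retained members had been
# active; 'active_user' is a hypothetical key name.
lost_avg, retained_avg, lost_days, retained_days = calculate_date_range_average_life_cycle(
    date_unit=7, oem_name='STC', category='moagent',
    key='active_user', sub_key='', date='2011-07-01')
# each dict maps a member to the number of past days it appeared; a much lower
# lost_avg suggests churn concentrates among short-lived members
print lost_avg, retained_avg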
#fetch collection id

# target collections: the requested date range
sql = r'''
select `date`,`value` from `%s`
where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s'
and `date`>='%s' and `date`<='%s'
''' % (db_name, oem_name, category, key, sub_key, date_begin, date_end)
target_collection_ids = helper_mysql.fetch_dict(sql)

# base collections: every dated row for the same key
sql = r'''
select `date`,`value` from `%s`
where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s'
and `date`!=''
''' % (db_name, oem_name, category, key, sub_key)
base_collection_ids = helper_mysql.fetch_dict(sql)

#target collection
import os

def export(target_key):
    lacked_dates = []
    exported_dates = []
    dir_name = '.\\export_collection\\' + '_'.join(target_key).strip('_')

    collection_ids = helper_mysql.fetch_dict(sql=r'''
    select `date`,`value` from `%s`
    where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s'
    order by date desc
    limit 200
    ''' % (target_key[0], target_key[1], target_key[2], target_key[3], target_key[4],))

    helper_file.prepare_directory_on_windows(dir_name)
    print 'collection_ids:', len(collection_ids)

    # load user_id -> msisdn mapping
    user_id_to_msisdn = {}
    file_handler = open('E:\\WebStatShare\\vodafone_user_id_to_msisdn.csv', 'r')
    for line in file_handler:
        line = line.strip(os.linesep).strip()
        if not line:
            continue
        c = line.find(',')
        if c == -1:
            continue
        msisdn = line[c + 1:].strip('X').strip()
        if msisdn.isdigit():
            user_id_to_msisdn[int(line[0:c])] = int(msisdn)
    file_handler.close()
    print 'user_id_to_msisdn:', len(user_id_to_msisdn)

    # write one file of msisdns per date
    for date, collection_id in collection_ids.iteritems():
        collection = helper_mysql.get_raw_collection_by_id(collection_id)
        if not collection:
            lacked_dates.append(date)
            continue
        msisdn_set = set([])
        for i in collection:
            if not i.isdigit():
                continue
            user_id = int(i)
            if user_id in user_id_to_msisdn:
                msisdn_set.add(user_id_to_msisdn[user_id])
        helper_file.write_big_string_set_to_file(dir_name + '\\' + date + '.txt', msisdn_set)
        exported_dates.append((date, len(collection), len(msisdn_set)))

    print 'lacked_dates:', lacked_dates
    print 'exported_dates:', exported_dates
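# A hedged call sketch: target_key is (table_name, oem_name, category, key,
# sub_key); the concrete category/key values below are hypothetical.
export(('raw_data', 'Vodafone', 'moagent', 'daily_visitor_collection_id', ''))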