def calculate_retain_rate_of_2_collections(collection_id_current, collection_id_base):
    """Compare two stored collections and report retention statistics.

    Returns a 7-tuple: (base_size, retain_rate, fresh_rate, lost_rate,
    retained_base_size, lost_base_size, fresh_base_size).  When the base
    collection is empty the rates keep their defaults (retain_rate=0,
    fresh_rate=1, lost_rate=1).
    """
    current = helper_mysql.get_raw_collection_by_id(collection_id_current)
    base = helper_mysql.get_raw_collection_by_id(collection_id_base)

    retained = current & base   # members present in both collections
    fresh = current - base      # members that newly appeared
    lost = base - current       # members that disappeared

    base_size = len(current)
    retained_base_size = len(retained)
    lost_base_size = len(lost)
    fresh_base_size = len(fresh)

    retain_rate = 0
    fresh_rate = 1
    lost_rate = 1
    if len(base) > 0:
        retain_rate = 1.0 * len(retained) / len(base)
        lost_rate = 1.0 * len(lost) / len(base)
    if len(current) > 0 and len(base) > 0:
        fresh_rate = 1.0 * len(fresh) / len(current)

    return (base_size, retain_rate, fresh_rate, lost_rate,
            retained_base_size, lost_base_size, fresh_base_size)
def process(my_date): global whole_collection, element_existance_counters, sql_collection_id current_date=datetime.fromtimestamp(my_date).strftime('%Y-%m-%d') collection_ids = helper_mysql.fetch_dict(sql_collection_id % (current_date,)) for app,collection_id in collection_ids.iteritems(): collection=helper_mysql.get_raw_collection_by_id(collection_id) if not collection_id or not collection: print 'No collection of',current_date,collection_id continue if not whole_collection.has_key(app): whole_collection[app]=set([]) whole_collection[app]|=collection for i in collection: if not element_existance_counters.has_key(i): element_existance_counters[i]={} if not element_existance_counters[i].has_key(app): element_existance_counters[i][app]=0 element_existance_counters[i][app]+=1
def process(my_date): global whole_collection, element_existance_counters, sql_collection_id current_date=datetime.fromtimestamp(my_date).strftime('%Y-%m-%d') collection_id = helper_mysql.get_one_value_int(sql_collection_id % (current_date,)) collection=helper_mysql.get_raw_collection_by_id(collection_id) if not collection_id or not collection: print 'No collection of',current_date,collection_id return whole_collection|=collection for i in collection: if not element_existance_counters.has_key(i): element_existance_counters[i]=0 element_existance_counters[i]+=1
def process(client_type): global whole_collection, element_existance_counters, sql_collection_id, client_dict current_collection={} type_number=client_number_dict[client_type] collection_ids = helper_mysql.fetch_dict(sql_collection_id % (client_type,)) print collection_ids for date,collection_id in collection_ids.iteritems(): collection=helper_mysql.get_raw_collection_by_id(collection_id) if not collection_id or not collection: print 'No collection of',collection_id continue whole_collection|=collection for i in collection: current_collection[i]=type_number print len(current_collection) field_name='client-type' sql=r''' update mozat_clustering.user_figure_base set `%s`='%s' where `oem_id`=7 and `user_id`='%s' limit 1 ''' for user_id, t in current_collection.iteritems(): helper_mysql.execute(sql % (field_name,t,user_id))
def _union_window(collection_id_dict, date, start_offset, date_unit, stop_on_missing):
    """Union the stored collections for the date_unit days ending at
    (date - start_offset).  Returns (union_set, complete); complete goes
    False when stop_on_missing is set and a day has no stored collection id,
    in which case the scan stops early.
    """
    union = set([])
    for i in range(start_offset, start_offset + date_unit):
        date_temp = helper_regex.date_add(date, -i)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        union |= helper_mysql.get_raw_collection_by_id(col_id_temp)
        if stop_on_missing and col_id_temp == 0:
            return union, False
    return union, True


def calculate_date_range_retain_rate(date_unit, oem_name, category, key, sub_key, date,
                                     table_name='raw_data'):
    """Retention stats for the date_unit-day window ending at `date`, compared
    against the immediately preceding window of the same length.

    Returns (base_size, retain_rate, fresh_rate, lost_rate,
    retained_base_size, lost_base_size, fresh_base_size).  The null result
    (0, 0, 1, 1, 0, 0, 0) is forced when `date` is empty or when any day in
    the current window has no stored collection.
    """
    if date_unit < 1:
        date_unit = 1

    base_size, retain_rate, fresh_rate, lost_rate = 0, 0, 1, 1
    retained_base_size, lost_base_size, fresh_base_size = 0, 0, 0
    null_result = (base_size, retain_rate, fresh_rate, lost_rate,
                   retained_base_size, lost_base_size, fresh_base_size)
    if not date:
        return null_result

    # collection ids are stored under the '<key>_collection_id' key
    key = key.replace('_collection_id', '') + '_collection_id'
    sql = _get_sql_select_collection_id_by_date(oem_name, category, key, sub_key, table_name)
    collection_id_dict = helper_mysql.fetch_dict(sql)

    # Current window; force the null result when its data is incomplete.
    col_1, complete = _union_window(collection_id_dict, date, 0, date_unit, True)
    if not complete:
        return null_result
    base_size = len(col_1)

    # Preceding window; missing days are tolerated here.
    col_2, _ = _union_window(collection_id_dict, date, date_unit, date_unit, False)

    retain = col_1 & col_2
    fresh = col_1 - col_2
    lost = col_2 - col_1
    retained_base_size, lost_base_size, fresh_base_size = len(retain), len(lost), len(fresh)

    if len(col_2) > 0:
        retain_rate = 1.0 * len(retain) / len(col_2)
        lost_rate = 1.0 * len(lost) / len(col_2)
    if len(col_1) > 0 and len(col_2) > 0:
        fresh_rate = 1.0 * len(fresh) / len(col_1)

    return (base_size, retain_rate, fresh_rate, lost_rate,
            retained_base_size, lost_base_size, fresh_base_size)
def calculate_count_distinct(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data',allow_collection_empty=False): #date_unit accepts 1,2,3,...,'weekly','monthly' #for weekly, it produces result only when date is Sunday, else 0 #for monthly, it produces result only when date is the last day of a week, else 0 #for all cases, it doesn't produce value when required collections are not all ready unique=0 total=0 average=0 if not date: return unique,total,average if date_unit=='weekly': if helper_regex.get_weekday_from_date_str(date)!=7: return unique,total,average date_unit=7 elif date_unit=='monthly': if helper_regex.extract(helper_regex.date_add(date,1),r'\d+\-\d+\-(\d+)')!='01': return unique,total,average first_date=helper_regex.extract(date,r'(\d+\-\d+\-)\d+')+'01' date_unit=helper_regex.get_day_diff_from_date_str(date,first_date)+1 if date_unit<1: date_unit=1 key=key.replace('_collection_id','') sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_collection_id',sub_key,table_name) collection_id_dict=helper_mysql.fetch_dict(sql) key_temp=collection_id_dict.keys() key_temp.sort(reverse=True) sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_base',sub_key,table_name) #print sql collection_base_dict=helper_mysql.fetch_dict(sql) #print collection_base_dict #exit() """ print 'existing collection list:' for i in key_temp[0:65]: print i+': '+str(collection_id_dict[i]) """ col_1=set([]) base_total=0 for i in range(0,date_unit): date_temp=helper_regex.date_add(date,-i) col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0 col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp) col_1 |= col_temp base_total+=int(collection_base_dict[date_temp]) if collection_base_dict.has_key(date_temp) else 0 if col_id_temp==0: #force return null when data not complete if allow_collection_empty: print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! passed.' 
else: print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! exit.' return unique,total,average unique=len(col_1) total=base_total average=base_total*1.0/unique if unique>0 else 0 return unique,total,average
def calculate_date_range_average_life_cycle(date_unit, oem_name, category, key, sub_key,
                                            date, table_name='raw_data'):
    """Average historical presence ("life cycle") of lost vs retained members.

    Builds the date_unit-day window ending at `date` (col_1) and the preceding
    window (col_2), splits col_2 into lost (not in col_1) and retained (also
    in col_1) members, then scans history backwards (up to 2000 days, stopping
    at 2010-01-01) counting on how many days each member appeared.

    Returns (lost_avg, retained_avg, lost_counts_dict, retained_counts_dict);
    (0, 0, {}, {}) when `date` is empty or the current window is incomplete.
    """
    if date_unit < 1:
        date_unit = 1

    lost_col_average_life_cycle = 0
    retained_col_average_life_cycle = 0
    if not date:
        return lost_col_average_life_cycle, retained_col_average_life_cycle, {}, {}

    # collection ids are stored under the '<key>_collection_id' key
    key = key.replace('_collection_id', '') + '_collection_id'
    sql = _get_sql_select_collection_id_by_date(oem_name, category, key, sub_key, table_name)
    collection_id_dict = helper_mysql.fetch_dict(sql)

    # Current window; force the null result when any day's data is missing.
    col_1 = set([])
    for offset in range(0, date_unit):
        date_temp = helper_regex.date_add(date, -offset)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_1 |= helper_mysql.get_raw_collection_by_id(col_id_temp)
        if col_id_temp == 0:
            return lost_col_average_life_cycle, retained_col_average_life_cycle, {}, {}

    # Preceding window; missing days are tolerated here.
    col_2 = set([])
    for offset in range(date_unit, date_unit + date_unit):
        date_temp = helper_regex.date_add(date, -offset)
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_2 |= helper_mysql.get_raw_collection_by_id(col_id_temp)

    lost_col = col_2 - col_1
    retained_col = col_2 & col_1
    lost_col_dict = dict([(member, 0) for member in lost_col])
    retained_col_dict = dict([(member, 0) for member in retained_col])

    # History scan.  FIX: the original reused `i` for both the day loop and
    # the member loop, shadowing the outer loop variable; distinct names now.
    for day_back in range(0, 2000):
        date_temp = helper_regex.date_add(date, -day_back)
        if date_temp == '2010-01-01':
            break
        col_id_temp = collection_id_dict.get(date_temp, 0)
        col_temp = helper_mysql.get_raw_collection_by_id(col_id_temp)
        for member in col_temp:
            if member in lost_col_dict:
                lost_col_dict[member] += 1
            if member in retained_col_dict:
                retained_col_dict[member] += 1

    if len(lost_col) > 0:
        lost_col_average_life_cycle = sum(lost_col_dict.values()) * 1.0 / len(lost_col)
    if len(retained_col) > 0:
        retained_col_average_life_cycle = sum(retained_col_dict.values()) * 1.0 / len(retained_col)

    return lost_col_average_life_cycle, retained_col_average_life_cycle, lost_col_dict, retained_col_dict
where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s' and `date`!='' ''' % (db_name,oem_name,category,key,sub_key) base_collection_ids = helper_mysql.fetch_dict(sql) #target collection target_user_set=set([]) for date_str,collection_id in target_collection_ids.iteritems(): collection_temp=helper_mysql.get_raw_collection_by_id(collection_id) target_user_set|=collection_temp #start / end date start_date={} end_date={} date_order=base_collection_ids.keys()[:] date_order.sort() for date_str in date_order: collection_id=base_collection_ids[date_str] collection_temp=helper_mysql.get_raw_collection_by_id(collection_id)
def export(target_key): lacked_dates=[] exported_dates=[] dir_name='.\\export_collection\\'+'_'.join(target_key).strip('_') collection_ids=helper_mysql.fetch_dict(sql=r''' select `date`,`value` from `%s` where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s' order by date desc limit 200 ''' % (target_key[0],target_key[1],target_key[2],target_key[3],target_key[4],)) helper_file.prepare_directory_on_windows(dir_name) print 'collection_ids:',len(collection_ids) # load user_id->msisdn mapping user_id_to_msisdn={} file_handler=open('E:\\WebStatShare\\vodafone_user_id_to_msisdn.csv',"r") for line in file_handler: line=line.strip(os.linesep).strip() #print line if not line: continue c=line.find(',') if c==-1: continue msisdn=line[c+1:].strip('X').strip() if msisdn.isdigit(): user_id_to_msisdn[int(line[0:c])]=int(msisdn) file_handler.close() print 'user_id_to_msisdn:',len(user_id_to_msisdn) for date,collection_id in collection_ids.iteritems(): collection=helper_mysql.get_raw_collection_by_id(collection_id) if not collection: lacked_dates.append(date) continue msisdn_set=set([]) for i in collection: if not i.isdigit(): continue user_id=int(i) if user_id_to_msisdn.has_key(user_id): msisdn_set.add(user_id_to_msisdn[user_id]) #print user_id,user_id_to_msisdn[user_id] #continue #print user_id helper_file.write_big_string_set_to_file(dir_name+'\\'+date+'.txt',msisdn_set) exported_dates.append((date,len(collection),len(msisdn_set))) print 'lacked_dates:',lacked_dates print 'exported_dates',exported_dates
""" % ( table, oem_name, category, key, sub_key, date_start, date_end, ) collection_ids = helper_mysql.fetch_set(sql) # print collection_ids result_set = set([]) for id in collection_ids: temp_set = helper_mysql.get_raw_collection_by_id(int(id)) result_set |= temp_set content = "\n".join(sorted([str(i) for i in result_set])) sys.stdout = temp_stdout print "Content-type: application/x-unknown" print "Content-Disposition: attachment; filename=data.txt" print "Content-Length: " + str(len(content)) print "" print content pass except: content = r""" This download looks out of date. PLease refresh the statistics page and try download again.