Esempio n. 1
0
def calculate_retain_rate_of_2_collections(collection_id_current,collection_id_base):
        
    base_size,retain_rate,fresh_rate,lost_rate=0,0,1,1
    retained_base_size,lost_base_size,fresh_base_size=0,0,0
    
    col_1=helper_mysql.get_raw_collection_by_id(collection_id_current)
    col_2=helper_mysql.get_raw_collection_by_id(collection_id_base)

    base_size=len(col_1)

    retain=col_1 & col_2
    fresh=col_1 - col_2
    lost=col_2 - col_1
    
    retained_base_size,lost_base_size,fresh_base_size=len(retain),len(lost),len(fresh)
    
    if len(col_2)>0:
        retain_rate=1.0*len(retain)/len(col_2)
        
    if len(col_1)>0 and len(col_2)>0:
        fresh_rate=1.0*len(fresh)/len(col_1)
        
    if len(col_2)>0:
        lost_rate=1.0*len(lost)/len(col_2)
        
    return base_size,retain_rate,fresh_rate,lost_rate,retained_base_size,lost_base_size,fresh_base_size
def process(my_date):

    global whole_collection, element_existance_counters, sql_collection_id

    current_date=datetime.fromtimestamp(my_date).strftime('%Y-%m-%d')
    
    collection_ids = helper_mysql.fetch_dict(sql_collection_id % (current_date,))

    for app,collection_id in collection_ids.iteritems():

        collection=helper_mysql.get_raw_collection_by_id(collection_id)

        if not collection_id or not collection:
            print 'No collection of',current_date,collection_id
            continue
        
        if not whole_collection.has_key(app):
            whole_collection[app]=set([])

        whole_collection[app]|=collection

        for i in collection:
            if not element_existance_counters.has_key(i):
                element_existance_counters[i]={}

            if not element_existance_counters[i].has_key(app):
                element_existance_counters[i][app]=0

            element_existance_counters[i][app]+=1
def process(my_date):

    global whole_collection, element_existance_counters, sql_collection_id

    current_date=datetime.fromtimestamp(my_date).strftime('%Y-%m-%d')
    
    collection_id = helper_mysql.get_one_value_int(sql_collection_id % (current_date,))

    collection=helper_mysql.get_raw_collection_by_id(collection_id)

    if not collection_id or not collection:
        print 'No collection of',current_date,collection_id
        return
    
    whole_collection|=collection

    for i in collection:
        if not element_existance_counters.has_key(i):
            element_existance_counters[i]=0
        element_existance_counters[i]+=1
def process(client_type):

    global whole_collection, element_existance_counters, sql_collection_id, client_dict

    current_collection={}

    type_number=client_number_dict[client_type]
  
    collection_ids = helper_mysql.fetch_dict(sql_collection_id % (client_type,))

    print collection_ids

    for date,collection_id in collection_ids.iteritems():

        collection=helper_mysql.get_raw_collection_by_id(collection_id)

        if not collection_id or not collection:
            print 'No collection of',collection_id
            continue
        
        whole_collection|=collection

        for i in collection:
            current_collection[i]=type_number

    print len(current_collection)

    field_name='client-type'

    sql=r'''
    
    update mozat_clustering.user_figure_base
    set `%s`='%s'
    where `oem_id`=7
    and `user_id`='%s'
    limit 1

    '''

    for user_id, t in current_collection.iteritems():
        helper_mysql.execute(sql % (field_name,t,user_id))
Esempio n. 5
0
def calculate_date_range_retain_rate(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data'):

    if date_unit<1:
        date_unit=1        
        
    base_size,retain_rate,fresh_rate,lost_rate=0,0,1,1
    retained_base_size,lost_base_size,fresh_base_size=0,0,0
    
    if not date:
        return base_size,retain_rate,fresh_rate,lost_rate,retained_base_size,lost_base_size,fresh_base_size
    
    #sql=r"select `date`,`value` from `%s` where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s'" \
    # %(table_name,oem_name,category,key,sub_key)

    key=key.replace('_collection_id','')+'_collection_id'
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name)

    collection_id_dict=helper_mysql.fetch_dict(sql)
    
    key_temp=collection_id_dict.keys()
    key_temp.sort(reverse=True)

    """
    print 'existing collection list:'    
    for i in key_temp[0:65]:
        print i+': '+str(collection_id_dict[i])
    """

    col_1=set([])
    for i in range(0,date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_1 |= col_temp

        if col_id_temp==0: #force return null when data not complete
            return base_size,retain_rate,fresh_rate,lost_rate,retained_base_size,lost_base_size,fresh_base_size

    base_size=len(col_1)
    
    col_2=set([])
    for i in range(0+date_unit,date_unit+date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_2 |= col_temp

    retain=col_1 & col_2
    fresh=col_1 - col_2
    lost=col_2 - col_1
    
    """
    print str(col_1)
    print str(col_2)
    print str(retain)
    print str(fresh)
    print str(lost)
    """

    retained_base_size,lost_base_size,fresh_base_size=len(retain),len(lost),len(fresh)
    
    if len(col_2)>0:
        retain_rate=1.0*len(retain)/len(col_2)
        
    if len(col_1)>0 and len(col_2)>0:
        fresh_rate=1.0*len(fresh)/len(col_1)
        
    if len(col_2)>0:
        lost_rate=1.0*len(lost)/len(col_2)
        
    return base_size,retain_rate,fresh_rate,lost_rate,retained_base_size,lost_base_size,fresh_base_size
Esempio n. 6
0
def calculate_count_distinct(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data',allow_collection_empty=False):
    
    #date_unit accepts 1,2,3,...,'weekly','monthly'
    #for weekly, it produces result only when date is Sunday, else 0
    #for monthly, it produces result only when date is the last day of a week, else 0
    #for all cases, it doesn't produce value when required collections are not all ready

    unique=0
    total=0
    average=0

    if not date:
        return unique,total,average

    if date_unit=='weekly':
        if helper_regex.get_weekday_from_date_str(date)!=7:
            return unique,total,average
        date_unit=7

    elif date_unit=='monthly':
        if helper_regex.extract(helper_regex.date_add(date,1),r'\d+\-\d+\-(\d+)')!='01':
            return unique,total,average

        first_date=helper_regex.extract(date,r'(\d+\-\d+\-)\d+')+'01'
        date_unit=helper_regex.get_day_diff_from_date_str(date,first_date)+1

    if date_unit<1:
        date_unit=1        

    key=key.replace('_collection_id','')
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_collection_id',sub_key,table_name)

    collection_id_dict=helper_mysql.fetch_dict(sql)

    key_temp=collection_id_dict.keys()
    key_temp.sort(reverse=True)


    sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_base',sub_key,table_name)
    
    #print sql
    collection_base_dict=helper_mysql.fetch_dict(sql)

    #print collection_base_dict
    #exit()
    
    """
    print 'existing collection list:'    
    for i in key_temp[0:65]:
        print i+': '+str(collection_id_dict[i])
    """

    col_1=set([])
    base_total=0
    for i in range(0,date_unit):
        date_temp=helper_regex.date_add(date,-i)

        col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_1 |= col_temp

        base_total+=int(collection_base_dict[date_temp]) if collection_base_dict.has_key(date_temp) else 0
        
        if col_id_temp==0: #force return null when data not complete
            if allow_collection_empty:
                print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! passed.'
            else:
                print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! exit.'
                return unique,total,average

    unique=len(col_1)
    total=base_total
    average=base_total*1.0/unique if unique>0 else 0

    return unique,total,average
Esempio n. 7
0
def calculate_date_range_average_life_cycle(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data'):
    if date_unit<1:
        date_unit=1        
        
    #base_size,retain_rate,fresh_rate,lost_rate=0,0,1,1
    lost_col_average_life_cycle=0
    retained_col_average_life_cycle=0
    
    if not date:
        return lost_col_average_life_cycle,retained_col_average_life_cycle,{},{}
        #return base_size,retain_rate,fresh_rate,lost_rate
    
    #sql=r"select `date`,`value` from `%s` where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s'" \
    # %(table_name,oem_name,category,key,sub_key)
    
    key=key.replace('_collection_id','')+'_collection_id'
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name)

    collection_id_dict=helper_mysql.fetch_dict(sql)
    
    key_temp=collection_id_dict.keys()
    key_temp.sort(reverse=True)

    """
    print 'existing collection list:'    
    for i in key_temp[0:65]:
        print i+': '+str(collection_id_dict[i])
    """

    col_1=set([])
    for i in range(0,date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_1 |= col_temp
        
        if col_id_temp==0: #force return null when data not complete
            return lost_col_average_life_cycle,retained_col_average_life_cycle,{},{}
    
    base_size=len(col_1)
    
    col_2=set([])
    for i in range(0+date_unit,date_unit+date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_2 |= col_temp

    
    lost_col=col_2 - col_1
    retained_col=col_2 & col_1

    lost_col_len=len(lost_col)
    retained_col_len=len(retained_col)

    lost_col_dict=dict([(k, 0) for k in lost_col])
    retained_col_dict=dict([(k, 0) for k in retained_col])

    for i in range(0,2000):
        date_temp=helper_regex.date_add(date,-i)
        
        if date_temp=='2010-01-01':
            break

        col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)

        for i in col_temp:
            if lost_col_dict.has_key(i):
                lost_col_dict[i]+=1
            if retained_col_dict.has_key(i):
                retained_col_dict[i]+=1

    if lost_col_len>0:
        lost_col_average_life_cycle=sum(lost_col_dict.values())*1.0/lost_col_len

    if retained_col_len>0:
        retained_col_average_life_cycle=sum(retained_col_dict.values())*1.0/retained_col_len

    return lost_col_average_life_cycle,retained_col_average_life_cycle,lost_col_dict,retained_col_dict
Esempio n. 8
0
    where `oem_name`='%s'
    and `category`='%s'
    and `key`='%s'
    and `sub_key`='%s'
    and `date`!=''
''' % (db_name,oem_name,category,key,sub_key)

base_collection_ids = helper_mysql.fetch_dict(sql)


#target collection

target_user_set=set([])

for date_str,collection_id in target_collection_ids.iteritems():
    collection_temp=helper_mysql.get_raw_collection_by_id(collection_id)
    target_user_set|=collection_temp


#start / end date

start_date={}
end_date={}

date_order=base_collection_ids.keys()[:]
date_order.sort()


for date_str in date_order:
    collection_id=base_collection_ids[date_str]
    collection_temp=helper_mysql.get_raw_collection_by_id(collection_id)
Esempio n. 9
0
def export(target_key):
    
    lacked_dates=[]
    exported_dates=[]

    dir_name='.\\export_collection\\'+'_'.join(target_key).strip('_')

    collection_ids=helper_mysql.fetch_dict(sql=r'''
    
    select `date`,`value`
    from `%s`
    where 
    `oem_name`='%s'
    and `category`='%s'
    and `key`='%s'
    and `sub_key`='%s'
    order by date desc
    limit 200
    
    ''' % (target_key[0],target_key[1],target_key[2],target_key[3],target_key[4],))

    helper_file.prepare_directory_on_windows(dir_name)

    print 'collection_ids:',len(collection_ids) 


    # load user_id->msisdn mapping


    user_id_to_msisdn={}
    file_handler=open('E:\\WebStatShare\\vodafone_user_id_to_msisdn.csv',"r")
    for line in file_handler:
        line=line.strip(os.linesep).strip()
        #print line
        if not line:
            continue
        
        c=line.find(',')
        if c==-1:
            continue
        msisdn=line[c+1:].strip('X').strip()
        if msisdn.isdigit():
            user_id_to_msisdn[int(line[0:c])]=int(msisdn)

    file_handler.close()

    print 'user_id_to_msisdn:',len(user_id_to_msisdn)



    for date,collection_id in collection_ids.iteritems():
        
        collection=helper_mysql.get_raw_collection_by_id(collection_id)
        if not collection:
            lacked_dates.append(date)
            continue
        
        msisdn_set=set([])

        for i in collection:
            if not i.isdigit():
                continue
            user_id=int(i)
            
            if user_id_to_msisdn.has_key(user_id):
                msisdn_set.add(user_id_to_msisdn[user_id])
                #print user_id,user_id_to_msisdn[user_id]
                #continue
            #print user_id
        
        helper_file.write_big_string_set_to_file(dir_name+'\\'+date+'.txt',msisdn_set)

        exported_dates.append((date,len(collection),len(msisdn_set)))
        
    
    print 'lacked_dates:',lacked_dates
    print 'exported_dates',exported_dates
Esempio n. 10
0
        """ % (
            table,
            oem_name,
            category,
            key,
            sub_key,
            date_start,
            date_end,
        )

        collection_ids = helper_mysql.fetch_set(sql)

        # print collection_ids
        result_set = set([])
        for id in collection_ids:
            temp_set = helper_mysql.get_raw_collection_by_id(int(id))
            result_set |= temp_set

        content = "\n".join(sorted([str(i) for i in result_set]))

    sys.stdout = temp_stdout
    print "Content-type: application/x-unknown"
    print "Content-Disposition: attachment; filename=data.txt"
    print "Content-Length: " + str(len(content))
    print ""
    print content

    pass
except:
    content = r"""
    This download looks out of date. PLease refresh the statistics page and try download again.