from datetime import datetime

def process(my_date):

    global whole_collection, element_existance_counters, sql_collection_id

    current_date=datetime.fromtimestamp(my_date).strftime('%Y-%m-%d')
    
    collection_ids = helper_mysql.fetch_dict(sql_collection_id % (current_date,))

    for app,collection_id in collection_ids.iteritems():

        collection=helper_mysql.get_raw_collection_by_id(collection_id)

        if not collection_id or not collection:
            print 'No collection of',current_date,collection_id
            continue
        
        if app not in whole_collection:
            whole_collection[app]=set([])

        whole_collection[app]|=collection

        for i in collection:
            if i not in element_existance_counters:
                element_existance_counters[i]={}

            if app not in element_existance_counters[i]:
                element_existance_counters[i][app]=0

            element_existance_counters[i][app]+=1
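
#standalone sketch of the accumulation above, run on in-memory sets instead
#of helper_mysql collections; the app names and members are illustrative
#assumptions, not data from this project
demo_whole={}
demo_counters={}
for demo_app,demo_col in {'app_a':set(['u1','u2']),'app_b':set(['u2'])}.iteritems():
    demo_whole.setdefault(demo_app,set([]))
    demo_whole[demo_app]|=demo_col
    for demo_i in demo_col:
        demo_counters.setdefault(demo_i,{}).setdefault(demo_app,0)
        demo_counters[demo_i][demo_app]+=1

print demo_counters #u1 counted once under app_a, u2 once under each app
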
def process(client_type):

    global whole_collection, element_existance_counters, sql_collection_id, client_number_dict

    current_collection={}

    type_number=client_number_dict[client_type]
  
    collection_ids = helper_mysql.fetch_dict(sql_collection_id % (client_type,))

    print collection_ids

    for date,collection_id in collection_ids.iteritems():

        collection=helper_mysql.get_raw_collection_by_id(collection_id)

        if not collection_id or not collection:
            print 'No collection of',collection_id
            continue
        
        whole_collection|=collection

        for i in collection:
            current_collection[i]=type_number

    print len(current_collection)

    field_name='client-type'

    sql=r'''
    
    update mozat_clustering.user_figure_base
    set `%s`='%s'
    where `oem_id`=7
    and `user_id`='%s'
    limit 1

    '''

    for user_id, t in current_collection.iteritems():
        helper_mysql.execute(sql % (field_name,t,user_id))
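
#standalone sketch of how the update statement above is assembled; the
#field value and user id are illustrative only
demo_sql="update mozat_clustering.user_figure_base set `%s`='%s' where `oem_id`=7 and `user_id`='%s' limit 1"
print demo_sql % ('client-type',2,'1001')
#prints: update mozat_clustering.user_figure_base set `client-type`='2' where `oem_id`=7 and `user_id`='1001' limit 1
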
def get_cached_sql_result_as_dict(sql,db_conn):
    if db_conn['db_type'] not in ('sql_server','mysql'):
        raise ValueError('db_type error')
    unique_key=helper_math.md5(','.join(sorted(db_conn.values()))+sql)

    cached_result=helper_mysql.get_dict_of_raw_collection_from_key(oem_name='',category='', key='',\
                                sub_key=unique_key,date='',table_name='raw_data_cache_sql_result',db_conn=None)

    if cached_result:
        print 'get from cache:',unique_key
        return cached_result
    
    if db_conn['db_type']=='sql_server':
        cached_result=helper_sql_server.fetch_dict(conn_config=db_conn,sql=sql)
    else:
        cached_result=helper_mysql.fetch_dict(db_conn=db_conn,sql=sql)

    helper_mysql.put_collection_with_value(collection=cached_result,oem_name='',category='',key='',sub_key=unique_key, \
                                table_name='raw_data_cache_sql_result',date='',created_on=None,db_conn=None)
    
    print 'push into cache:',unique_key
    return cached_result
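
#standalone sketch of the cache-key derivation above, assuming helper_math.md5
#hex-digests a string the way hashlib's md5 does (that equivalence is an
#assumption; the connection values are illustrative)
import hashlib

demo_conn={'db_type':'mysql','host':'127.0.0.1','db':'stats'}
demo_sql="select `date`,`value` from raw_data limit 10"
print hashlib.md5(','.join(sorted(demo_conn.values()))+demo_sql).hexdigest()
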
def process():

    global sql_collection_id

    info_dict=helper_mysql.fetch_dict(sql_collection_id)
    
    print len(info_dict)

    sql=r'''
    
    update mozat_clustering.user_figure_base
    set `%s`='%s'
    where `oem_id`=7
    and `user_id`='%s'
    limit 1

    '''

    field_name='client-type'

    for user_id,value in info_dict.iteritems():
        helper_mysql.execute(sql % (field_name,value,user_id))
Example #5
         % (db_name,oem_name,category,key,sub_key,value,date))

print 'updated value:'+str(helper_mysql.get_one_value_int(r'''
select `value` from `%s` where oem_name='%s' and category='%s' and `key`='%s' and `date`='%s'
''' % (db_name,oem_name,category,key,date)))




exit()

for i in range(1,100,1):
    sql=r'''
    
    select `sub_key`,`value`
    from raw_data_url_pattern
    where `oem_name`='STC' and `category`='moagent' and `key`='app_page_by_url_pattern_daily_visitor_unique'
    and `date`="%s"
    and `sub_key` like '%%jit%%'
    order by date desc;
        
    ''' % (helper_regex.date_add('2011-06-01',i),)

    #print sql
    print helper_regex.date_add('2011-06-01',i)
    result=helper_mysql.fetch_dict(sql)
    for k,v in result.iteritems():
        print k,v
        

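#note on the %%jit%% pattern above: %-interpolation collapses %% to a
#literal %, so the executed condition is `sub_key` like '%jit%'
print "`sub_key` like '%%jit%%'" % () #prints: `sub_key` like '%jit%'
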
exit()
Example #6
def calculate_date_range_retain_rate(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data'):

    if date_unit<1:
        date_unit=1        
        
    base_size,retain_rate,fresh_rate,lost_rate=0,0,1,1
    retained_base_size,lost_base_size,fresh_base_size=0,0,0
    
    if not date:
        return base_size,retain_rate,fresh_rate,lost_rate,retained_base_size,lost_base_size,fresh_base_size
    
    #sql=r"select `date`,`value` from `%s` where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s'" \
    # %(table_name,oem_name,category,key,sub_key)

    key=key.replace('_collection_id','')+'_collection_id' #normalize to exactly one _collection_id suffix
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name)

    collection_id_dict=helper_mysql.fetch_dict(sql)
    
    key_temp=collection_id_dict.keys()
    key_temp.sort(reverse=True)

    """
    print 'existing collection list:'    
    for i in key_temp[0:65]:
        print i+': '+str(collection_id_dict[i])
    """

    col_1=set([])
    for i in range(0,date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict.get(date_temp,0)
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_1 |= col_temp

        if col_id_temp==0: #force return null when data not complete
            return base_size,retain_rate,fresh_rate,lost_rate,retained_base_size,lost_base_size,fresh_base_size

    base_size=len(col_1)
    
    col_2=set([])
    for i in range(0+date_unit,date_unit+date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict.get(date_temp,0)
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_2 |= col_temp

    retain=col_1 & col_2
    fresh=col_1 - col_2
    lost=col_2 - col_1
    
    """
    print str(col_1)
    print str(col_2)
    print str(retain)
    print str(fresh)
    print str(lost)
    """

    retained_base_size,lost_base_size,fresh_base_size=len(retain),len(lost),len(fresh)
    
    if len(col_2)>0:
        retain_rate=1.0*len(retain)/len(col_2)
        
    if len(col_1)>0 and len(col_2)>0:
        fresh_rate=1.0*len(fresh)/len(col_1)
        
    if len(col_2)>0:
        lost_rate=1.0*len(lost)/len(col_2)
        
    return base_size,retain_rate,fresh_rate,lost_rate,retained_base_size,lost_base_size,fresh_base_size
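
#toy illustration of the set arithmetic above, with col_1 as the current
#window and col_2 as the previous one (no database needed)
demo_col_1=set(['a','b','c'])
demo_col_2=set(['b','c','d','e'])

demo_retain=demo_col_1 & demo_col_2 #present in both windows
demo_fresh=demo_col_1 - demo_col_2 #new in the current window
demo_lost=demo_col_2 - demo_col_1 #dropped since the previous window

print 1.0*len(demo_retain)/len(demo_col_2) #retain_rate: 0.5
print 1.0*len(demo_fresh)/len(demo_col_1) #fresh_rate: 0.333...
print 1.0*len(demo_lost)/len(demo_col_2) #lost_rate: 0.5
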
Example #7
def calculate_count_distinct_named_collection(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data',allow_collection_empty=False):
    
    #date_unit accepts 1,2,3,...,'weekly','monthly'
    #for weekly, it produces result only when date is Sunday, else 0
    #for monthly, it produces result only when date is the last day of a month, else 0
    #for all cases, it doesn't produce value when required collections are not all ready

    unique=0
    total=0
    average=0

    if not date:
        return unique,total,average

    if date_unit=='weekly':
        if helper_regex.get_weekday_from_date_str(date)!=7:
            return unique,total,average
        date_unit=7

    elif date_unit=='monthly':
        if helper_regex.extract(helper_regex.date_add(date,1),r'\d+\-\d+\-(\d+)')!='01':
            return unique,total,average

        first_date=helper_regex.extract(date,r'(\d+\-\d+\-)\d+')+'01'
        date_unit=helper_regex.get_day_diff_from_date_str(date,first_date)+1

    if date_unit<1:
        date_unit=1        

    key=key.replace('_collection_id','') #drop the _collection_id suffix if present
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name)

    collection_id_dict=helper_mysql.fetch_dict(sql)

    key_temp=collection_id_dict.keys()
    key_temp.sort(reverse=True)


    sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_base',sub_key,table_name)
    
    #print sql
    collection_base_dict=helper_mysql.fetch_dict(sql)
    #print collection_base_dict

    """
    print 'existing collection list:'    
    for i in key_temp[0:65]:
        print i+': '+str(collection_id_dict[i])
    """

    col_1=set([])
    base_total=0
    for i in range(0,date_unit):
        date_temp=helper_regex.date_add(date,-i)

        col_id_temp=collection_id_dict.get(date_temp,0)
        #col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)

        col_temp=helper_collection.get_named_collection(table_name=table_name,oem_name=oem_name,category=category, \
                                                        key=key,sub_key=sub_key,date=date_temp)
        col_1 |= col_temp

        base_total+=int(collection_base_dict.get(date_temp,0))
        
        if col_id_temp==0: #force return null when data not complete
            if allow_collection_empty:
                print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! passed.'
            else:
                print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! exit.'
                return unique,total,average

    unique=len(col_1)
    total=base_total
    average=base_total*1.0/unique if unique>0 else 0

    return unique,total,average
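
#toy illustration of the unique/total/average computation above: union the
#daily member sets, sum the daily '_base' counters, then average per member
#(all numbers are made up)
demo_daily=[set(['u1','u2']),set(['u2','u3']),set(['u1','u3'])]
demo_base=[5,4,6]

demo_union=set([])
for demo_day in demo_daily:
    demo_union|=demo_day

demo_unique=len(demo_union)
demo_total=sum(demo_base)
print demo_unique,demo_total,demo_total*1.0/demo_unique #3 15 5.0
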
Example #8
def calculate_date_range_average_life_cycle(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data'):
    if date_unit<1:
        date_unit=1        
        
    #base_size,retain_rate,fresh_rate,lost_rate=0,0,1,1
    lost_col_average_life_cycle=0
    retained_col_average_life_cycle=0
    
    if not date:
        return lost_col_average_life_cycle,retained_col_average_life_cycle,{},{}
        #return base_size,retain_rate,fresh_rate,lost_rate
    
    #sql=r"select `date`,`value` from `%s` where `oem_name`='%s' and `category`='%s' and `key`='%s' and `sub_key`='%s'" \
    # %(table_name,oem_name,category,key,sub_key)
    
    key=key.replace('_collection_id','')+'_collection_id' #normalize to exactly one _collection_id suffix
    sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name)

    collection_id_dict=helper_mysql.fetch_dict(sql)
    
    key_temp=collection_id_dict.keys()
    key_temp.sort(reverse=True)

    """
    print 'existing collection list:'    
    for i in key_temp[0:65]:
        print i+': '+str(collection_id_dict[i])
    """

    col_1=set([])
    for i in range(0,date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict.get(date_temp,0)
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_1 |= col_temp
        
        if col_id_temp==0: #force return null when data not complete
            return lost_col_average_life_cycle,retained_col_average_life_cycle,{},{}
    
    base_size=len(col_1)
    
    col_2=set([])
    for i in range(0+date_unit,date_unit+date_unit):
        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict.get(date_temp,0)
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_2 |= col_temp

    
    lost_col=col_2 - col_1
    retained_col=col_2 & col_1

    lost_col_len=len(lost_col)
    retained_col_len=len(retained_col)

    lost_col_dict=dict.fromkeys(lost_col,0)
    retained_col_dict=dict.fromkeys(retained_col,0)

    for i in range(0,2000): #walk back day by day through the history
        date_temp=helper_regex.date_add(date,-i)

        if date_temp=='2010-01-01': #hard cutoff: scan no earlier than this
            break

        col_id_temp=collection_id_dict.get(date_temp,0)
        col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)

        for member in col_temp:
            if member in lost_col_dict:
                lost_col_dict[member]+=1
            if member in retained_col_dict:
                retained_col_dict[member]+=1

    if lost_col_len>0:
        lost_col_average_life_cycle=sum(lost_col_dict.values())*1.0/lost_col_len

    if retained_col_len>0:
        retained_col_average_life_cycle=sum(retained_col_dict.values())*1.0/retained_col_len

    return lost_col_average_life_cycle,retained_col_average_life_cycle,lost_col_dict,retained_col_dict
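
#toy illustration of the life-cycle counting loop above: each member's value
#ends up as the number of historical days it appeared in (data is made up)
demo_history=[set(['a','b']),set(['a']),set(['a','c'])]
demo_lost=set(['a','c'])

demo_life=dict.fromkeys(demo_lost,0)
for demo_day in demo_history:
    for demo_member in demo_day:
        if demo_member in demo_life:
            demo_life[demo_member]+=1

print sum(demo_life.values())*1.0/len(demo_life) #(3+1)/2 = 2.0 days on average
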

#fetch collection id

sql=r'''
    select `date`,`value`
    from `%s`
    where `oem_name`='%s'
    and `category`='%s'
    and `key`='%s'
    and `sub_key`='%s'
    and `date`>='%s'
    and `date`<='%s'
''' % (db_name,oem_name,category,key,sub_key,date_begin,date_end)

target_collection_ids = helper_mysql.fetch_dict(sql)

sql=r'''
    select `date`,`value`
    from `%s`
    where `oem_name`='%s'
    and `category`='%s'
    and `key`='%s'
    and `sub_key`='%s'
    and `date`!=''
''' % (db_name,oem_name,category,key,sub_key)

base_collection_ids = helper_mysql.fetch_dict(sql)


#target collection
Example #10
import os

def export(target_key):
    
    lacked_dates=[]
    exported_dates=[]

    dir_name='.\\export_collection\\'+'_'.join(target_key).strip('_')

    collection_ids=helper_mysql.fetch_dict(sql=r'''
    
    select `date`,`value`
    from `%s`
    where 
    `oem_name`='%s'
    and `category`='%s'
    and `key`='%s'
    and `sub_key`='%s'
    order by date desc
    limit 200
    
    ''' % (target_key[0],target_key[1],target_key[2],target_key[3],target_key[4],))

    helper_file.prepare_directory_on_windows(dir_name)

    print 'collection_ids:',len(collection_ids) 


    # load user_id->msisdn mapping


    user_id_to_msisdn={}
    file_handler=open('E:\\WebStatShare\\vodafone_user_id_to_msisdn.csv',"r")
    for line in file_handler:
        line=line.strip()
        #print line
        if not line:
            continue
        
        c=line.find(',')
        if c==-1:
            continue
        msisdn=line[c+1:].strip('X').strip()
        if msisdn.isdigit():
            user_id_to_msisdn[int(line[0:c])]=int(msisdn)

    file_handler.close()

    print 'user_id_to_msisdn:',len(user_id_to_msisdn)



    for date,collection_id in collection_ids.iteritems():
        
        collection=helper_mysql.get_raw_collection_by_id(collection_id)
        if not collection:
            lacked_dates.append(date)
            continue
        
        msisdn_set=set([])

        for i in collection:
            if not i.isdigit():
                continue
            user_id=int(i)
            
            if user_id in user_id_to_msisdn:
                msisdn_set.add(user_id_to_msisdn[user_id])
                #print user_id,user_id_to_msisdn[user_id]
                #continue
            #print user_id
        
        helper_file.write_big_string_set_to_file(dir_name+'\\'+date+'.txt',msisdn_set)

        exported_dates.append((date,len(collection),len(msisdn_set)))
        
    
    print 'lacked_dates:',lacked_dates
    print 'exported_dates:',exported_dates
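
#standalone sketch of the csv parsing above: split on the first comma,
#strip the X padding from the msisdn, keep numeric values only (the sample
#lines are made up)
demo_map={}
for demo_line in ['12345,97155501234XX','9999,not_a_number']:
    demo_c=demo_line.find(',')
    demo_msisdn=demo_line[demo_c+1:].strip('X').strip()
    if demo_msisdn.isdigit():
        demo_map[int(demo_line[0:demo_c])]=int(demo_msisdn)

print demo_map #{12345: 97155501234}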