def check_stat_plan_log_file_reading_completion(stat_plan):
    """
    Reference: how the raw_data_monitor rows are written while log files are read:

    self.log_table_name='raw_data_monitor'
    self.log_oem_name=self.script_file_name
    self.log_category=helper_ip.get_current_server_ip()+'_'+self.start_time_str.replace(' ','_').replace(':','_').replace('-','_')+'_'+self.uuid

    helper_mysql.put_raw_data(oem_name=self.log_oem_name, \
                              category=self.log_category, \
                              key='original_file_size', \
                              sub_key=log_file_name, \
                              value=helper_file.get_file_size(log_file_name), \
                              table_name=self.log_table_name)
    """

    # 1. check total log file number against the previous day's run
    current_date=helper_regex.extract(stat_plan.log_category,r'_(\d{4}_\d{2}_\d{2})_').replace('_','-')
    previous_date=helper_regex.date_add(current_date,-1)
    previous_date_category_like=helper_regex.extract(stat_plan.log_category.replace(current_date,previous_date),r'([\d\.]+_\d{4}_\d{2}_\d{2})')

    # file count of the current run minus the file count recorded for the previous
    # day's run(s); the inner sub-select returns the matching previous-day categories
    sql=r'''
        select
            (select count(distinct sub_key) from raw_data_monitor
             where oem_name='%s' and category='%s')
            -
            (select count(distinct sub_key) from raw_data_monitor
             where oem_name='%s' and category in (
                select distinct category from raw_data_monitor
                where oem_name='%s' and category like '%s%%'
             ))
    ''' % (stat_plan.log_oem_name,stat_plan.log_category, \
           stat_plan.log_oem_name,stat_plan.log_oem_name,previous_date_category_like)

    print sql

    distance=helper_mysql.get_one_value_string(sql)
    print distance

    return distance
def get_http_response_size(full_file_path):
    try:
        #print len(urllib.urlopen(full_file_path).info())
        #print str(urllib.urlopen(full_file_path).info())
        length=int(helper_regex.extract(str(urllib.urlopen(full_file_path).info()),r'Content\-Length:\s*(\d+)'))
    except:
        length=-1
    return length
def get_sub_dir_list_from_dir(base_path=os.curdir,name_pattern='(.)'):
    base_path=base_path.rstrip('/').rstrip('\\')
    result_sub_dir_list=[]
    dirs=[name for name in os.listdir(base_path) if os.path.isdir(os.path.join(base_path,name))]
    for d in dirs:
        if helper_regex.extract(os.path.join(base_path,d),name_pattern):
            result_sub_dir_list.append(os.path.join(base_path,d))
    return result_sub_dir_list
def get_filtered_file_list_from_dir_tree(base_path=os.curdir,name_pattern='(.)'):
    base_path=base_path.rstrip('/').rstrip('\\')
    result_file_list=[]
    for path, dirs, files in os.walk(os.path.abspath(base_path)):
        #print path, dirs, files
        for file_name in files:
            if helper_regex.extract(path+r'\\'+file_name,name_pattern):
                result_file_list.append(os.path.join(path,file_name))
    return result_file_list
def prepare_directory_level(file_full_name,root_dir=config.conn_stat_portal['collection_root_dir'],step=3):
    file_name=helper_regex.extract(file_full_name,r'^\s*(.*?)(?:\.\w+)$')
    path=root_dir
    if not file_name:
        return path
    directory_levels=[file_name[i:i+step] for i in range(0, len(file_name)-step, step)]
    for level in directory_levels:
        path+='\\'+level
        if not os.path.exists(path):
            os.makedirs(path)
    return path
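# Usage sketch (the file name and 'D:\\collection' root below are hypothetical
# examples, not the configured collection_root_dir): with step=3 the file name,
# minus its extension, is sliced into 3-character directory levels.
#
#   path=prepare_directory_level('20100805123456.log',root_dir='D:\\collection')
#   # creates (if missing) and returns D:\collection\201\008\051\234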
def do_update():
    script_path='E:\\AppServ\\www\\xstat\\htdocs\\subpages\\view.php'
    script_content=helper_file.get_content_from_file(script_path)

    old_token=helper_regex.extract(script_content,r'token=(\w+)')

    url=helper_regex.extract(script_content,r'href="(http://192.168.0.158:81/phpmyadmin-2[^"]+)"').replace("<?=$_PAGE['view_id']?>",'150')
    page_content=helper_file.get_http_content(url)

    new_token=helper_regex.extract(page_content,r'token=(\w+)')
    new_token='da1b5116e305194ca8fd7806df008453'    # the extracted token is overridden by this hard-coded value

    script_content=script_content.replace(old_token,new_token)
    #print page_content

    helper_file.put_content_to_file(script_content,script_path)
    print old_token,'to',new_token

    url=url.replace(old_token,new_token)
    helper_file.get_http_content(url)

    # re-read the script to confirm the token was replaced
    script_path='E:\\AppServ\\www\\xstat\\htdocs\\subpages\\view.php'
    script_content=helper_file.get_content_from_file(script_path)
    current_token=helper_regex.extract(script_content,r'token=(\w+)')
    print current_token
def filter_and_count_distinct(list_obj,pattern='(.*)',ignore_empty=True):
    count=0
    count_distinct=0
    count_distinct_dict={}
    for i in list_obj:
        flag=helper_regex.extract(str(i),pattern)
        if ignore_empty and not flag:
            continue
        if not count_distinct_dict.has_key(flag):
            count_distinct_dict[flag]=0
        count_distinct_dict[flag]+=1
        count+=1
    count_distinct=len(count_distinct_dict)
    avg=0
    if count_distinct>0:
        avg=1.0*count/count_distinct
    return (count,count_distinct,avg,count_distinct_dict)
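# Usage sketch (hypothetical input list): with pattern r'([a-z])' the first
# lowercase letter of each item is the grouping flag; items with an empty
# extraction are skipped because ignore_empty defaults to True.
#
#   count,count_distinct,avg,detail=filter_and_count_distinct(['a1','a2','b1',''],pattern=r'([a-z])')
#   # count=3, count_distinct=2, avg=1.5, detail={'a':2,'b':1}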
def calculate_count_distinct_named_collection(date_unit,oem_name,category,key,sub_key,date,table_name='raw_data',allow_collection_empty=False):

    #date_unit accepts 1,2,3,...,'weekly','monthly'
    #for weekly, it produces result only when date is Sunday, else 0
    #for monthly, it produces result only when date is the last day of a month, else 0
    #for all cases, it doesn't produce value when required collections are not all ready

    unique=0
    total=0
    average=0

    if not date:
        return unique,total,average

    if date_unit=='weekly':
        if helper_regex.get_weekday_from_date_str(date)!=7:
            return unique,total,average
        date_unit=7

    elif date_unit=='monthly':
        if helper_regex.extract(helper_regex.date_add(date,1),r'\d+\-\d+\-(\d+)')!='01':
            return unique,total,average
        first_date=helper_regex.extract(date,r'(\d+\-\d+\-)\d+')+'01'
        date_unit=helper_regex.get_day_diff_from_date_str(date,first_date)+1

    if date_unit<1:
        date_unit=1

    key=key.replace('_collection_id','')

    sql=_get_sql_select_collection_id_by_date(oem_name,category,key,sub_key,table_name)
    collection_id_dict=helper_mysql.fetch_dict(sql)

    key_temp=collection_id_dict.keys()
    key_temp.sort(reverse=True)

    sql=_get_sql_select_collection_id_by_date(oem_name,category,key+'_base',sub_key,table_name)
    #print sql
    collection_base_dict=helper_mysql.fetch_dict(sql)
    #print collection_base_dict

    """
    print 'existing collection list:'
    for i in key_temp[0:65]:
        print i+': '+str(collection_id_dict[i])
    """

    col_1=set([])
    base_total=0

    for i in range(0,date_unit):

        date_temp=helper_regex.date_add(date,-i)
        col_id_temp=collection_id_dict[date_temp] if collection_id_dict.has_key(date_temp) else 0

        #col_temp=helper_mysql.get_raw_collection_by_id(col_id_temp)
        col_temp=helper_collection.get_named_collection(table_name=table_name,oem_name=oem_name,category=category, \
                                                        key=key,sub_key=sub_key,date=date_temp)
        col_1 |= col_temp

        base_total+=int(collection_base_dict[date_temp]) if collection_base_dict.has_key(date_temp) else 0

        if col_id_temp==0:
            #force return null when data not complete
            if allow_collection_empty:
                print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! passed.'
            else:
                print date_temp,table_name,oem_name,category,key,sub_key,date_temp,'collection empty error! exit.'
                return unique,total,average

    unique=len(col_1)
    total=base_total
    average=base_total*1.0/unique if unique>0 else 0

    return unique,total,average
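# Usage sketch (the oem_name/category/key values are hypothetical examples):
# 'weekly' only yields numbers when the date is a Sunday, merging the 7 daily
# collections ending on that date; 'monthly' only on the last day of a month;
# any other date returns (0,0,0).
#
#   unique,total,average=calculate_count_distinct_named_collection(
#       date_unit='weekly',oem_name='STC',category='moagent',
#       key='app_page_daily_visitor_unique_collection_id',sub_key='',
#       date='2010-08-01',table_name='raw_data')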
def process_line(line='',exist='',group_key=''):
    # extract the shared-folder part of a UNC path, e.g. \\10.0.0.1\share\sub\
    shared_folder_dir=helper_regex.extract(line,r'(\\{2,}\d+\.\d+\.\d+\.\d+\\+(\w+\\+)*)')
    print shared_folder_dir
def _is_in_dict_keys(line,field_def='',the_dict={}):
    if callable(field_def):
        return the_dict.has_key(field_def(line))
    else:
        return the_dict.has_key(helper_regex.extract(line,field_def))
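# Usage sketch (hypothetical dict and pattern): field_def may be either a callable
# that maps the line to a key, or a regex pattern handed to helper_regex.extract.
#
#   seen={'42':1}
#   _is_in_dict_keys('user_id=42 action=login',field_def=r'user_id=(\d+)',the_dict=seen)       # True
#   _is_in_dict_keys('user_id=42 action=login',field_def=lambda l:l.split()[0],the_dict=seen)  # False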
def zip_file_to_storage(source_file_smb_path,storage_root):
    target_path=storage_root.rstrip('\\')+'\\'+helper_regex.extract(source_file_smb_path,r'\\(\\(?:[^\\]+\\)*)')
    file_name=helper_regex.extract(source_file_smb_path,r'([^\\]+)$')

    helper_file.prepare_directory_on_windows(target_path)
found_sqls=[]

#re_key_sql=re.compile(r'SQL1:.*?\nSQL2:.*?\n')
re_key_sql=re.compile(r'SQL:.*?\n')

files=glob.glob(filePath)
for f in files:
    file=open(f,'r',1024*1024)
    content=file.read(-1)
    print 'file: '+f+' ('+str(len(content))+')'
    m=re.findall(re_key_sql,content)
    if m:
        for i in m:
            #sql_delete=helper_regex.extract(i,r'SQL1:(.*?)\n').replace('raw_data_test','raw_data_debug')
            sql_insert=helper_regex.extract(i,r'SQL:(.*?)\n') #.replace('raw_data_test','raw_data_debug')
            #found_sqls.append(sql_delete)
            found_sqls.append(sql_insert)
    file.close()

for k in found_sqls:
    print k+';'
def export(date_length=30):

    user_login_history={}
    user_last_login_date={}

    today=helper_regex.date_add(helper_regex.get_date_str_now(),-17)
    start_time=helper_regex.date_add(today,-date_length)+' 05:00:00'
    end_time=helper_regex.date_add(today,-1)+' 05:00:00'

    # user_id -> msisdn
    sql=r'''
    SELECT [user_id],replace([user_name],'@shabik.com','') as msisdn
    FROM [mozone_user].[dbo].[Profile] with(nolock)
    where [creationDate]>='%s' and [creationDate]<'%s'
    and user_name like '%%shabik.com%%'
    ''' % (start_time,end_time)

    user_id_to_msisdn=helper_sql_server.fetch_dict(conn_config=config.conn_stc,sql=sql)

    # new user user_id
    new_user_collection=user_id_to_msisdn.keys()
    new_user_collection=set([str(user_id) for user_id in new_user_collection])

    # subscription status
    sql=r'''
    select distinct '0'+replace(msisdn,'+966','')+'@shabik.com' as [user_name]
    into #tmp
    from db86.shabik_mt.dbo.accounts with(nolock)
    where is_deleted=0

    SELECT [user_id]
    FROM [mozone_user].[dbo].[Profile] with(nolock)
    where [creationDate]>='%s' and [creationDate]<'%s'
    and user_name like '%%shabik.com%%'
    and user_name in (
        select user_name from #tmp
    )

    drop table #tmp
    ''' % (start_time,end_time)

    user_id_in_sub=helper_sql_server.fetch_set(conn_config=config.conn_stc,sql=sql)
    user_id_in_sub=set([str(user_id) for user_id in user_id_in_sub])

    # build the per-user login history ('5' = Shabik 5 day, '6' = Shabik 360 day)
    for i in range(date_length,-17,-1):

        date_temp=helper_regex.date_add(today,-i)

        shabik_5_collection=helper_mysql.get_raw_collection_from_key(oem_name='STC',category='moagent', \
                            key='app_page_only_shabik_5_daily_visitor_unique',sub_key='', \
                            date=date_temp,table_name='raw_data',db_conn=None)
        shabik_5_collection=shabik_5_collection & new_user_collection

        for user_id in shabik_5_collection:
            user_login_history.setdefault(user_id,'')
            user_login_history[user_id]+='5'
            user_last_login_date.setdefault(user_id,'')
            user_last_login_date[user_id]=date_temp

        shabik_360_collection=helper_mysql.get_raw_collection_from_key(oem_name='Shabik_360',category='moagent', \
                              key='app_page_daily_visitor_unique',sub_key='', \
                              date=date_temp,table_name='raw_data_shabik_360',db_conn=None)
        shabik_360_collection=shabik_360_collection & new_user_collection

        for user_id in shabik_360_collection:
            user_login_history.setdefault(user_id,'')
            user_login_history[user_id]+='6'
            user_last_login_date.setdefault(user_id,'')
            user_last_login_date[user_id]=date_temp

    #calculate

    """
    target_groups_names=[
        '1.More than 2 weeks users using Shabik 360 (Totally New User to Shabik) [only using 360]',
        '2.Users who Shifted from Shabik360 to Shabik 5 [for each at least using 3 days, still in sub]',
        '3.Unsubscribed users of Shabik 360 [last using 360 for >=7 days and then unsub]',
        '4.Users who uses Shabik 5 more than 2 weeks [actually is online for >=14 days]',
        '5.Users who shifted from Shabik 5 to Shabik 360 [for each at least using 3 days, still in sub]',
        '6.User base of new user in last 50 days, which is used to generate above lists',
    ]

    target_groups=[
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(6{14,})$')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(6{3,}5{3,}$)')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(6{7,}$)') and user_id in user_id_in_sub],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(5{14,}$)')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(5{3,}6{3,}$)') and user_id in user_id_in_sub],
        [user_id for user_id,sequence in user_login_history.iteritems()],
    ]

    target_groups_names={
        'User only use Shabik 360':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(6+)$')],
        'User only use Shabik 360 [more than 10d]': ,
        'User only use Shabik 5',
        'User only use Shabik 5 [more than 10d]',
        'User use both Shabik 360 / Shabik 5',
        'User used both and choosed Shabik 5 [recently used only Shabik 5 for 5d]',
        'User used both and choosed Shabik 5 [recently used only Shabik 360 for 5d]',
    }

    target_groups=[
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(6{10,})$')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(5+)$')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(5{10,})$')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(56|65)')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(56|65)') and helper_regex.extract(sequence,r'(5{5,})$')],
        [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(56|65)') and helper_regex.extract(sequence,r'(6{5,})$')],
    ]
    """

    threshold_of_settle_down='5'

    target_groups={
        '1.new_user':
            [user_id for user_id,sequence in user_login_history.iteritems()],
        '2.new_user_start_from_5':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(5)')],
        '3.new_user_start_from_360':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(6)')],
        '4.new_user_only_5':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(5+)$')],
        '5.new_user_only_360':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'^(6+)$')],
        '6.new_user_both':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(56|65)')],
        '7.new_user_both_and_finally_5':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(56|65)') and helper_regex.extract(sequence,'(5{'+threshold_of_settle_down+',})$')],
        '8.new_user_both_and_finally_360':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(56|65)') and helper_regex.extract(sequence,'(6{'+threshold_of_settle_down+',})$')],
        '9.new_user_both_and_not_stable':
            [user_id for user_id,sequence in user_login_history.iteritems() if helper_regex.extract(sequence,r'(56|65)') and not helper_regex.extract(sequence,'(5{'+threshold_of_settle_down+',})$') and not helper_regex.extract(sequence,'(6{'+threshold_of_settle_down+',})$')],
    }

    #export
    keys=sorted(target_groups.keys())

    for key in keys:
        user_id_collection=target_groups[key]
        print key
        print 'size:',len(user_id_collection)
        print '[last login date - msisdn - sub status - login history]'
        user_id_collection.sort(key=lambda user_id:user_last_login_date[user_id],reverse=True)
        for user_id in user_id_collection:
            print user_last_login_date[user_id],'\t',user_id_to_msisdn[user_id],'\t', \
                  'sub' if user_id in user_id_in_sub else 'unsub','\t',user_login_history[user_id]

    for key in keys:
        user_id_collection=target_groups[key]
        print '==',key,'=='
        print 'size:',len(user_id_collection)
        print 'unsub:',len([user_id for user_id in user_id_collection if not user_id in user_id_in_sub])
filePath=r'E:\RoutineScripts\log\daily_all_login_service.py.2010-08-05.log'

found_keys={}
re_key_content=re.compile(r'where .*? limit 1')
re_key=re.compile(r'where (.*?`key`=".*?")')

files=glob.glob(filePath)
for f in files:
    file=open(f,'r',1024*1024)
    content=file.read(-1)
    print 'file: '+f+' ('+str(len(content))+')'
    m=re.findall(re_key_content,content)
    if m:
        for i in m:
            k=helper_regex.extract(i,re_key)
            if not found_keys.has_key(k):
                found_keys[k]=0
            found_keys[k]+=1
    file.close()

keys=found_keys.keys()
keys.sort()
for k in keys:
    print k
def get_country_name(line):
    ip=helper_regex.extract(line,r'(\d+\.\d+\.\d+\.\d+)')
    if not ip:
        return 'ip_empty'
    return helper_ip.get_country_code_from_ip(ip)