import codecs
import collections
import datetime

# DBHelper, filter (the project's own filter class, not the builtin), LOG, DB_1
# and the column-name constants (COMPANY_NAME, MEETING_TIME, MEETING_LOCATION,
# ORIGIN_URL, RELEASE_DATE, RECRUIT_TITLE) are assumed to come from the
# project's own modules.


class prechecker:

    def __init__(self):
        self.filter = filter()
        self.db_helper = DBHelper()
        self.cmp_table = 'refined_list_info'
        self.table = 'extracted_info'
        self.cmp_clms = [COMPANY_NAME, MEETING_TIME, MEETING_LOCATION,
                         ORIGIN_URL, RELEASE_DATE, RECRUIT_TITLE]

    def rm_dup_list_info(self, tb_new, db_new, tb_old, db_old):
        """Copy entries from tb_new whose origin URL is not yet in tb_old."""
        new_list = self.db_helper.get_results_by_clms("*", tb_new, db_new, True)
        old_list = self.db_helper.get_results_by_clms("*", tb_old, db_old, True)
        # index the already-stored entries by their origin URL
        old_dict = {}
        for entry in old_list:
            url = entry.get(ORIGIN_URL)
            if url is not None:
                old_dict[url] = None
        # keep only entries whose URL has not been seen before
        updates = []
        for entry in new_list:
            url = entry.get(ORIGIN_URL)
            if url in old_dict:
                continue
            updates.append(entry)
            old_dict[url] = None
        if updates:
            fields = updates[0].keys()
            self.db_helper.batch_insert(updates, "refined_list_info", DB_1, fields)
        return updates

    def repair_data(self, entrys, cmp_entrys=None):
        """Fill missing columns of entrys from cmp_entrys, matched by origin URL."""
        if cmp_entrys is None:
            # fall back to a cached comparison set (expected to be assigned beforehand)
            cmp_entrys = self.cmp_entrys
        LOG.info("repairing Data...")
        LOG.info("Entrys to Repair size is [%s],cmp_entrys size is [%s]"
                 % (len(entrys), len(cmp_entrys)))
        # index the reference entries by origin URL
        cmple_info_dict = collections.defaultdict(dict)
        for entry in cmp_entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url is not None:
                cmple_info_dict[origin_url].update(entry)
        # backfill only the columns that are still empty
        for entry in entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url in cmple_info_dict:
                for clm in cmple_info_dict[origin_url]:
                    value = entry.get(clm)
                    if value is None:
                        value = cmple_info_dict[origin_url][clm]
                    entry[clm] = value
        return entrys

    def pre_process(self):
        self.rm_dup_list_info('extracted_list_info', DB_1, self.cmp_table, DB_1)
        cmp_entrys = self.db_helper.get_results_by_clms(self.cmp_clms, self.cmp_table, DB_1, isdict=True)
        entrys = self.db_helper.get_results_by_clms("*", self.table, DB_1, isdict=True)
        entrys = self.repair_data(entrys, cmp_entrys)
        entrys = self.filter.rm_college_from_loc(entrys)
        # rewrite the working table with the repaired entries
        self.db_helper.exe_sql('delete from %s.%s' % (DB_1, self.table))
        if len(entrys) > 0:
            fields = entrys[0].keys()
            self.db_helper.batch_insert(entrys, self.table, DB_1, fields)
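# --- Illustrative sketch, not part of the original module ---
# A minimal, dependency-free rendering of the pattern that rm_dup_list_info()
# and repair_data() implement together: drop entries whose URL is already
# known, then backfill missing fields from a reference record keyed by the
# same URL. The 'url' key and the helper name are assumptions for
# illustration only.
def _dedup_and_backfill_sketch(new_entries, known_urls, reference_by_url, url_key='url'):
    fresh = []
    for entry in new_entries:
        url = entry.get(url_key)
        if url in known_urls:
            continue                    # already stored, skip it
        ref = reference_by_url.get(url, {})
        for clm, val in ref.items():
            if entry.get(clm) is None:  # only fill gaps, never overwrite
                entry[clm] = val
        fresh.append(entry)
        known_urls.add(url)
    return fresh

# Example:
#   _dedup_and_backfill_sketch([{'url': 'a', 'title': None}], set(),
#                              {'a': {'title': 'Job fair'}})
#   -> [{'url': 'a', 'title': 'Job fair'}]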
class statist:

    def __init__(self):
        self.load_dicts()
        self.db = DBHelper()

    def load_dicts(self):
        # load domain -> college map
        file_path = 'domain_college'
        self.domain_college_dict = self.load_file_as_dict(file_path)
        # load sitename -> college map
        file_path = 'sitename_college'
        self.site_college_dict = self.load_file_as_dict(file_path)
        # load college list
        file_path = 'college_list'
        self.college_list = self.load_file_as_list(file_path)

    def build_crawler_stat(self):
        # load crawler stats
        file_path = 'crawler_stat'
        rows = self.load_file_as_list(file_path)
        heads = ['College', 'Request', 'Response', 'Suc_Rate', 'Uniq_Url']
        entrys = []
        rq_sum = 0
        rp_sum = 0
        uniq_url_sum = 0
        for row in rows:
            # TODO: guard against rows shorter than expected
            entry = []
            dm = row[0]
            rq = row[1]
            rp = row[2]
            uniq_url = row[3]
            rq_sum += int(rq)
            rp_sum += int(rp)
            uniq_url_sum += int(uniq_url)
            college = self.domain_college_dict.get(dm)
            entry.append(college)
            entry.append(rq)
            entry.append(rp)
            # avoid division by zero
            if rq == '0':
                suc_rate = str(100.00) + "%"
            else:
                # keep two decimal places
                suc_rate = str(round(float(rp) / float(rq) * 100, 2)) + "%"
            entry.append(suc_rate)
            entry.append(uniq_url)
            entrys.append(entry)
        if rq_sum == 0:
            suc_rate_aver = str(100.00) + "%"
        else:
            suc_rate_aver = str(round(float(rp_sum) / float(rq_sum) * 100, 2)) + "%"
        sum_entry = ['SUM', str(rq_sum), str(rp_sum), suc_rate_aver, str(uniq_url_sum)]
        entrys.append(sum_entry)
        return heads, entrys

    def build_parse_stat(self):
        # parameters: db, table and where clause
        tb_full_today = 'extracted_full_info_today'
        tb_today = 'refined_info_today'
        sql_tmp = "select origin_website_name,count(*) from %s.%s %s group by origin_website_name"
        dl = []
        # sitename -> parsed info count
        sql = sql_tmp % (DB_1, tb_full_today, ' ')
        rows = self.db.exe_sql(sql)
        dict_1 = dict(rows)
        dl.append(dict_1)
        # sitename -> broken campus info count (missing meeting time)
        where_clause = 'where info_type = 1 and isnull(meeting_time)'
        sql = sql_tmp % (DB_1, tb_full_today, where_clause)
        rows = self.db.exe_sql(sql)
        dict_2 = dict(rows)
        dl.append(dict_2)
        # sitename -> broken recruit info count (missing release date)
        where_clause = 'where info_type = 0 and isnull(release_date)'
        sql = sql_tmp % (DB_1, tb_full_today, where_clause)
        rows = self.db.exe_sql(sql)
        dict_3 = dict(rows)
        dl.append(dict_3)
        # merge all dicts into rows keyed by sitename
        rows = []
        for k in dl[0].keys():
            row = [k]
            for d in dl:
                if d.get(k) is None:
                    row.append(0)
                else:
                    row.append(d.get(k))
            rows.append(row)
        entrys = []
        site_tt_sum = 0
        err_1_sum = 0
        err_2_sum = 0
        for row in rows:
            # TODO: exception handling
            entry = []
            site = row[0]
            site_tt = row[1]
            err_1 = row[2]
            err_2 = row[3]
            site_tt_sum += site_tt
            err_1_sum += err_1
            err_2_sum += err_2
            college = self.site_college_dict.get(site)
            entry.append(college)
            entry.append(site_tt)
            entry.append(err_1)
            entry.append(err_2)
            # site_tt cannot be zero: every site in dl[0] has at least one row
            suc_rate = str(round((1 - float(err_1 + err_2) / float(site_tt)) * 100, 2)) + "%"
            entry.append(suc_rate)
            entrys.append(entry)
        # totals row
        suc_rate_tt = str(round((1 - float(err_1_sum + err_2_sum) / float(site_tt_sum)) * 100, 2)) + "%"
        sum_entry = ['SUM', site_tt_sum, err_1_sum, err_2_sum, suc_rate_tt]
        entrys.append(sum_entry)
        heads = ['College', 'Parse Info', 'Err Campus Info', 'Err Recruit Info', 'Suc_Rate']
        return heads, entrys

    def build_duplicates_stat(self):
        # parameters: db, table and where clause
        tb_full_today = 'extracted_full_info_today'
        tb_today = 'refined_info_today'
        sql_tmp = "select origin_website_name,count(*) from %s.%s %s group by origin_website_name"
        dl = []
        # sitename -> full (pre-dedup) info count
        sql = sql_tmp % (DB_1, tb_full_today, ' ')
        rows = self.db.exe_sql(sql)
        dict_1 = dict(rows)
        dl.append(dict_1)
        # sitename -> refined (deduplicated) info count
        sql = sql_tmp % (DB_1, tb_today, ' ')
        rows = self.db.exe_sql(sql)
        dict_2 = dict(rows)
        dl.append(dict_2)
        # merge all dicts into rows keyed by sitename
        rows = []
        for k in dl[0].keys():
            row = [k]
            for d in dl:
                if d.get(k) is None:
                    row.append(0)
                else:
                    row.append(d.get(k))
            rows.append(row)
        entrys = []
        full_sum = 0
        refined_sum = 0
        for row in rows:
            # TODO: exception handling
            entry = []
            site = row[0]
            full_tt = row[1]
            refined = row[2]
            full_sum += full_tt
            refined_sum += refined
            college = self.site_college_dict.get(site)
            entry.append(college)
            entry.append(full_tt)
            entry.append(refined)
            # full_tt cannot be zero: every site in dl[0] has at least one row
            suc_rate = str(round(float(refined) / float(full_tt) * 100, 2)) + "%"
            entry.append(suc_rate)
            entrys.append(entry)
        # totals row
        suc_rate_tt = str(round(float(refined_sum) / float(full_sum) * 100, 2)) + "%"
        sum_entry = ['SUM', full_sum, refined_sum, suc_rate_tt]
        entrys.append(sum_entry)
        heads = ['College', 'Full Info', 'Refined Info', 'Uniqs_Prop']
        return heads, entrys

    def build_total_stat(self):
        # parameters: db, table and where clause
        tb_refined = 'refined_info'
        sql_tmp = "select origin_website_name,count(*) from %s.%s %s group by origin_website_name"
        dl = []
        # sitename -> total refined info count
        sql = sql_tmp % (DB_1, tb_refined, ' ')
        rows = self.db.exe_sql(sql)
        dict_1 = dict(rows)
        dl.append(dict_1)
        # sitename -> campus info count
        where_clause = "where info_type = 1"
        sql = sql_tmp % (DB_1, tb_refined, where_clause)
        rows = self.db.exe_sql(sql)
        dict_2 = dict(rows)
        dl.append(dict_2)
        # sitename -> recruit info count
        where_clause = "where info_type = 0"
        sql = sql_tmp % (DB_1, tb_refined, where_clause)
        rows = self.db.exe_sql(sql)
        dict_3 = dict(rows)
        dl.append(dict_3)
        # sitename -> future campus info count (meeting time later than today)
        date_str = datetime.datetime.now().strftime('%Y-%m-%d')
        where_clause = "where info_type = 1 and meeting_time > '%s'" % date_str
        sql = sql_tmp % (DB_1, tb_refined, where_clause)
        rows = self.db.exe_sql(sql)
        dict_4 = dict(rows)
        dl.append(dict_4)
        # merge all dicts into rows keyed by sitename
        rows = []
        for k in dl[0].keys():
            row = [k]
            for d in dl:
                if d.get(k) is None:
                    row.append(0)
                else:
                    row.append(d.get(k))
            rows.append(row)
        entrys = []
        tt_sum = 0
        cam_tt_sum = 0
        rec_tt_sum = 0
        fut_cam_tt_sum = 0
        for row in rows:
            # TODO: exception handling
            entry = []
            site = row[0]
            tt = row[1]
            cam_tt = row[2]
            rec_tt = row[3]
            fut_cam_tt = row[4]
            tt_sum += tt
            cam_tt_sum += cam_tt
            rec_tt_sum += rec_tt
            fut_cam_tt_sum += fut_cam_tt
            college = self.site_college_dict.get(site)
            entry.append(college)
            entry.append(tt)
            entry.append(cam_tt)
            entry.append(rec_tt)
            entry.append(fut_cam_tt)
            entrys.append(entry)
        # totals row
        sum_entry = ['SUM', tt_sum, cam_tt_sum, rec_tt_sum, fut_cam_tt_sum]
        entrys.append(sum_entry)
        heads = ['College', 'Total_Info', 'Campus_Info', 'Recruit_Info', 'Future_Info']
        return heads, entrys

    def to_dict(self, rows):
        # keep rows whose first field holds at least a key and a value
        data = [(row[0][0], row[0][1]) for row in rows if len(row[0]) > 1]
        return dict(data)

    def load_file_as_dict(self, file_path):
        mapping = {}
        f = codecs.open(file_path, 'r', 'utf-8')
        for line in f.readlines():
            parts = line.split('\t')
            key = parts[0].strip()
            value = parts[1].strip()
            mapping[key] = value
        f.close()
        return mapping

    def load_file_as_list(self, file_path):
        li = []
        f = codecs.open(file_path, 'r', 'utf-8')
        for line in f.readlines():
            li.append(line.strip().split('\t'))
        f.close()
        return li
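# --- Illustrative sketch, not part of the original module ---
# One way the (heads, entrys) pairs returned by the builders above could be
# rendered as a plain tab-separated report. Running it for real requires the
# DBHelper connection and the domain_college / sitename_college / college_list
# / crawler_stat files to be in place; the section titles and the helper name
# are assumptions.
def _print_report_sketch(title, heads, entrys):
    print(title)
    print('\t'.join(str(h) for h in heads))
    for entry in entrys:
        print('\t'.join(str(c) for c in entry))
    print('')


if __name__ == '__main__':
    st = statist()
    for title, builder in [('Crawler', st.build_crawler_stat),
                           ('Parse', st.build_parse_stat),
                           ('Duplicates', st.build_duplicates_stat),
                           ('Total', st.build_total_stat)]:
        heads, entrys = builder()
        _print_report_sketch(title, heads, entrys)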