Example #1
import collections

# DBHelper, Filter, LOG, DB_1 and the column-name constants (ORIGIN_URL,
# COMPANY_NAME, MEETING_TIME, ...) are project-level symbols assumed to be
# importable from the surrounding package.

class prechecker:
    
    def __init__(self):
        # `Filter` is assumed to be the project's filter class; the original
        # called the builtin `filter()`, which raises a TypeError. Adjust the
        # name to match the actual class in this package.
        self.filter = Filter()
        self.db_helper = DBHelper()
        self.cmp_table = 'refined_list_info'
        self.table = 'extracted_info'
        self.cmp_clms = [COMPANY_NAME, MEETING_TIME, MEETING_LOCATION,
                         ORIGIN_URL, RELEASE_DATE, RECRUIT_TITLE]
    
    def rm_dup_list_info(self, tb_new, db_new, tb_old, db_old):
        """Insert entries from tb_new whose ORIGIN_URL is not yet in tb_old."""
        new_list = self.db_helper.get_results_by_clms("*", tb_new, db_new, True)
        old_list = self.db_helper.get_results_by_clms("*", tb_old, db_old, True)
        # Collect URLs that are already present so each URL is kept only once.
        seen_urls = set()
        for entry in old_list:
            url = entry.get(ORIGIN_URL)
            if url is not None:
                seen_urls.add(url)
        updates = []
        for entry in new_list:
            url = entry.get(ORIGIN_URL)
            if url in seen_urls:
                continue
            updates.append(entry)
            seen_urls.add(url)
        # Guard against an empty batch: `fields` must come from a real entry,
        # not from the leaked loop variable the original relied on.
        if updates:
            fields = list(updates[0].keys())
            self.db_helper.batch_insert(updates, self.cmp_table, DB_1, fields)
        return updates


    def repair_data(self, entrys, cmp_entrys=None):
        """Fill missing columns in entrys from rows sharing the same ORIGIN_URL."""
        if cmp_entrys is None:
            # The original fell back to a nonexistent `self.cmp_entrys`
            # attribute; fetching the comparison rows here mirrors pre_process.
            cmp_entrys = self.db_helper.get_results_by_clms(
                self.cmp_clms, self.cmp_table, DB_1, isdict=True)
        LOG.info("repairing data...")
        LOG.info("entrys to repair size is [%s], cmp_entrys size is [%s]"
                 % (len(entrys), len(cmp_entrys)))
        # Index the comparison entries by URL, merging duplicates.
        cmple_info_dict = collections.defaultdict(dict)
        for entry in cmp_entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url is not None:
                cmple_info_dict[origin_url].update(entry)
        # Copy over any column that is missing from the entry being repaired.
        for entry in entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url in cmple_info_dict:
                for clm, new_value in cmple_info_dict[origin_url].items():
                    if entry.get(clm) is None:
                        entry[clm] = new_value
        return entrys

    def pre_process(self):
        # De-duplicate freshly extracted list info against the refined table.
        self.rm_dup_list_info('extracted_list_info', DB_1, self.cmp_table, DB_1)
        cmp_entrys = self.db_helper.get_results_by_clms(
            self.cmp_clms, self.cmp_table, DB_1, isdict=True)
        entrys = self.db_helper.get_results_by_clms(
            "*", self.table, DB_1, isdict=True)
        entrys = self.repair_data(entrys, cmp_entrys)
        entrys = self.filter.rm_college_from_loc(entrys)
        # Rewrite the table with the repaired, filtered entries.
        self.db_helper.exe_sql('delete from %s.%s' % (DB_1, self.table))
        if entrys:
            fields = list(entrys[0].keys())
            self.db_helper.batch_insert(entrys, self.table, DB_1, fields)
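
A minimal driver sketch (hypothetical; assumes this module is run inside the project so DBHelper, Filter, LOG, DB_1 and the column constants resolve):

if __name__ == '__main__':
    # Run the full pre-processing pass: de-dup, repair, filter, rewrite.
    checker = prechecker()
    checker.pre_process()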
Example #2
import codecs
import datetime

# DBHelper and DB_1 are project-level symbols assumed to be importable from
# the surrounding package.

class statist(object):

    def __init__(self):
        self.load_dicts()
        self.db = DBHelper()
    
    def load_dicts(self):
        # load domain -> college map
        self.domain_college_dict = self.load_file_as_dict('domain_college')

        # load sitename -> college map
        self.site_college_dict = self.load_file_as_dict('sitename_college')

        # load college list
        self.college_list = self.load_file_as_list('college_list')
    
    def build_crawler_stat(self):
        # Load the crawler's per-domain stats dump.
        rows = self.load_file_as_list('crawler_stat')
        heads = ['College', 'Request', 'Response', 'Suc_Rate', 'Uniq_Url']
        entrys = []
        rq_sum = 0
        rp_sum = 0
        uniq_url_sum = 0
        for row in rows:
            # Skip malformed rows instead of raising IndexError.
            if len(row) < 4:
                continue
            dm, rq, rp, uniq_url = row[0], row[1], row[2], row[3]

            rq_sum += int(rq)
            rp_sum += int(rp)
            uniq_url_sum += int(uniq_url)

            college = self.domain_college_dict.get(dm)
            # Zero requests would divide by zero; report 100% in that case.
            if rq == '0':
                suc_rate = str(100.00) + "%"
            else:
                # keep two decimal places
                suc_rate = str(round(float(rp) / float(rq) * 100, 2)) + "%"
            entrys.append([college, rq, rp, suc_rate, uniq_url])

        # The original compared the int rq_sum to the string '0' and appended
        # a literal "%s" instead of "%"; both are fixed here.
        if rq_sum == 0:
            suc_rate_aver = str(100.00) + "%"
        else:
            suc_rate_aver = str(round(float(rp_sum) / float(rq_sum) * 100, 2)) + "%"

        sum_entry = ['SUM', str(rq_sum), str(rp_sum), suc_rate_aver, str(uniq_url_sum)]
        entrys.append(sum_entry)
        return heads, entrys
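
    # For reference, build_crawler_stat above implies each 'crawler_stat'
    # line is tab-separated as: domain, request count, response count,
    # unique-URL count. A hypothetical sample line (values made up):
    #
    #   career.example.edu<TAB>120<TAB>118<TAB>95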
    
    def build_parse_stat(self):
        tb_full_today = 'extracted_full_info_today'

        sql_tmp = ("select origin_website_name,count(*) from %s.%s %s "
                   "group by origin_website_name")
        dl = []
        # sitename -> total parsed info count
        sql = sql_tmp % (DB_1, tb_full_today, ' ')
        dl.append(dict(self.db.exe_sql(sql)))

        # sitename -> campus info rows missing meeting_time (error count)
        where_clause = 'where info_type = 1 and isnull(meeting_time)'
        sql = sql_tmp % (DB_1, tb_full_today, where_clause)
        dl.append(dict(self.db.exe_sql(sql)))

        # sitename -> recruit info rows missing release_date (error count)
        where_clause = 'where info_type = 0 and isnull(release_date)'
        sql = sql_tmp % (DB_1, tb_full_today, where_clause)
        dl.append(dict(self.db.exe_sql(sql)))

        # Merge all dicts into rows keyed by sitename, defaulting to 0.
        rows = []
        for k in dl[0].keys():
            row = [k]
            for d in dl:
                row.append(d.get(k, 0))
            rows.append(row)

        entrys = []
        site_tt_sum = 0
        err_1_sum = 0
        err_2_sum = 0

        for row in rows:
            site, site_tt, err_1, err_2 = row

            site_tt_sum += site_tt
            err_1_sum += err_1
            err_2_sum += err_2

            college = self.site_college_dict.get(site)
            # site_tt is at least 1 here, since the site came from dl[0].
            suc_rate = str(round((1 - float(err_1 + err_2) / float(site_tt)) * 100, 2)) + "%"
            entrys.append([college, site_tt, err_1, err_2, suc_rate])

        # Append the totals row.
        suc_rate_tt = str(round((1 - float(err_1_sum + err_2_sum) / float(site_tt_sum)) * 100, 2)) + "%"
        entrys.append(['SUM', site_tt_sum, err_1_sum, err_2_sum, suc_rate_tt])
        heads = ['College', 'Parse Info', 'Err Campus Info', 'Err Recruit Info', 'Suc_Rate']
        return heads, entrys
       
    def build_duplicates_stat(self):
        tb_full_today = 'extracted_full_info_today'
        tb_today = 'refined_info_today'

        sql_tmp = ("select origin_website_name,count(*) from %s.%s %s "
                   "group by origin_website_name")
        dl = []
        # sitename -> full (pre-dedup) info count
        sql = sql_tmp % (DB_1, tb_full_today, ' ')
        dl.append(dict(self.db.exe_sql(sql)))

        # sitename -> refined (post-dedup) info count
        sql = sql_tmp % (DB_1, tb_today, ' ')
        dl.append(dict(self.db.exe_sql(sql)))

        # Merge all dicts into rows keyed by sitename, defaulting to 0.
        rows = []
        for k in dl[0].keys():
            row = [k]
            for d in dl:
                row.append(d.get(k, 0))
            rows.append(row)

        entrys = []
        full_sum = 0
        refined_sum = 0

        for row in rows:
            site, full_tt, refined = row

            full_sum += full_tt
            refined_sum += refined

            college = self.site_college_dict.get(site)
            # full_tt is at least 1 here, since the site came from dl[0].
            suc_rate = str(round(float(refined) / float(full_tt) * 100, 2)) + "%"
            entrys.append([college, full_tt, refined, suc_rate])

        # Append the totals row.
        suc_rate_tt = str(round(float(refined_sum) / float(full_sum) * 100, 2)) + "%"
        entrys.append(['SUM', full_sum, refined_sum, suc_rate_tt])
        heads = ['College', 'Full Info', 'Refined Info', 'Uniqs_Prop']
        return heads, entrys
    
    def build_total_stat(self):
        tb_refined = 'refined_info'
        sql_tmp = ("select origin_website_name,count(*) from %s.%s %s "
                   "group by origin_website_name")
        dl = []
        # sitename -> total refined info count
        sql = sql_tmp % (DB_1, tb_refined, ' ')
        dl.append(dict(self.db.exe_sql(sql)))

        # sitename -> campus info count
        where_clause = "where info_type = 1"
        sql = sql_tmp % (DB_1, tb_refined, where_clause)
        dl.append(dict(self.db.exe_sql(sql)))

        # sitename -> recruit info count
        where_clause = "where info_type = 0"
        sql = sql_tmp % (DB_1, tb_refined, where_clause)
        dl.append(dict(self.db.exe_sql(sql)))

        # sitename -> campus info with a meeting_time later than today
        date_str = datetime.datetime.now().strftime('%Y-%m-%d')
        where_clause = "where info_type = 1 and meeting_time > '%s'" % date_str
        sql = sql_tmp % (DB_1, tb_refined, where_clause)
        dl.append(dict(self.db.exe_sql(sql)))

        # Merge all dicts into rows keyed by sitename, defaulting to 0.
        rows = []
        for k in dl[0].keys():
            row = [k]
            for d in dl:
                row.append(d.get(k, 0))
            rows.append(row)

        entrys = []
        tt_sum = 0
        cam_tt_sum = 0
        rec_tt_sum = 0
        fut_cam_tt_sum = 0

        for row in rows:
            site, tt, cam_tt, rec_tt, fut_cam_tt = row

            tt_sum += tt
            cam_tt_sum += cam_tt
            rec_tt_sum += rec_tt
            fut_cam_tt_sum += fut_cam_tt

            college = self.site_college_dict.get(site)
            entrys.append([college, tt, cam_tt, rec_tt, fut_cam_tt])

        # Append the totals row.
        entrys.append(['SUM', tt_sum, cam_tt_sum, rec_tt_sum, fut_cam_tt_sum])
        heads = ['College', 'Total_Info', 'Campus_Info', 'Recruit_Info', 'Future_Info']
        return heads, entrys
    
    def to_dict(self, rows):
        # Convert (key, value) result rows into a dict; the original zipped
        # nested indices and returned an unusable list of 1-tuples.
        return {row[0]: row[1] for row in rows if len(row) > 1}

    def load_file_as_dict(self, file_path):
        # Each line is expected to be "key<TAB>value"; malformed lines are
        # skipped. Avoids shadowing the builtins `map` and `file`.
        mapping = {}
        with codecs.open(file_path, 'r', 'utf-8') as f:
            for line in f:
                parts = line.split('\t')
                if len(parts) < 2:
                    continue
                mapping[parts[0].strip()] = parts[1].strip()
        return mapping
    
    def load_file_as_list(self, file_path):
        # One row per line, tab-separated columns.
        rows = []
        with codecs.open(file_path, 'r', 'utf-8') as f:
            for line in f:
                rows.append(line.strip().split('\t'))
        return rows
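
A minimal report driver sketch (hypothetical; assumes the stat input files and DB tables exist and that DBHelper/DB_1 resolve):

if __name__ == '__main__':
    # Render one stat table as tab-separated text.
    st = statist()
    heads, entrys = st.build_crawler_stat()
    print('\t'.join(heads))
    for entry in entrys:
        print('\t'.join(str(v) for v in entry))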