Example #1
0
class DBTestCase(unittest.TestCase):
    def setUp(self):
        self.db = DBHelper()
        self.samples =self.db.get_results_by_clms(columns='*',table=
                                'samples',database=DB_test,isdict=True)
    def tearDown(self):
        self.db = None

    def test_get_max_group_id(self):
        res =self.db.get_max_group_id('refined_info_2','xyzb')
        res_2 =self.db.get_max_group_id('info_template','xyzb')
        print res
        print res_2

    def test_batch_insert(self):
        entrys =self.db.get_results_by_clms(columns='*',table=
                                'refined_info',database=DB_1,isdict=True)[:12056]
        table = 'info_tmp1'
        self.db.recr_table(table,DB_1,'info_template',DB_1);
        self.db.batch_insert(entrys,table,DB_1)

    def test_batch_get(self):
        entrys =self.db.get_results_by_clms(columns='*',table=
                                'refined_info',database=DB_1,isdict=True)[:2]
        for key, value in entrys[1].items():
            print key,value,type(value)
Example #2
0
class filter:
    """Filters extracted entries, diverting invalid ones to a side table.

    NOTE(review): the class name shadows the builtin ``filter``; kept
    unchanged for backward compatibility with existing callers.
    """

    def __init__(self):
        self.db_helper = DBHelper()

    def deal_entrys(self, entrys):
        """Run the filtering pipeline over *entrys*; return the survivors."""
        entrys = self.filter_illegal(entrys)
        return entrys

    # Entries with required fields missing are not legal; they are recorded
    # in a dedicated table instead of being dropped silently.
    def filter_illegal(self, entrys):
        """Split *entrys* into legal and illegal sets.

        Required for all entries: ORIGIN_URL, ORIGIN_WEBSITE_NAME,
        RECRUIT_TITLE. Additionally RELEASE_DATE for info_type 0 and
        MEETING_TIME for info_type 1.

        Side effects: drains the input list (entries are pop()ed off the
        end) and inserts the illegal entries into DB_1.TB_ILLEGAL.
        Returns the legal entries (in reverse input order, due to pop()).
        """
        LOG.info("Begin to Filter illegal entrys..")
        not_null_fields = [ORIGIN_URL, ORIGIN_WEBSITE_NAME, RECRUIT_TITLE]
        legal_entrys = []
        illegal_entrys = []
        while entrys:
            entry = entrys.pop()
            info_type = entry.get(INFO_TYPE)
            flag = True
            for field in not_null_fields:
                if entry.get(field) is None:
                    illegal_entrys.append(entry)
                    flag = False
                    break
            # Type-specific required field: release date for plain info,
            # meeting time for meeting info.
            if info_type == 0 and flag:
                if entry.get(RELEASE_DATE) is None:
                    illegal_entrys.append(entry)
                    flag = False
            if info_type == 1 and flag:
                if entry.get(MEETING_TIME) is None:
                    illegal_entrys.append(entry)
                    flag = False
            if flag:
                legal_entrys.append(entry)
        LOG.info("Finish filering entrys.[%s] entrys are illegal" % (len(illegal_entrys)))
        db_illegal = DB_1
        table_illegal = TB_ILLEGAL
        LOG.info("Insert illegal Entrys into[%s.%s]" % (db_illegal, table_illegal))
        # Illegal table schema: the template fields minus GROUP_ID, plus
        # two extra bookkeeping columns.
        fields = list(INFO_FIELDS)
        fields.remove(GROUP_ID)
        fields.extend(['author', 'tmp_path'])
        self.db_helper.batch_insert(illegal_entrys, table_illegal, db_illegal, fields)
        return legal_entrys

    def rm_college_from_loc(self, entrys):
        """Strip the college-name prefix from the meeting-location field.

        Only meeting entries (info_type != 0) are touched. Entries are
        modified in place; the same list is returned.
        """
        LOG.info("Dealing Location field to remove college from loc")
        count = 0
        for entry in entrys:
            info_type = entry.get(INFO_TYPE)
            if info_type == 0:
                continue
            college = entry.get(COLLEGE_NAME)
            loc = entry.get(MEETING_LOCATION)
            # BUG FIX: guard against missing values — the original crashed
            # with AttributeError/TypeError when loc or college was None.
            if loc and college and loc.startswith(college):
                count += 1
                entry[MEETING_LOCATION] = loc.replace(college, '')
        LOG.info("Removing [%s] College Name from meeting_location field!" % count)
        return entrys
Example #3
0
 def test_diff_tables(self):
     """Diff the new table against the old one and write the delta back."""
     checker = datachecker()
     helper = DBHelper()
     src_table = 'extracted_full_info_today'
     dst_table = 'info_tmp'
     dst_db = DB_test
     src_db = DB_1
     # Rebuild the destination table from the template before loading.
     helper.recr_table(dst_table, dst_db, 'info_template', DB_1)
     delta = checker.diff_tables(table_old=dst_table, table_new=src_table,
                                 db_old=dst_db, db_new=src_db)
     helper.batch_insert(delta, table=dst_table, db=dst_db)
Example #4
0
class prechecker:
    """Pre-processes freshly extracted entries before the main data flow."""

    def __init__(self):
        self.filter = filter()
        self.db_helper = DBHelper()
        self.cmp_table = 'refined_list_info'
        self.table = 'extracted_info'
        # Columns consulted when repairing missing values from the
        # reference (list-info) table.
        self.cmp_clms = [COMPANY_NAME, MEETING_TIME, MEETING_LOCATION,
                         ORIGIN_URL, RELEASE_DATE, RECRUIT_TITLE]

    def rm_dup_list_info(self, tb_new, db_new, tb_old, db_old):
        """Insert rows from tb_new whose ORIGIN_URL is not in tb_old.

        Returns the list of newly inserted (previously unseen) entries.
        """
        new_list = self.db_helper.get_results_by_clms("*", tb_new, db_new, True)
        old_list = self.db_helper.get_results_by_clms("*", tb_old, db_old, True)
        seen_urls = set()
        for entry in old_list:
            url = entry.get(ORIGIN_URL)
            if url is not None:
                seen_urls.add(url)
        updates = []
        for entry in new_list:
            url = entry.get(ORIGIN_URL)
            if url in seen_urls:
                continue
            updates.append(entry)
            seen_urls.add(url)
        # BUG FIX: `fields` was previously read from the leaked loop
        # variable `entry` — a NameError when both lists are empty, and
        # the *old* list's last row when new_list is empty. Take the
        # schema from the rows actually being inserted, and skip the
        # insert entirely when there is nothing new.
        if updates:
            fields = updates[0].keys()
            # NOTE(review): destination table is hard-coded rather than
            # derived from tb_old — kept as-is; confirm this is intended.
            self.db_helper.batch_insert(updates, "refined_list_info", DB_1, fields)
        return updates


    def repair_data(self, entrys, cmp_entrys=None):
        """Fill missing fields of *entrys* from *cmp_entrys*, keyed on ORIGIN_URL.

        When cmp_entrys is None, falls back to self.cmp_entrys
        (NOTE(review): that attribute is never set in __init__ — callers
        must assign it beforehand or pass cmp_entrys explicitly).
        Entries are repaired in place; the same list is returned.
        """
        if cmp_entrys is None:
            cmp_entrys = self.cmp_entrys
        LOG.info("repairing Data...")
        LOG.info("Entrys to Repair size is [%s],cmp_entrys size is [%s]" % (len(entrys), len(cmp_entrys)))
        # Merge all reference rows sharing a URL into one lookup record.
        cmple_info_dict = collections.defaultdict(dict)
        for entry in cmp_entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url is not None:
                cmple_info_dict[origin_url].update(entry)
        for entry in entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url in cmple_info_dict:
                reference = cmple_info_dict[origin_url]
                for clm in reference:
                    # Only fill fields that are currently missing.
                    if entry.get(clm) is None:
                        entry[clm] = reference[clm]
        return entrys

    def pre_process(self):
        """Full pre-processing pass: dedupe, repair, clean, then reload."""
        self.rm_dup_list_info('extracted_list_info', DB_1, self.cmp_table, DB_1)
        cmp_entrys = self.db_helper.get_results_by_clms(self.cmp_clms, self.cmp_table, DB_1, isdict=True)
        entrys = self.db_helper.get_results_by_clms("*", self.table, DB_1, isdict=True)
        entrys = self.repair_data(entrys, cmp_entrys)
        entrys = self.filter.rm_college_from_loc(entrys)
        # Replace the table contents with the repaired entries.
        self.db_helper.exe_sql('delete from %s.%s' % (DB_1, self.table))
        if len(entrys) > 0:
            fields = entrys[0].keys()
            self.db_helper.batch_insert(entrys, self.table, DB_1, fields)
Example #5
0
class scheduler():
    """To schedule the data flow: pre-process, dedupe, (optionally) submit
    to dmg, then segment and index the resulting entries.
    """
    def __init__(self):
        self.db = DBHelper()
        self.dc = datachecker()
        self.sb = submitter()
        self.dd = data_drawer()
        self.sg = segger()
        self.id = indexer()
        self.pc = prechecker()

        self.dmg_on = DMG_ON
        self.test_on = TEST_ON
        # BUG FIX: argv indices 1..4 are read below, which requires at
        # least 5 elements; the original `< 4` let a 4-element argv fall
        # through to the else branch and raise IndexError.
        if len(sys.argv) < 5:
            self.table_old = "refined_info"
            self.db_old = DB_1
            self.table_new = "extracted_info"
            self.db_new = DB_1
        else:
            self.table_old = sys.argv[1]
            self.db_old = sys.argv[2]
            self.table_new = sys.argv[3]
            self.db_new = sys.argv[4]


    def data_flow(self):
        """Run one full pipeline pass; blocks until helper threads finish."""
        threads = []
        self.pc.pre_process()
        # remove duplicates
        entrys = self.dc.diff_tables(self.table_old, self.table_new, self.db_old, self.db_new)
        # insert the new coming entrys into database
        self.db.batch_insert(entrys, self.table_old, self.db_old)

        if self.test_on:
            # Mirror the insert into the test database in the background.
            t = threading.Thread(target=self.db.batch_insert, args=(entrys, self.table_old, DB_test))
            t.start()
            threads.append(t)

        if self.dmg_on:
            # submit the updates to dmg
            self.sb.deal_submit_req(entrys)
            # BUG FIX: `dd` was an unqualified name (NameError at runtime);
            # the data_drawer instance lives on self.dd.
            entrys = self.dd.get_entrys()
            self.db.recr_table(TB_DMG, DB_1, TB_TEMPLATE, DB_1)
            t = threading.Thread(target=self.db.batch_insert, args=(entrys, TB_DMG, DB_1))
            t.start()
            threads.append(t)
        else:
            entrys = self.db.get_results_by_clms(columns='*', table=self.table_old,
                                                 database=self.db_old, isdict=True)
            entrys = sorted(entrys, key=self.sort_func, reverse=True)
        # serialize entrys to protobuffers
        entrys = self.sg.serilize_entrys(entrys)
        # remote call word segging
        entrys = self.sg.word_seg_list(entrys)
        # remote call indexing
        self.id.index_list_string(entrys)

        # wait until all threads are over
        for t in threads:
            t.join()

    def sort_func(self, entry):
        """Sort key: the entry's relevant timestamp (0 when missing).

        Meeting entries (info_type 1) sort by MEETING_TIME, everything
        else by RELEASE_DATE.
        """
        info_type = entry.get(INFO_TYPE)
        if info_type == 1:
            time = entry.get(MEETING_TIME)
        else:
            time = entry.get(RELEASE_DATE)
        if time is None:
            return 0
        else:
            return get_timestamp(time)