import unittest

# DBHelper and the DB_*/TB_* constants are defined elsewhere in the project.
class DBTestCase(unittest.TestCase):

    def setUp(self):
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)

    def tearDown(self):
        self.db = None

    def test_get_max_group_id(self):
        res = self.db.get_max_group_id('refined_info_2', 'xyzb')
        res_2 = self.db.get_max_group_id('info_template', 'xyzb')
        print res
        print res_2

    def test_batch_insert(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:12056]
        table = 'info_tmp1'
        # Recreate the target table from the template before inserting.
        self.db.recr_table(table, DB_1, 'info_template', DB_1)
        self.db.batch_insert(entrys, table, DB_1)

    def test_batch_get(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:2]
        for key, value in entrys[1].items():
            print key, value, type(value)
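# Hypothetical test runner (assuming DBHelper and the DB_* constants above are
# importable in this module's environment): run the whole case, or a single test,
# from the command line.
if __name__ == '__main__':
    unittest.main()
    # or run one test only:
    # python -m unittest this_module.DBTestCase.test_batch_insert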
class filter:

    def __init__(self):
        self.db_helper = DBHelper()

    def deal_entrys(self, entrys):
        entrys = self.filter_illegal(entrys)
        return entrys

    # Entries with required fields missing are illegal and are recorded in a dedicated table.
    def filter_illegal(self, entrys):
        LOG.info("Begin to filter illegal entrys..")
        not_null_fields = [ORIGIN_URL, ORIGIN_WEBSITE_NAME, RECRUIT_TITLE]
        legal_entrys = []
        illegal_entrys = []
        while len(entrys) > 0:
            entry = entrys.pop()
            info_type = entry.get(INFO_TYPE)
            flag = True
            for field in not_null_fields:
                if entry.get(field) is None:
                    illegal_entrys.append(entry)
                    flag = False
                    break
            # Type-specific required fields: release_date for type 0, meeting_time for type 1.
            if info_type == 0 and flag:
                if entry.get(RELEASE_DATE) is None:
                    illegal_entrys.append(entry)
                    flag = False
            if info_type == 1 and flag:
                if entry.get(MEETING_TIME) is None:
                    illegal_entrys.append(entry)
                    flag = False
            if flag:
                legal_entrys.append(entry)
        LOG.info("Finish filtering entrys. [%s] entrys are illegal" % (len(illegal_entrys)))
        db_illegal = DB_1
        table_illegal = TB_ILLEGAL
        LOG.info("Insert illegal entrys into [%s.%s]" % (db_illegal, table_illegal))
        fields = list(INFO_FIELDS)
        fields.remove(GROUP_ID)
        fields.extend(['author', 'tmp_path'])
        self.db_helper.batch_insert(illegal_entrys, table_illegal, db_illegal, fields)
        return legal_entrys

    def rm_college_from_loc(self, entrys):
        LOG.info("Dealing with location field to remove college name from loc")
        count = 0
        for entry in entrys:
            info_type = entry.get(INFO_TYPE)
            if info_type == 0:
                continue
            college = entry.get(COLLEGE_NAME)
            loc = entry.get(MEETING_LOCATION)
            # Guard against missing values before the prefix check.
            if loc and college and loc.startswith(college):
                count += 1
                loc = loc.replace(college, '')
                entry[MEETING_LOCATION] = loc
        LOG.info("Removed [%s] college names from meeting_location field!" % count)
        return entrys
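# Standalone sketch of the rule applied by filter_illegal, using plain string keys
# instead of the project's field-name constants (these keys are illustrative only):
# an entry is illegal if any always-required field is missing, or if the
# type-specific field (release_date for type 0, meeting_time for type 1) is missing.
def _is_legal(entry):
    required = ['origin_url', 'origin_website_name', 'recruit_title']
    if any(entry.get(f) is None for f in required):
        return False
    if entry.get('info_type') == 0 and entry.get('release_date') is None:
        return False
    if entry.get('info_type') == 1 and entry.get('meeting_time') is None:
        return False
    return True

print(_is_legal({'origin_url': 'http://example.com/1', 'origin_website_name': 'site',
                 'recruit_title': 'campus talk', 'info_type': 1, 'meeting_time': None}))  # False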
def test_diff_tables(self):
    dc = datachecker()
    db_helper = DBHelper()
    new_table = 'extracted_full_info_today'
    old_table = 'info_tmp'
    old_db = DB_test
    new_db = DB_1
    # Recreate the old table from the template, then insert the rows that only exist in the new table.
    db_helper.recr_table(old_table, old_db, 'info_template', DB_1)
    entrys = dc.diff_tables(table_old=old_table, table_new=new_table,
                            db_old=old_db, db_new=new_db)
    db_helper.batch_insert(entrys, table=old_table, db=old_db)
import collections


class prechecker:

    def __init__(self):
        self.filter = filter()
        self.db_helper = DBHelper()
        self.cmp_table = 'refined_list_info'
        self.table = 'extracted_info'
        self.cmp_clms = [COMPANY_NAME, MEETING_TIME, MEETING_LOCATION,
                         ORIGIN_URL, RELEASE_DATE, RECRUIT_TITLE]

    def rm_dup_list_info(self, tb_new, db_new, tb_old, db_old):
        new_list = self.db_helper.get_results_by_clms("*", tb_new, db_new, True)
        old_list = self.db_helper.get_results_by_clms("*", tb_old, db_old, True)
        # Index the old entries by origin_url so duplicates can be skipped.
        old_dict = {}
        for entry in old_list:
            url = entry.get(ORIGIN_URL)
            if url is not None:
                old_dict[url] = None
        updates = []
        for entry in new_list:
            url = entry.get(ORIGIN_URL)
            if url in old_dict:
                continue
            updates.append(entry)
            old_dict[url] = None
        if updates:
            fields = updates[0].keys()
            self.db_helper.batch_insert(updates, "refined_list_info", DB_1, fields)
        return updates

    def repair_data(self, entrys, cmp_entrys=None):
        if cmp_entrys is None:
            cmp_entrys = self.cmp_entrys
        LOG.info("Repairing data...")
        LOG.info("Entrys to repair size is [%s], cmp_entrys size is [%s]"
                 % (len(entrys), len(cmp_entrys)))
        # Build a per-URL dict of the reference values.
        cmple_info_dict = collections.defaultdict(dict)
        for entry in cmp_entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url is not None:
                cmple_info_dict[origin_url].update(entry)
        # Fill any missing field of an entry from the reference row with the same origin_url.
        for entry in entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url in cmple_info_dict:
                for clm in cmple_info_dict[origin_url]:
                    if entry.get(clm) is None:
                        entry[clm] = cmple_info_dict[origin_url][clm]
        return entrys

    def pre_process(self):
        self.rm_dup_list_info('extracted_list_info', DB_1, self.cmp_table, DB_1)
        cmp_entrys = self.db_helper.get_results_by_clms(self.cmp_clms, self.cmp_table,
                                                        DB_1, isdict=True)
        entrys = self.db_helper.get_results_by_clms("*", self.table, DB_1, isdict=True)
        entrys = self.repair_data(entrys, cmp_entrys)
        entrys = self.filter.rm_college_from_loc(entrys)
        # Rewrite the table with the repaired entries.
        self.db_helper.exe_sql('delete from %s.%s' % (DB_1, self.table))
        if len(entrys) > 0:
            fields = entrys[0].keys()
            self.db_helper.batch_insert(entrys, self.table, DB_1, fields)
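# Standalone sketch of the repair_data merge (keys here are illustrative, not the
# project's constants): reference rows are indexed by origin_url, and only fields
# that are None in an entry are filled from the matching reference row.
import collections

reference = [{'origin_url': 'http://example.com/1', 'meeting_time': '2014-09-01 10:00'}]
entries = [{'origin_url': 'http://example.com/1', 'meeting_time': None, 'recruit_title': 'talk'}]

lookup = collections.defaultdict(dict)
for row in reference:
    url = row.get('origin_url')
    if url is not None:
        lookup[url].update(row)

for row in entries:
    url = row.get('origin_url')
    if url in lookup:
        for clm, value in lookup[url].items():
            if row.get(clm) is None:
                row[clm] = value

print(entries[0]['meeting_time'])  # -> 2014-09-01 10:00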
import sys
import threading


class scheduler():
    """ To schedule the data flow """

    def __init__(self):
        self.db = DBHelper()
        self.dc = datachecker()
        self.sb = submitter()
        self.dd = data_drawer()
        self.sg = segger()
        self.id = indexer()
        self.pc = prechecker()
        self.dmg_on = DMG_ON
        self.test_on = TEST_ON
        # Table/database names may be overridden from the command line.
        if len(sys.argv) < 5:
            self.table_old = "refined_info"
            self.db_old = DB_1
            self.table_new = "extracted_info"
            self.db_new = DB_1
        else:
            self.table_old = sys.argv[1]
            self.db_old = sys.argv[2]
            self.table_new = sys.argv[3]
            self.db_new = sys.argv[4]

    def data_flow(self):
        threads = []
        # remove duplicates and repair the freshly extracted entries
        self.pc.pre_process()
        entrys = self.dc.diff_tables(self.table_old, self.table_new,
                                     self.db_old, self.db_new)
        # insert the newly arrived entries into the database
        self.db.batch_insert(entrys, self.table_old, self.db_old)
        if self.test_on:
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, self.table_old, DB_test))
            t.start()
            threads.append(t)
        if self.dmg_on:
            # submit the updates to dmg
            self.sb.deal_submit_req(entrys)
            # get all the data back from dmg
            entrys = self.dd.get_entrys()
            self.db.recr_table(TB_DMG, DB_1, TB_TEMPLATE, DB_1)
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, TB_DMG, DB_1))
            t.start()
            threads.append(t)
        else:
            entrys = self.db.get_results_by_clms(columns='*', table=self.table_old,
                                                 database=self.db_old, isdict=True)
        entrys = sorted(entrys, key=self.sort_func, reverse=True)
        # serialize entries to protobuffers
        entrys = self.sg.serilize_entrys(entrys)
        # remote call: word segmentation
        entrys = self.sg.word_seg_list(entrys)
        # remote call: indexing
        self.id.index_list_string(entrys)
        # wait until all background inserts are done
        for t in threads:
            t.join()

    def sort_func(self, entry):
        # Sort by meeting time for type 1 entries, otherwise by release date;
        # entries without a usable time sort last.
        info_type = entry.get(INFO_TYPE)
        if info_type == 1:
            time = entry.get(MEETING_TIME)
        else:
            time = entry.get(RELEASE_DATE)
        if time is None:
            return 0
        return get_timestamp(time)
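# Hypothetical entry point: the scheduler reads the optional old/new table and
# database names from sys.argv in __init__, so a run might look like
#   python scheduler.py refined_info <db_old> extracted_info <db_new>
# (module name and argument order assumed from the code above).
if __name__ == '__main__':
    sc = scheduler()
    sc.data_flow()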