class DBTestCase(unittest.TestCase):

    def setUp(self):
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)

    def tearDown(self):
        self.db = None

    def test_get_max_group_id(self):
        res = self.db.get_max_group_id('refined_info_2', 'xyzb')
        res_2 = self.db.get_max_group_id('info_template', 'xyzb')
        print res
        print res_2

    def test_batch_insert(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:12056]
        table = 'info_tmp1'
        self.db.recr_table(table, DB_1, 'info_template', DB_1)
        self.db.batch_insert(entrys, table, DB_1)

    def test_batch_get(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:2]
        for key, value in entrys[1].items():
            print key, value, type(value)
class DMGTestCase(unittest.TestCase):

    def setUp(self):
        self.sb = submitter()
        self.dd = data_drawer()
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)

    def tearDown(self):
        self.sb = None
        self.dd = None
        self.db = None

    def test_submit_samples(self):
        failed_ids = self.sb.submit_to_RB(self.samples)
        self.assertTrue(len(self.samples) > 0)
        print failed_ids

    def test_submit_large_entrys(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:]
        failed_ids = self.sb.submit_to_RB(entrys)
        self.assertTrue(len(self.samples) > 0)
        print failed_ids

    def test_new_added_fields(self):
        for entry in self.samples:
            entry['last_mod_time'] = 10086
            entry['group_id'] = 87217
            entry['collegeName'] = 'ouyangming'
        failed_ids = self.sb.submit_to_RB(self.samples)

    def test_retrive(self):
        entrys = self.dd.get_entrys(2000, is_full=False)
        for entry in entrys:
            #self.assertIn(entry['info_id'], info_ids)
            for key, value in entry.items():
                print key, value
            print "###############################"

    def test_submit_error_handle(self):
        error_url = "http://www.panguso.com"
        self.sb.table = 'samples'
        self.sb.db = DB_test
        self.sb.deal_submit_req(self.samples[:0], error_url)

    def test_post_error(self):
        posturl = P
class GroupTestCase(unittest.TestCase):

    def setUp(self):
        self.dc = datachecker()
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)

    def tearDown(self):
        self.dc = None
        self.samples = None

    def test_group_id(self):
        table_new = 'samples_2'
        table_old = 'info'
        entrys = self.dc.diff_tables(table_old, table_new, DB_test, DB_test)
        rm_field(INFO_ID, entrys)
        datastore.update_info_table(entrys, table_old, DB_test)
class ExtractTestCase(unittest.TestCase):

    def setUp(self):
        self.db = DBHelper()
        columns = ['recruit_title']
        self.recruit_titles = self.db.get_results_by_clms(columns, 'refined_info', DB_1)

    def tearDown(self):
        self.db = None

    def test_extract_name(self):
        prefix = '$#@&'
        for recruit_title in self.recruit_titles[:]:
            comp_name = extract.extract_compname(recruit_title[0])
            if comp_name.startswith(prefix):
                recruit_title = prefix + " " + recruit_title[0]
            print "[Origin]:%s" % recruit_title
            print "[CompNM]:%s" % comp_name
class prechecker:

    def __init__(self):
        self.filter = filter()
        self.db_helper = DBHelper()
        self.cmp_table = 'refined_list_info'
        self.table = 'extracted_info'
        self.cmp_clms = [COMPANY_NAME, MEETING_TIME, MEETING_LOCATION,
                         ORIGIN_URL, RELEASE_DATE, RECRUIT_TITLE]

    def rm_dup_list_info(self, tb_new, db_new, tb_old, db_old):
        new_list = self.db_helper.get_results_by_clms("*", tb_new, db_new, True)
        old_list = self.db_helper.get_results_by_clms("*", tb_old, db_old, True)
        old_dict = {}
        for entry in old_list:
            url = entry.get(ORIGIN_URL)
            if url is not None:
                old_dict[url] = None
        updates = []
        fields = []
        for entry in new_list:
            url = entry.get(ORIGIN_URL)
            if url in old_dict:
                continue
            updates.append(entry)
            old_dict[url] = None
            fields = entry.keys()
        # only insert when there is something new, otherwise fields stays empty
        if updates:
            self.db_helper.batch_insert(updates, "refined_list_info", DB_1, fields)
        return updates

    def repair_data(self, entrys, cmp_entrys=None):
        if cmp_entrys is None:
            # fall back to a comparison set prepared elsewhere on the instance
            cmp_entrys = self.cmp_entrys
        LOG.info("repairing Data...")
        LOG.info("Entrys to Repair size is [%s], cmp_entrys size is [%s]"
                 % (len(entrys), len(cmp_entrys)))
        cmple_info_dict = collections.defaultdict(dict)
        for entry in cmp_entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url is not None:
                cmple_info_dict[origin_url].update(entry)
        for entry in entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url in cmple_info_dict:
                for clm in cmple_info_dict[origin_url]:
                    # only fill in fields that are still empty
                    if entry.get(clm) is None:
                        entry[clm] = cmple_info_dict[origin_url][clm]
        return entrys

    def pre_process(self):
        self.rm_dup_list_info('extracted_list_info', DB_1, self.cmp_table, DB_1)
        cmp_entrys = self.db_helper.get_results_by_clms(self.cmp_clms, self.cmp_table,
                                                        DB_1, isdict=True)
        entrys = self.db_helper.get_results_by_clms("*", self.table, DB_1, isdict=True)
        entrys = self.repair_data(entrys, cmp_entrys)
        entrys = self.filter.rm_college_from_loc(entrys)
        self.db_helper.exe_sql('delete from %s.%s' % (DB_1, self.table))
        if len(entrys) > 0:
            fields = entrys[0].keys()
            self.db_helper.batch_insert(entrys, self.table, DB_1, fields)
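# Hedged usage sketch, not part of the original source: running the pre-check
# stand-alone, assuming prechecker and its DB constants are importable here.
if __name__ == "__main__":
    pc = prechecker()
    # de-duplicate the list-info table and repair/clean extracted_info in place
    pc.pre_process()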
from common.DataBaseHelper import DBHelper
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import re
import codecs

db_helper = DBHelper()
res = db_helper.get_results_by_clms('*', 'site_domain', 'xyzb_test')


def get_dict_from_file(file):
    file = codecs.open(file, 'r', 'utf-8')
    list = []
    rs = re.compile('[ ]+')
    for line in file.readlines():
        line = rs.split(line)
        list.append(line)
    file.close()
    return list


def test_match():
    # site_stats and site_domains are assumed to be prepared elsewhere in this module
    file = open('domain_college', 'w')
    for line in site_stats:
        domain = line[0].strip()
        domain_pat = re.compile(domain)
        suc = False
        for site_domain in site_domains:
            if domain_pat.search(site_domain):
                file.write("%s\t%s\n" % (domain, site_domains[site_domain]))
                suc = True
                break
                entry[key] = value
            entry_json = json.dumps(entry)
            self.dumpjson_to_file(entry_json)
        return res

    def dumpjson_to_file(self, item):
        # res = json.dumps(json_list)
        res = str(item)
        output_file = "json_" + datetime.datetime.now().strftime("%m_%d_%H%M")
        output = os.path.join(self.output_dir, output_file)
        json_file = None
        try:
            json_file = open(output, "a")
            json_file.write(res)
            json_file.write("\n")
            json_file.close()
        except IOError, e:
            LOG.error(e)
            sys.exit(-1)
        finally:
            if json_file:
                json_file.close()


JD = JsonDumper(output_dir)

if __name__ == "__main__":
    db_helper = DBHelper()
    entrys = db_helper.get_results_by_clms(columns="*", table="info_2",
                                           database="xyzb", isdict=True)[:]
    jd = JsonDumper(output_dir)
    jd.dump_entrys(entrys)
import sys
import array
reload(sys)
sys.setdefaultencoding("utf-8")

host = "10.10.211.101"
port = 9090
dw = data_drawer()
#entrys = dw.get_entrys()
mdoc = None
res = []
db_helper = DBHelper()
entrys = db_helper.get_results_by_clms(columns="*", table='refined_info', isdict=True)[:]
city_id = load_city_map(CITY_FILE_PATH)
college_id = load_college_map(COLLEGE_FILE_PATH)
missed_colleges = set()


# use protobuf to serialize entrys
def serilize_entrys(entrys):
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key, value in entry.items():
            # skip empty values
            if value is None:
                continue
class IndexTestCase(unittest.TestCase):

    def setUp(self):
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)
        self.segger = segger()
        self.indexer = indexer()
        self.dd = data_drawer()

    def tearDown(self):
        self.db = None
        self.samples = None
        self.segger = None
        self.indexer = None

    def test_convert_to_protoentry(self):
        self.segger.serilize_entrys(self.samples)

    def test_serilize_entrys(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)
        res = self.segger.serilize_entrys(entrys[:10])
        res = self.segger.word_seg_list(res)
        retdoc = merged_doc_pb2.MergedDocInfo()
        for entry in res:
            retdoc.ParseFromString(entry)
            for i in range(0, len(retdoc.terms_info.term_infos)):
                print 'term_sign:', retdoc.terms_info.term_infos[i].term_sign
                print 'term_weight:', retdoc.terms_info.term_infos[i].term_weight
            print '##########'

    def test_record_protos(self):
        file = open("proto.output", 'wb')
        res = self.segger.serilize_entrys(self.samples[3:10])
        res = self.segger.word_seg_list(res)
        print "length after segging res is ", len(res)
        retdoc = merged_doc_pb2.MergedDocInfo()
        for entry in res:
            # write a length-prefixed record: 4-byte unsigned int, then the protobuf bytes
            arr = array.array('I')
            length = len(entry)
            li = [length]
            arr.fromlist(li)
            file.write(arr.tostring())
            file.write(entry)
        file.close()

    def test_serilize_field(self):
        #entrys = self.db.get_results_by_clms(columns='*', table='info_tes',
        #                                     database=DB_2, isdict=True)
        res = self.segger.serilize_entrys(self.samples)
        res = self.segger.word_seg_list(res)

    def test_word_seg_list(self):
        protobufs = self.segger.serilize_entrys(self.samples[:2])
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_index_start_stop(self):
        protobufs = self.segger.serilize_entrys(self.samples[:2])
        #res = self.segger.word_seg_list(protobufs)
        self.indexer.transport.open()
        print self.indexer.start_index()
        print self.indexer.stop_index()
        self.indexer.transport.close()
        #print self.indexer.index_list_string(res)

    def test_index_list_string(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:]
        protobufs = self.segger.serilize_entrys(entrys)
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_split_city(self):
        conv = TypeConvertor()
        str = u"北京"
        print str.strip().split(',')
        print conv.city_id.get(str)
        ids = conv.map_cities(str)
        print ids

    def test_0_entrys_index(self):
        protobufs = self.segger.serilize_entrys([])
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_city_id(self):
        conv = TypeConvertor()
        print conv.city_id

    def test_college_id(self):
        conv = TypeConvertor()
        print conv.map_college(u"清华大学")
        self.assertEqual(1, conv.map_college(u"北京大学"))
class scheduler():
    """ To schedule the data flow """

    def __init__(self):
        self.db = DBHelper()
        self.dc = datachecker()
        self.sb = submitter()
        self.dd = data_drawer()
        self.sg = segger()
        self.id = indexer()
        self.pc = prechecker()
        self.dmg_on = DMG_ON
        self.test_on = TEST_ON
        # expects 4 command-line arguments; fall back to defaults otherwise
        if len(sys.argv) < 5:
            self.table_old = "refined_info"
            self.db_old = DB_1
            self.table_new = "extracted_info"
            self.db_new = DB_1
        else:
            self.table_old = sys.argv[1]
            self.db_old = sys.argv[2]
            self.table_new = sys.argv[3]
            self.db_new = sys.argv[4]

    def data_flow(self):
        threads = []
        self.pc.pre_process()
        # remove duplicates
        entrys = self.dc.diff_tables(self.table_old, self.table_new,
                                     self.db_old, self.db_new)
        # insert the newly arrived entrys into the database
        self.db.batch_insert(entrys, self.table_old, self.db_old)
        if self.test_on:
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, self.table_old, DB_test))
            t.start()
            threads.append(t)
        if self.dmg_on:
            # submit the updates to dmg
            self.sb.deal_submit_req(entrys)
            # get all the data back from dmg
            entrys = self.dd.get_entrys()
            self.db.recr_table(TB_DMG, DB_1, TB_TEMPLATE, DB_1)
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, TB_DMG, DB_1))
            t.start()
            threads.append(t)
        else:
            entrys = self.db.get_results_by_clms(columns='*', table=self.table_old,
                                                 database=self.db_old, isdict=True)
        entrys = sorted(entrys, key=self.sort_func, reverse=True)
        # serialize entrys to protobuffers
        entrys = self.sg.serilize_entrys(entrys)
        # remote call: word segmentation
        entrys = self.sg.word_seg_list(entrys)
        # remote call: indexing
        self.id.index_list_string(entrys)
        # wait until all threads are done
        for t in threads:
            t.join()

    def sort_func(self, entry):
        info_type = entry.get(INFO_TYPE)
        if info_type == 1:
            time = entry.get(MEETING_TIME)
        else:
            time = entry.get(RELEASE_DATE)
        if time is None:
            return 0
        else:
            return get_timestamp(time)
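# Hedged entry-point sketch, an assumption not present in the original source:
# build the scheduler (which reads its table/database arguments from sys.argv in
# __init__) and run one pass of the data flow.
if __name__ == "__main__":
    sd = scheduler()
    sd.data_flow()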