Example #1
class DBTestCase(unittest.TestCase):
    def setUp(self):
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)

    def tearDown(self):
        self.db = None

    def test_get_max_group_id(self):
        res = self.db.get_max_group_id('refined_info_2', 'xyzb')
        res_2 = self.db.get_max_group_id('info_template', 'xyzb')
        print res
        print res_2

    def test_batch_insert(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:12056]
        table = 'info_tmp1'
        self.db.recr_table(table, DB_1, 'info_template', DB_1)
        self.db.batch_insert(entrys, table, DB_1)

    def test_batch_get(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:2]
        for key, value in entrys[1].items():
            print key, value, type(value)
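None of these examples show DBHelper itself; the following is a minimal sketch of what get_results_by_clms might look like, inferred only from the call sites above. The MySQLdb connection parameters and SQL construction are assumptions, not the project's actual implementation.

import MySQLdb
import MySQLdb.cursors

class DBHelper(object):
    def get_results_by_clms(self, columns, table, database, isdict=False):
        # '*' or a list of column names becomes the SELECT column list.
        clms = columns if isinstance(columns, basestring) else ','.join(columns)
        cursor_class = MySQLdb.cursors.DictCursor if isdict \
            else MySQLdb.cursors.Cursor
        # Hypothetical connection parameters; the real ones are not shown.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                               db=database, charset='utf8',
                               cursorclass=cursor_class)
        try:
            cur = conn.cursor()
            cur.execute("SELECT %s FROM %s" % (clms, table))
            return list(cur.fetchall())
        finally:
            conn.close()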
Example #2
class DMGTestCase(unittest.TestCase):
    def setUp(self):
        self.sb = submitter()
        self.dd = data_drawer()
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)

    def tearDown(self):
        self.sb = None
        self.dd = None
        self.db = None

    def test_submit_samples(self):
        failed_ids = self.sb.submit_to_RB(self.samples)
        self.assertTrue(len(self.samples) > 0)
        print failed_ids

    def test_submit_large_entrys(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:]
        failed_ids = self.sb.submit_to_RB(entrys)
        self.assertTrue(len(entrys) > 0)
        print failed_ids

    def test_new_added_fields(self):
        for entry in self.samples:
            entry['last_mod_time'] = 10086
            entry['group_id'] = 87217
            entry['collegeName'] = 'ouyangming'
        failed_ids = self.sb.submit_to_RB(self.samples)

    def test_retrieve(self):
        entrys = self.dd.get_entrys(2000, is_full=False)
        for entry in entrys:
            #self.assertIn(entry['info_id'], info_ids)
            for key, value in entry.items():
                print key, value
            print "###############################"

    def test_submit_error_handle(self):
        error_url = "http://www.panguso.com"
        self.sb.table = 'samples'
        self.sb.db = DB_test
        self.sb.deal_submit_req(self.samples[:0],error_url)

    def test_post_error(self):
        posturl = P
Example #3
class GroupTestCase(unittest.TestCase):
    def setUp(self):
        self.dc = datachecker()
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)

    def tearDown(self):
        self.dc = None
        self.samples = None

    def test_group_id(self):
        table_new = 'samples_2'
        table_old = 'info'
        entrys = self.dc.diff_tables(table_old, table_new, DB_test, DB_test)
        rm_field(INFO_ID, entrys)
        datastore.update_info_table(entrys, table_old, DB_test)

class ExtractTestCase(unittest.TestCase):
    def setUp(self):
        self.db = DBHelper()
        columns = ['recruit_title']
        self.recruit_titles = self.db.get_results_by_clms(columns,'refined_info',DB_1)

    def tearDown(self):
        self.db = None

    def test_extract_name(self):
        prefix = '$#@&'
        for row in self.recruit_titles[:]:
            recruit_title = row[0]
            comp_name = extract.extract_compname(recruit_title)
            if comp_name.startswith(prefix):
                recruit_title = prefix + " " + recruit_title
            print "[Origin]:%s" % recruit_title
            print "[CompNM]:%s" % comp_name
Example #5
class prechecker:

    def __init__(self):
        self.filter = filter()
        self.db_helper = DBHelper()
        self.cmp_table = 'refined_list_info'
        self.table = 'extracted_info'
        self.cmp_clms = [COMPANY_NAME, MEETING_TIME, MEETING_LOCATION,
                         ORIGIN_URL, RELEASE_DATE, RECRUIT_TITLE]

    def rm_dup_list_info(self, tb_new, db_new, tb_old, db_old):
        new_list = self.db_helper.get_results_by_clms("*", tb_new, db_new, True)
        old_list = self.db_helper.get_results_by_clms("*", tb_old, db_old, True)
        old_dict = {}

        for entry in old_list:
            url = entry.get(ORIGIN_URL)
            if url is not None:
                old_dict[url] = None
        updates = []
        for entry in new_list:
            url = entry.get(ORIGIN_URL)
            if url in old_dict:
                continue
            updates.append(entry)
            old_dict[url] = None
        # Guard against an empty new_list: taking the field list from the
        # loop variable would raise a NameError when no rows were fetched.
        if updates:
            fields = updates[0].keys()
            self.db_helper.batch_insert(updates, tb_old, db_old, fields)
        return updates


    def repair_data(self, entrys, cmp_entrys=None):
        if cmp_entrys is None:
            cmp_entrys = self.cmp_entrys
        LOG.info("repairing data...")
        LOG.info("entrys to repair size is [%s], cmp_entrys size is [%s]"
                 % (len(entrys), len(cmp_entrys)))
        cmple_info_dict = collections.defaultdict(dict)
        for entry in cmp_entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url is not None:
                cmple_info_dict[origin_url].update(entry)
        for entry in entrys:
            origin_url = entry.get(ORIGIN_URL)
            if origin_url in cmple_info_dict:
                # Fill fields missing from the entry with values from the
                # more complete entry sharing the same origin_url.
                for clm in cmple_info_dict[origin_url]:
                    if entry.get(clm) is None:
                        entry[clm] = cmple_info_dict[origin_url][clm]
        return entrys

    def pre_process(self):
        self.rm_dup_list_info('extracted_list_info', DB_1, self.cmp_table, DB_1)
        cmp_entrys = self.db_helper.get_results_by_clms(self.cmp_clms, self.cmp_table,
                                                        DB_1, isdict=True)
        entrys = self.db_helper.get_results_by_clms("*", self.table, DB_1, isdict=True)
        entrys = self.repair_data(entrys, cmp_entrys)
        entrys = self.filter.rm_college_from_loc(entrys)
        self.db_helper.exe_sql('delete from %s.%s' % (DB_1, self.table))
        if entrys:
            fields = entrys[0].keys()
            self.db_helper.batch_insert(entrys, self.table, DB_1, fields)
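A hedged driver for the class above; this is hypothetical, and DB_1 along with the column-name constants come from the project's config module, which these examples never show.

if __name__ == '__main__':
    # Run one pre-check pass: dedup the list info, repair missing fields
    # from the comparison table, then rewrite extracted_info in place.
    pc = prechecker()
    pc.pre_process()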
Example #6
from common.DataBaseHelper import DBHelper
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import re
import codecs

db_helper = DBHelper()
res = db_helper.get_results_by_clms('*','site_domain','xyzb_test')

def get_dict_from_file(path):
    # Split each whitespace-separated line of the file into a list of fields.
    f = codecs.open(path, 'r', 'utf-8')
    rows = []
    rs = re.compile('[ ]+')
    for line in f:
        rows.append(rs.split(line))
    f.close()
    return rows

def test_match():
    # site_stats and site_domains are defined elsewhere in the original
    # module; site_domains maps a domain to its college name.
    out = open('domain_college', 'w')
    for line in site_stats:
        domain = line[0].strip()
        domain_pat = re.compile(domain)
        suc = False
        for site_domain in site_domains:
            if domain_pat.search(site_domain):
                out.write("%s\t%s\n" % (domain, site_domains[site_domain]))
                suc = True
                break
    out.close()
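The snippet never defines site_stats or site_domains; a hypothetical reconstruction from the surrounding code follows. The 'domain_stats' filename and the column positions in res are assumptions.

# site_stats: whitespace-split rows, one domain pattern per line.
site_stats = get_dict_from_file('domain_stats')
# site_domains: domain -> college name, built from the site_domain rows.
site_domains = dict((row[0], row[1]) for row in res)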
Example #7
                entry[key] = value
            entry_json = json.dumps(entry)
            self.dumpjson_to_file(entry_json)
        return res

    def dumpjson_to_file(self, item):
        res = str(item)
        output_file = "json_" + datetime.datetime.now().strftime("%m_%d_%H%M")
        output = os.path.join(self.output_dir, output_file)
        json_file = None
        try:
            json_file = open(output, "a")
            json_file.write(res)
            json_file.write("\n")
        except IOError, e:
            LOG.error(e)
            sys.exit(-1)
        finally:
            # Guard: if open() failed, json_file is still None here.
            if json_file:
                json_file.close()


JD = JsonDumper(output_dir)

if __name__ == "__main__":
    db_helper = DBHelper()
    entrys = db_helper.get_results_by_clms(columns="*", table="info_2", database="xyzb", isdict=True)[:]
    jd = JsonDumper(output_dir)
    jd.dump_entrys(entrys)
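For round-tripping, here is a sketch of reading such a dump back. The helper is hypothetical, not part of the original module; it assumes one JSON object per line, as dumpjson_to_file writes them.

import json

def load_json_lines(path):
    # Read back one JSON object per line, skipping blank lines.
    entries = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries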
Example #8
import array
import sys

reload(sys)
sys.setdefaultencoding("utf-8")

host = "10.10.211.101"
port = 9090


dw = data_drawer()
#entrys = dw.get_entrys()
mdoc = None
res = []
db_helper = DBHelper()

entrys = db_helper.get_results_by_clms(columns="*",table='refined_info',isdict=True)[:]
city_id = load_city_map(CITY_FILE_PATH)
college_id = load_college_map(COLLEGE_FILE_PATH)
missed_colleges = set()

# serialize entrys with protobuf
def serilize_entrys(entrys):
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key, value in entry.items():
            # skip empty values
            if value is None:
                continue
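The example cuts off mid-loop. Judging from Example #9 below, where serilize_entrys returns a list of serialized MergedDocInfo strings, its tail presumably resembles this self-contained sketch; the setattr-based field mapping is purely an assumption.

def serilize_entrys_sketch(entrys):
    res = []
    field_names = set(f.name for f in
                      merged_doc_pb2.MergedDocInfo.DESCRIPTOR.fields)
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key, value in entry.items():
            if value is None:
                continue  # skip empty values, as in the loop above
            if key in field_names:
                setattr(mdoc, key, value)  # assumes scalar proto fields
        res.append(mdoc.SerializeToString())
    return res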
Example #9
class IndexTestCase(unittest.TestCase):
    def setUp(self):
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(columns='*', table='samples',
                                                   database=DB_test, isdict=True)
        self.segger = segger()
        self.indexer = indexer()
        self.dd = data_drawer()

    def tearDown(self):
        self.db = None
        self.samples = None
        self.segger = None
        self.indexer = None

    def test_convert_to_protoentry(self):
        self.segger.serilize_entrys(self.samples)

    def test_serilize_entrys(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)
        res = self.segger.serilize_entrys(entrys[:10])
        res = self.segger.word_seg_list(res)
        retdoc = merged_doc_pb2.MergedDocInfo()
        for entry in res:
            retdoc.ParseFromString(entry)
            for i in range(0, len(retdoc.terms_info.term_infos)):
                print 'term_sign:', retdoc.terms_info.term_infos[i].term_sign
                print 'term_weight:', retdoc.terms_info.term_infos[i].term_weight
            print '##########'

    def test_record_protos(self):
        proto_file = open("proto.output", 'wb')
        res = self.segger.serilize_entrys(self.samples[3:10])
        res = self.segger.word_seg_list(res)
        print "length after segging res is", len(res)
        for entry in res:
            # Length-prefix framing: a native unsigned int holding the
            # record size, followed by the serialized protobuf bytes.
            arr = array.array('I')
            arr.fromlist([len(entry)])
            proto_file.write(arr.tostring())
            proto_file.write(entry)
        proto_file.close()

    def test_serilize_field(self):
        #entrys = self.db.get_results_by_clms(columns='*', table='info_tes',
        #                                     database=DB_2, isdict=True)
        res = self.segger.serilize_entrys(self.samples)
        res = self.segger.word_seg_list(res)

    def test_word_seg_list(self):
        protobufs = self.segger.serilize_entrys(self.samples[:2])
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_index_start_stop(self):
        protobufs = self.segger.serilize_entrys(self.samples[:2])
        #res = self.segger.word_seg_list(protobufs)
        self.indexer.transport.open()
        print self.indexer.start_index()
        print self.indexer.stop_index()
        self.indexer.transport.close()
        #print self.indexer.index_list_string(res)

    def test_index_list_string(self):
        entrys = self.db.get_results_by_clms(columns='*', table='refined_info',
                                             database=DB_1, isdict=True)[:]
        protobufs = self.segger.serilize_entrys(entrys)
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_split_city(self):
        conv = TypeConvertor()
        city = u"北京"
        print city.strip().split(',')
        print conv.city_id.get(city)
        ids = conv.map_cities(city)
        print ids

    def test_0_entrys_index(self):
        protobufs = self.segger.serilize_entrys([])
        res = self.segger.word_seg_list(protobufs)
        self.indexer.index_list_string(res)

    def test_city_id(self):
        conv = TypeConvertor()
        print conv.city_id

    def test_college_id(self):
        conv = TypeConvertor()
        print conv.map_college(u"清华大学")
        self.assertEqual(1, conv.map_college(u"北京大学"))
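A hypothetical counterpart to test_record_protos above, not part of the original suite: read back the length-prefixed records it writes, where each record is a native unsigned-int length followed by that many serialized MergedDocInfo bytes.

def read_protos(path):
    # Inverse of the framing in test_record_protos: read the native
    # unsigned-int length, then that many serialized protobuf bytes.
    records = []
    f = open(path, 'rb')
    size = array.array('I').itemsize  # matches the writer's array('I')
    while True:
        raw = f.read(size)
        if len(raw) < size:
            break
        arr = array.array('I')
        arr.fromstring(raw)
        records.append(f.read(arr[0]))
    f.close()
    return records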
Example #10
class scheduler:
    """ To schedule the data flow
    """
    def __init__(self):
        self.db = DBHelper()
        self.dc = datachecker()
        self.sb = submitter()
        self.dd = data_drawer()
        self.sg = segger()
        self.id = indexer()
        self.pc = prechecker()

        self.dmg_on = DMG_ON
        self.test_on = TEST_ON
        # Five argv entries are needed to override the defaults:
        # the script name plus table_old, db_old, table_new, db_new.
        if len(sys.argv) < 5:
            self.table_old = "refined_info"
            self.db_old = DB_1
            self.table_new = "extracted_info"
            self.db_new = DB_1
        else:
            self.table_old = sys.argv[1]
            self.db_old = sys.argv[2]
            self.table_new = sys.argv[3]
            self.db_new = sys.argv[4]


    def data_flow(self):
        threads = []
        self.pc.pre_process()
        # remove duplicates
        entrys = self.dc.diff_tables(self.table_old, self.table_new,
                                     self.db_old, self.db_new)
        # insert the newly arrived entrys into the database
        self.db.batch_insert(entrys, self.table_old, self.db_old)

        if self.test_on:
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, self.table_old, DB_test))
            t.start()
            threads.append(t)

        if self.dmg_on:
            # submit the updates to dmg
            self.sb.deal_submit_req(entrys)
            # pull the full data set back from dmg
            entrys = self.dd.get_entrys()
            self.db.recr_table(TB_DMG, DB_1, TB_TEMPLATE, DB_1)
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, TB_DMG, DB_1))
            t.start()
            threads.append(t)
        else:
            entrys = self.db.get_results_by_clms(columns='*', table=self.table_old,
                                                 database=self.db_old, isdict=True)
            entrys = sorted(entrys, key=self.sort_func, reverse=True)
        # serialize entrys to protobufs
        entrys = self.sg.serilize_entrys(entrys)
        # remote call: word segmentation
        entrys = self.sg.word_seg_list(entrys)
        # remote call: indexing
        self.id.index_list_string(entrys)

        # wait until all threads have finished
        for t in threads:
            t.join()

    def sort_func(self, entry):
        # Meetings sort by meeting time; everything else by release date.
        info_type = entry.get(INFO_TYPE)
        if info_type == 1:
            time = entry.get(MEETING_TIME)
        else:
            time = entry.get(RELEASE_DATE)
        if time is None:
            return 0
        return get_timestamp(time)
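A hypothetical entry point for the scheduler above; the argument layout follows __init__, i.e. the script name followed by table_old, db_old, table_new, db_new.

if __name__ == '__main__':
    # e.g. python scheduler.py refined_info xyzb extracted_info xyzb
    scheduler().data_flow()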