Example #1
0
class DBTestCase(unittest.TestCase):
    """Smoke tests for ``DBHelper``.

    None of the tests assert on results: they only verify that the helper
    calls complete without raising, and print values for manual inspection.
    They talk to the databases named by ``DB_test`` and ``DB_1``.
    """

    def setUp(self):
        """Create a helper and fetch all columns of the ``samples`` table."""
        self.db = DBHelper()
        self.samples = self.db.get_results_by_clms(
            columns='*', table='samples', database=DB_test, isdict=True)

    def tearDown(self):
        """Drop the helper reference created in ``setUp``."""
        self.db = None

    def test_get_max_group_id(self):
        """Print the max group id reported for two tables."""
        res = self.db.get_max_group_id('refined_info_2', 'xyzb')
        res_2 = self.db.get_max_group_id('info_template', 'xyzb')
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(res)
        print(res_2)

    def test_batch_insert(self):
        """Copy the first 12056 ``refined_info`` rows into ``info_tmp1``."""
        entrys = self.db.get_results_by_clms(
            columns='*', table='refined_info', database=DB_1,
            isdict=True)[:12056]
        table = 'info_tmp1'
        # NOTE(review): presumably recreates `table` from the
        # 'info_template' table before the insert -- confirm in DBHelper.
        self.db.recr_table(table, DB_1, 'info_template', DB_1)
        self.db.batch_insert(entrys, table, DB_1)

    def test_batch_get(self):
        """Print every field of the second ``refined_info`` row with its type.

        Raises IndexError if the query returns fewer than two rows.
        """
        entrys = self.db.get_results_by_clms(
            columns='*', table='refined_info', database=DB_1,
            isdict=True)[:2]
        for key, value in entrys[1].items():
            # Pre-format into one string so the output is space-separated on
            # Python 2 as well (print with a tuple would show the tuple repr).
            print("%s %s %s" % (key, value, type(value)))
Example #2
0
 def test_diff_tables(self):
     """Rebuild ``info_tmp`` and fill it with the diff against today's table.

     The target table is rebuilt via ``recr_table`` using ``info_template``
     from ``DB_1``, then the entries returned by
     ``datachecker.diff_tables`` are batch-inserted into it.
     """
     checker = datachecker()
     helper = DBHelper()

     # "old" side: the table that receives the diff; "new" side: its source.
     target_table, target_db = 'info_tmp', DB_test
     source_table, source_db = 'extracted_full_info_today', DB_1

     helper.recr_table(target_table, target_db, 'info_template', DB_1)
     delta = checker.diff_tables(
         table_old=target_table,
         table_new=source_table,
         db_old=target_db,
         db_new=source_db,
     )
     helper.batch_insert(delta, table=target_table, db=target_db)
Example #3
0
class scheduler():
    """Schedule the data flow.

    One pass (``data_flow``) pre-processes, diffs the new table against the
    old one, stores the new entries, optionally round-trips them through DMG,
    then serializes, word-segments and indexes the resulting entries.
    """

    def __init__(self):
        """Create the pipeline collaborators and choose the tables to work on.

        When four command-line arguments are given they are read as
        ``table_old db_old table_new db_new``; otherwise the defaults
        ``refined_info`` / ``extracted_info`` in ``DB_1`` are used.
        """
        self.db = DBHelper()
        self.dc = datachecker()
        self.sb = submitter()
        self.dd = data_drawer()
        self.sg = segger()
        self.id = indexer()
        self.pc = prechecker()

        self.dmg_on = DMG_ON
        self.test_on = TEST_ON
        # sys.argv[0] is the script name, so four arguments mean len >= 5.
        # (A "< 4" guard would let a three-argument call reach sys.argv[4]
        # and raise IndexError.)
        if len(sys.argv) < 5:
            self.table_old = "refined_info"
            self.db_old = DB_1
            self.table_new = "extracted_info"
            self.db_new = DB_1
        else:
            self.table_old = sys.argv[1]
            self.db_old = sys.argv[2]
            self.table_new = sys.argv[3]
            self.db_new = sys.argv[4]

    def data_flow(self):
        """Run one full pass of the pipeline.

        Background inserts (test database, DMG table) are started on
        threads and joined at the end, after indexing has been issued.
        """
        threads = []
        self.pc.pre_process()
        # Remove duplicates: keep only what the new table adds to the old one.
        entrys = self.dc.diff_tables(self.table_old, self.table_new,
                                     self.db_old, self.db_new)
        # Insert the newly arrived entries into the database.
        self.db.batch_insert(entrys, self.table_old, self.db_old)

        if self.test_on:
            # Mirror the same entries into the test database in the background.
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, self.table_old, DB_test))
            t.start()
            threads.append(t)

        if self.dmg_on:
            # Submit the updates to DMG.
            self.sb.deal_submit_req(entrys)
            # Get all the data back from DMG. The drawer lives on the
            # instance; a bare `dd` is not defined in this scope.
            entrys = self.dd.get_entrys()
            self.db.recr_table(TB_DMG, DB_1, TB_TEMPLATE, DB_1)
            t = threading.Thread(target=self.db.batch_insert,
                                 args=(entrys, TB_DMG, DB_1))
            t.start()
            threads.append(t)
        else:
            entrys = self.db.get_results_by_clms(columns='*',
                                                 table=self.table_old,
                                                 database=self.db_old,
                                                 isdict=True)
            # Newest first, according to sort_func.
            entrys = sorted(entrys, key=self.sort_func, reverse=True)

        # Serialize entries to protobuffers.
        entrys = self.sg.serilize_entrys(entrys)
        # Remote call: word segmentation.
        entrys = self.sg.word_seg_list(entrys)
        # Remote call: indexing.
        self.id.index_list_string(entrys)

        # Wait until all background threads are done.
        for t in threads:
            t.join()

    def sort_func(self, entry):
        """Return the sort key (a timestamp) for ``entry``.

        Entries whose ``INFO_TYPE`` is 1 are keyed by ``MEETING_TIME``, all
        others by ``RELEASE_DATE``. A missing value yields 0.
        """
        if entry.get(INFO_TYPE) == 1:
            when = entry.get(MEETING_TIME)
        else:
            when = entry.get(RELEASE_DATE)
        if when is None:
            return 0
        return get_timestamp(when)