Example #1
    def init_app(self):
        ''' Initialize the service. '''
        ret = 0
        self.init_log(logging.DEBUG)
        self._logger.info("application init start.")
        Spider_MySQL_DBProcess.instance().set_db_info(db_usr="******",
                                                      db_password="******")
        ret = Spider_MySQL_DBProcess.instance().connect_to_db()
        if ret != 0:
            return ret

        Spider_Openner_Builder.instance().config_builder(
            {'http': 'http://proxy.tencent.com:8080/'})
        #Spider_Openner_Builder.instance().config_builder()

        Spider_Schedule_Manager.instance().init(80001,
                                                10,
                                                True,
                                                "E:/My.Travel/",
                                                "127.0.0.1",
                                                "root",
                                                "zxzxzx")

        return 0
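For context, a minimal driver around this kind of init_app bootstrap could look like the sketch below. The host application class is not shown on this page, so everything here except the 0/non-zero return-code convention is an assumption.

# Hypothetical host-application skeleton (Python 2 style, matching the
# Queue module used elsewhere on this page); only the return-code
# convention is taken from the example above.
import logging
import sys

class Application(object):
    def init_log(self, level):
        # Stand-in for the real init_log, which is not shown on this page.
        logging.basicConfig(level=level)
        self._logger = logging.getLogger()

    def init_app(self):
        self.init_log(logging.DEBUG)
        self._logger.info("application init start.")
        return 0  # non-zero would mean DB/proxy/scheduler setup failed

if __name__ == "__main__":
    sys.exit(Application().init_app())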
    def init(self,
             crawl_website_id,
             crawl_task_num,
             saveto_db,
             filesave_dir,
             db_ipaddress,
             db_usr,
             db_password,
             db_port=3306):
        '''
        Initialize; determines how the scheduler works.
        '''
        ret = 0
        self._db_ipaddress = db_ipaddress
        self._db_user = db_usr
        self._db_password = db_password
        self._db_port = db_port

        self._crawl_website_id = crawl_website_id

        if not os.path.exists("./%s/" % self._crawl_website_id):
            os.mkdir("./%s/" % self._crawl_website_id)

        self._saveto_db = saveto_db
        self._filesave_dir = filesave_dir

        # Use a dedicated member object per instance for thread safety.
        self._mysql_process = Spider_MySQL_DBProcess()
        self._mysql_process.set_db_info(db_ipaddress=self._db_ipaddress,
                                        db_usr=self._db_user,
                                        db_password=self._db_password,
                                        db_port=self._db_port)

        ret = self._mysql_process.connect_to_db()
        if ret != 0:
            return ret

        self._crawl_queue = Queue.Queue(
            Spider_Schedule_Manager.CRAWL_QUEUE_SIZE)

        # Handle Ctrl+C on Windows.
        signal.signal(signal.SIGINT, Spider_Schedule_Manager.exit_handler)

        self._crawl_task_num = crawl_task_num

        for _ in range(self._crawl_task_num):
            crawl_task = Crawl_Website_Thread(self.create_crawl_website(),
                                              self._crawl_queue,
                                              self._saveto_db)
            crawl_task.start()
            self._crawl_task_list.append(crawl_task)
        return 0
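init wires up a bounded producer/consumer pipeline: the readdb_*_tocrawl methods (shown further down) push rows into a Queue.Queue of size CRAWL_QUEUE_SIZE, and each Crawl_Website_Thread drains it. Crawl_Website_Thread itself does not appear on this page, so the worker below is only a sketch of that pattern under assumed names.

# Hypothetical stand-in for Crawl_Website_Thread: drain a bounded queue
# until asked to stop (Python 2; the Queue module is named queue in Python 3).
import Queue
import threading

class CrawlWorker(threading.Thread):
    def __init__(self, crawl_queue):
        threading.Thread.__init__(self)
        self._queue = crawl_queue
        self._running = True

    def run(self):
        while self._running:
            try:
                # Block briefly so the stop flag is re-checked regularly.
                item = self._queue.get(timeout=1)
            except Queue.Empty:
                continue
            # ... crawl the URL described by item ...
            self._queue.task_done()

    def stop(self):
        self._running = False

# Usage: start a worker, feed it, wait for the queue to drain, stop it.
q = Queue.Queue(512)
worker = CrawlWorker(q)
worker.start()
q.put("http://example.com/hotel/1")  # placeholder item
q.join()
worker.stop()
worker.join()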
    def create_crawl_website(self):
        '''
        Create the crawler object for the configured website.
        '''
        mysql_hdl = Spider_MySQL_DBProcess()
        mysql_hdl.set_db_info(db_ipaddress=self._db_ipaddress,
                              db_usr=self._db_user,
                              db_password=self._db_password,
                              db_port=self._db_port)
        ret = mysql_hdl.connect_to_db()
        if ret != 0:
            return None
        crawl_base = None
        if self._crawl_website_id == Spider_Website_Base.WEBSITE_ID_BOOKING:
            crawl_base = Spider_Website_Booking(mysql_hdl, self._filesave_dir)
        else:
            assert False, "unsupported website id: %s" % self._crawl_website_id
        return crawl_base
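create_crawl_website is a per-thread factory: each worker gets its own Spider_MySQL_DBProcess handle because MySQL connections are generally not safe to share across threads. If more website ids were supported, a dispatch table would keep the if/else chain flat; a self-contained sketch of that variation follows (the ids and classes are invented for illustration, not part of this project):

# Hypothetical dict-based dispatch replacing the if/else chain above.
WEBSITE_ID_BOOKING = 80001  # placeholder, mirroring the 80001 passed to init()

class BookingCrawler(object):
    ''' Stand-in for Spider_Website_Booking. '''
    def __init__(self, db_handle, filesave_dir):
        self.db_handle = db_handle
        self.filesave_dir = filesave_dir

CRAWLER_REGISTRY = {
    WEBSITE_ID_BOOKING: BookingCrawler,
    # further ids -> crawler classes would be registered here
}

def create_crawler(website_id, db_handle, filesave_dir):
    crawler_cls = CRAWLER_REGISTRY.get(website_id)
    if crawler_cls is None:
        return None  # mirrors the assert-on-unknown-id in the original
    return crawler_cls(db_handle, filesave_dir)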
Example #6
import logging
import os
import signal
import Queue  # Python 2; this module is renamed to queue in Python 3

# Spider_MySQL_DBProcess, Spider_Openner_Builder, Spider_Website_Base,
# Spider_Website_Booking and Crawl_Website_Thread come from this project's
# own modules, which are not shown on this page.


class Spider_Schedule_Manager:
    '''
    Scheduling manager for the crawler.
    '''
    instance_ = None

    # Maximum queue length.
    CRAWL_QUEUE_SIZE = 512
    # Number of rows to fetch from MySQL per query.
    ONCE_QUERY_ARRAY_SIZE = 256

    def __init__(self):
        self._db_ipaddress = ""
        self._db_user = ""
        self._db_password = ""
        self._db_port = 3306
        self._mysql_process = None

        self._crawl_website_id = Spider_Website_Base.WEBSITE_ID_BOOKING

        self._crawl_queue = None

        self._crawl_task_num = 0
        self._crawl_task_list = []

        self._filesave_dir = ""

        # Shared logger, so callers don't each have to set one up.
        self._logger = logging.getLogger()



    def init(self,
             crawl_website_id,
             crawl_task_num,
             saveto_db,
             filesave_dir,
             db_ipaddress,
             db_usr,
             db_password,
             db_port=3306):
        '''
        Initialize; determines how the scheduler works.
        '''
        ret = 0
        self._db_ipaddress = db_ipaddress
        self._db_user = db_usr
        self._db_password = db_password
        self._db_port = db_port

        self._crawl_website_id = crawl_website_id

        if not os.path.exists("./%s/" % self._crawl_website_id):
            os.mkdir("./%s/" % self._crawl_website_id)

        self._saveto_db = saveto_db
        self._filesave_dir = filesave_dir

        # Use a dedicated member object per instance for thread safety.
        self._mysql_process = Spider_MySQL_DBProcess()
        self._mysql_process.set_db_info(db_ipaddress=self._db_ipaddress,
                                        db_usr=self._db_user,
                                        db_password=self._db_password,
                                        db_port=self._db_port)

        ret = self._mysql_process.connect_to_db()
        if ret != 0:
            return ret

        self._crawl_queue = Queue.Queue(
            Spider_Schedule_Manager.CRAWL_QUEUE_SIZE)

        # Handle Ctrl+C on Windows.
        signal.signal(signal.SIGINT, Spider_Schedule_Manager.exit_handler)

        self._crawl_task_num = crawl_task_num

        for _ in range(self._crawl_task_num):
            crawl_task = Crawl_Website_Thread(self.create_crawl_website(),
                                              self._crawl_queue,
                                              self._saveto_db)
            crawl_task.start()
            self._crawl_task_list.append(crawl_task)
        return 0

    def create_crawl_website(self):
        '''
        Create the crawler object for the configured website.
        '''
        mysql_hdl = Spider_MySQL_DBProcess()
        mysql_hdl.set_db_info(db_ipaddress=self._db_ipaddress,
                              db_usr=self._db_user,
                              db_password=self._db_password,
                              db_port=self._db_port)
        ret = mysql_hdl.connect_to_db()
        if ret != 0:
            return None
        crawl_base = None
        if self._crawl_website_id == Spider_Website_Base.WEBSITE_ID_BOOKING:
            crawl_base = Spider_Website_Booking(mysql_hdl, self._filesave_dir)
        else:
            assert False, "unsupported website id: %s" % self._crawl_website_id
        return crawl_base

    def readdb_countrylist_tocrawl(self, country_id=0):
        '''
        Read the country list from the DB and queue each entry for crawling.
        '''
        start = 0
        while True:
            ret = 0
            get_list = []
            ret = self._mysql_process.list_website_country_url(get_list,
                                                               self._crawl_website_id,
                                                               country_id,
                                                               0,
                                                               0,
                                                               start,
                                                               Spider_Schedule_Manager.ONCE_QUERY_ARRAY_SIZE)

            if ret != 0:
                return ret

            len_ary = len(get_list)
            start += len_ary
            self._logger.info(
                "Country data: [%d] rows fetched so far, [%d] in this batch." %
                (start, len_ary))
            if len_ary == 0:
                self._logger.info("Country data processing done.")
                break
            for item in get_list:
                # Keep trying until the item fits into the crawl queue.
                while True:
                    try:
                        self._crawl_queue.put(item, timeout=1)
                    except Queue.Full:
                        self._logger.info(
                            "Queue is full; waited 1 second. qsize: [%d]." %
                            self._crawl_queue.qsize())
                        continue
                    else:
                        break
        return 0

    def readdb_citylist_tocrawl(self,
                                country_id=0,
                                city_id=0,
                                saveto_db=False):
        '''
        Read the city list from the DB and queue each entry for crawling.
        '''
        start = 0
        while True:
            ret = 0
            get_list = []
            ret = self._mysql_process.list_website_city_url(get_list,
                                                            self._crawl_website_id,
                                                            country_id,
                                                            city_id,
                                                            0,
                                                            start,
                                                            Spider_Schedule_Manager.ONCE_QUERY_ARRAY_SIZE)
            if ret != 0:
                return ret

            len_ary = len(get_list)
            start += len_ary
            self._logger.info(
                "City data: [%d] rows fetched so far, [%d] in this batch." %
                (start, len_ary))
            if len_ary == 0:
                self._logger.info("City data processing done.")
                break
            for item in get_list:
                # Keep trying until the item fits into the crawl queue.
                while True:
                    try:
                        self._crawl_queue.put(item, timeout=1)
                    except Queue.Full:
                        self._logger.info(
                            "Queue is full; waited 1 second. qsize: [%d]." %
                            self._crawl_queue.qsize())
                        continue
                    else:
                        break
        return 0

    def readdb_hotellist_tocrawl(self, country_id=0, city_id=0, hotel_id=0):
        '''
        Read the hotel list from the DB and queue each entry for crawling.
        '''
        start = 0
        while True:
            ret = 0
            get_list = []
            ret = self._mysql_process.list_website_hotel_url(get_list,
                                                             self._crawl_website_id,
                                                             country_id,
                                                             city_id,
                                                             hotel_id,
                                                             0,
                                                             start,
                                                             Spider_Schedule_Manager.ONCE_QUERY_ARRAY_SIZE)
            # Check the return code, as the country and city variants do.
            if ret != 0:
                return ret

            len_ary = len(get_list)
            start += len_ary
            self._logger.info(
                "Hotel data: [%d] rows fetched so far, [%d] in this batch." %
                (start, len_ary))
            if len_ary == 0:
                self._logger.info("Hotel data processing done.")
                break
            for item in get_list:
                # Keep trying until the item fits into the crawl queue.
                while True:
                    try:
                        self._crawl_queue.put(item, timeout=1)
                    except Queue.Full:
                        self._logger.info(
                            "Queue is full; waited 1 second. qsize: [%d]." %
                            self._crawl_queue.qsize())
                        continue
                    else:
                        break

        return 0

    def exit_crawl_thread(self, run_sign):
        '''
        Stop the crawl threads.
        '''
        assert (run_sign == Crawl_Website_Thread.CRAWL_SIGN_DONE_EXIT or
                run_sign == Crawl_Website_Thread.CRAWL_SIGN_FORCE_EXIT)

        for crawl_thread in self._crawl_task_list:
            crawl_thread.set_run_sign(run_sign)

        # Wait for all threads to exit.
        for crawl_thread in self._crawl_task_list:
            crawl_thread.join()
        return 0

    def wait_thread_exit(self):
        # Notify all threads, then wait for them to finish before exiting.
        self.exit_crawl_thread(Crawl_Website_Thread.CRAWL_SIGN_DONE_EXIT)
        return 0

    @staticmethod
    def exit_handler(signum, frame):
        '''
        Signal handler for exiting; on Windows this is Ctrl+C.
        '''
        Spider_Schedule_Manager.instance().exit_crawl_thread(
            Crawl_Website_Thread.CRAWL_SIGN_FORCE_EXIT)

    @staticmethod
    def instance():
        '''
        Singleton accessor.
        '''
        if Spider_Schedule_Manager.instance_ is None:
            Spider_Schedule_Manager.instance_ = Spider_Schedule_Manager()
        return Spider_Schedule_Manager.instance_
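The three readdb_*_tocrawl methods above share one offset-paging shape: fetch up to ONCE_QUERY_ARRAY_SIZE rows starting at start, advance start by however many rows came back, and stop on an empty page. A self-contained sketch of just that loop (fake_query is a stand-in, not the real list_website_*_url API):

ONCE_QUERY_ARRAY_SIZE = 256

def fake_query(start, limit, total=1000):
    # Stand-in for list_website_*_url: up to `limit` rows starting at `start`.
    end = min(start + limit, total)
    return list(range(start, end))

def iter_pages(query):
    ''' Yield one page at a time until the source returns an empty page. '''
    start = 0
    while True:
        page = query(start, ONCE_QUERY_ARRAY_SIZE)
        if not page:
            return
        start += len(page)
        yield page

for page in iter_pages(fake_query):
    print("fetched %d rows" % len(page))

Putting the class together, a typical run would obtain the singleton, call init (which connects to MySQL and spawns the worker threads), enqueue work with one of the readdb_*_tocrawl methods, then wait for a clean drain. A hedged sketch of that call sequence; the argument values are placeholders:

# Hypothetical call sequence for Spider_Schedule_Manager; values are
# placeholders and error handling beyond the 0/non-zero codes is omitted.
manager = Spider_Schedule_Manager.instance()
ret = manager.init(Spider_Website_Base.WEBSITE_ID_BOOKING,  # crawl_website_id
                   10,                                      # crawl_task_num
                   True,                                    # saveto_db
                   "./data/",                               # filesave_dir
                   "127.0.0.1",                             # db_ipaddress
                   "******",                                # db_usr
                   "******")                                # db_password
if ret == 0:
    manager.readdb_citylist_tocrawl()  # producer: push city URLs to the queue
    manager.wait_thread_exit()         # signal done-exit and join the workers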