Beispiel #1
0
    def init(self):
        """Initialize the detection module: cap concurrent tasks, crawl the
        target URL, and store the normalized, de-duplicated URL set.

        Side effects: sets ``self.urlList``; reads ``self.url`` and
        ``self.detectTM``; logs completion via ``logger``.
        """

        def get_url(requests):
            """Extract each request's URL, strip a single trailing slash,
            and return the de-duplicated results as a set.

            :param requests: iterable of request objects exposing a ``.url``
                attribute (as returned by the crawler)
            :return: set of normalized URL strings
            """
            # Drop only one trailing '/' so 'http://a/' and 'http://a'
            # collapse to the same entry; falsy urls pass through unchanged.
            return {req.url[:-1] if req.url and req.url.endswith('/') else req.url
                    for req in requests}

        self.detectTM.setMaxThreads(10)     # max number of concurrent detection tasks

        from sinbot import sinbot_start               # crawler entry point
        from settings.settings import settings as st  # crawler settings object
        # Propagate crawl depth; a value of 2 means 3 levels (0-based count).
        st.set('DEPTH_LIMIT', settings.getint('DEPTH_LIMIT'))
        reqList = sinbot_start(self.url)    # crawl the target, returns request list
        self.urlList = get_url(reqList)     # keep unique normalized URLs
        logger.info('Detect modules complete initialization...')
Beispiel #2
0
 def _initPool(self):
     """Lazily construct the shared thread pool; subsequent calls are no-ops."""
     if self._initialized:
         return
     self._maxThreads = settings.getint('THREAD_MAX') or 5
     self._queueSize = settings.getint('QUEUE_SIZE') or 200
     self._threadPool = ThreadPool(self._queueSize, self._maxThreads)
     self._initialized = True
Beispiel #3
0
 def _initPool(self):
     """Create the worker thread pool on first call; do nothing afterwards."""
     if not self._initialized:
         # Fall back to defaults when the settings are unset/zero.
         max_threads = settings.getint('THREAD_MAX') or 5
         queue_size = settings.getint('QUEUE_SIZE') or 200
         self._maxThreads = max_threads
         self._queueSize = queue_size
         self._threadPool = ThreadPool(queue_size, max_threads)
         self._initialized = True