Example #1
    def __init__(self,
                 name="Simpyder",
                 gen_url=None,
                 parse=None,
                 save=None,
                 config=SimpyderConfig()):
        # Configure the session to reuse TCP connections
        self.session = requests.session()
        self.session.mount('http://', HTTPAdapter(max_retries=3))
        self.session.mount('https://', HTTPAdapter(max_retries=3))

        # Load the configuration
        self.config = config

        # Set up the main-thread logger
        self.logger = _get_logger("{} - main thread".format(name),
                                  self.config.LOG_LEVEL)

        # Assemble the user-supplied callbacks
        self.assemble(gen_url, parse, save)

        self.QUEUE_LEN = self.config.PARSE_THREAD_NUMER * 2
        self.url_queue = queue.Queue(self.QUEUE_LEN)
        self.item_queue = queue.Queue(self.QUEUE_LEN)
        self.except_queue = queue.Queue(1)
        self.queueLock = threading.Lock()
        self.threads = []
        self.name = name
        self._saving = False
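
A minimal usage sketch for this constructor, assuming the surrounding Simpyder class and the run() method shown in the other examples; the gen_url, parse, and save callbacks below are illustrative, not part of the library.

def gen_url():
    # Yield the URLs to crawl; a generator keeps memory use flat.
    for page in range(1, 4):
        yield "https://example.com/list?page={}".format(page)

def parse(response):
    # Reduce an HTTP response to an item; here just URL and status.
    return {"url": response.url, "status": response.status_code}

def save(item):
    # Persist a parsed item; printing stands in for a real store.
    print(item)

spider = Simpyder(name="demo", gen_url=gen_url, parse=parse, save=save)
spider.run()  # blocks until gen_url is exhausted and the queues drain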
Example #2
    def run(self):

        self.__apply_config()

        print("""
=======================================================
       _____ _                           __         
      / ___/(_)___ ___  ____  __  ______/ /__  _____
      \__ \/ / __ `__ \/ __ \/ / / / __  / _ \/ ___/
     ___/ / / / / / / / /_/ / /_/ / /_/ /  __/ /    
    /____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/     
                    /_/    /____/   version: {}      
=======================================================
        """.format(__VERSION__))

        self.logger.critical("Simpyder ver.{}".format(__VERSION__))
        self.logger.critical("启动爬虫任务")
        meta = {'link_count': 0, 'item_count': 0}
        start_time = datetime.datetime.now()
        meta['start_time'] = start_time
        self.meta = meta
        info_thread = threading.Thread(target=self.__get_info, name="status thread")
        info_thread.daemon = True
        info_thread.start()
        save_thread = threading.Thread(target=self.__run_save, name="save thread")
        save_thread.daemon = True
        save_thread.start()
        for i in range(self.PARSE_THREAD_NUMER):
            self.threads.append(
                self.ParseThread('{} - worker - No.{}'.format(self.name, i),
                                 self.url_queue, self.queueLock,
                                 self.get_response, self.parse, self.save,
                                 self.except_queue, self.item_queue, meta))
        for each_thread in self.threads:
            each_thread.daemon = True
            each_thread.start()
        url_gener = self.gen_url()
        for each_url in url_gener:
            # Wait for room in the queue so no generated URL is dropped
            while self.url_queue.full():
                sleep(0.1)
            self.url_queue.put(each_url)

        while not self.url_queue.empty():
            if not self.except_queue.empty():
                except_info = self.except_queue.get()
                self.logger = _get_logger(self.name)
                self.logger.error(except_info)
                break
            sleep(1)
        self.logger.critical("爬取完毕")
        self.logger.critical("合计爬取项目数:{}".format(meta["item_count"]))
        self.logger.critical("合计爬取链接数:{}".format(meta["link_count"]))
Example #3
    def __run_save(self):
        logger = _get_logger("{} - worker - SAVE".format(self.name), 'INFO')
        while True:
            if not self.item_queue.empty():
                self.save(self.item_queue.get())
                self.meta['item_count'] += 1
            else:
                sleep(0.1)
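
The poll-then-sleep loop above wakes every 0.1 s even when idle. For comparison, a standalone sketch of the same consumer written with a blocking get(), which parks the thread until an item arrives (the queue and callbacks here are assumptions, not Simpyder internals):

import queue

def run_save(item_queue, save, meta):
    while True:
        try:
            # Blocks up to 1 s; the timeout keeps the loop responsive
            # if a shutdown check is added later.
            item = item_queue.get(timeout=1)
        except queue.Empty:
            continue
        save(item)
        meta['item_count'] += 1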
Example #4
    def __init__(self, name, url_queue, queueLock, get_response, parse,
                 save, except_queue, item_queue, meta):
        threading.Thread.__init__(self, target=self.run)
        self.name = name
        self.url_queue = url_queue
        self.queueLock = queueLock
        self.get_response = get_response
        self.parse = parse
        self.save = save
        self.item_queue = item_queue
        self.except_queue = except_queue
        self.logger = _get_logger(self.name)
        self.meta = meta
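
This is the standard threading.Thread subclass pattern; note that target=self.run is redundant when run() is overridden, since start() calls run() directly. A stripped-down sketch of the idiom, independent of Simpyder:

import threading
import queue

class Worker(threading.Thread):
    def __init__(self, name, url_queue):
        # No target= needed: start() invokes the overridden run().
        super().__init__(name=name, daemon=True)
        self.url_queue = url_queue

    def run(self):
        while True:
            url = self.url_queue.get()  # blocks until a URL arrives
            print(self.name, "fetched", url)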
Example #5
    def run(self):
        self.logger = _get_logger("{}".format(self.name), self.log_level)
        print("""\033[0;32m
   _____ _  Author: Jannchie         __
  / ___/(_)___ ___  ____  __  ______/ /__  _____
  \__ \/ / __ `__ \/ __ \/ / / / __  / _ \/ ___/
 ___/ / / / / / / / /_/ / /_/ / /_/ /  __/ /
/____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/
                /_/    /____/  version: {}\033[0m """.format(__VERSION__))
        self.logger.critical("user_agent: %s" % self.user_agent)
        self.logger.critical("concurrency: %s" % self.concurrency)
        self.logger.critical("interval: %s" % self.interval)
        self.proxy_gener = self.gen_proxy()
        self.loop = asyncio.get_event_loop()
        self.loop.run_until_complete(self._run())
        self.loop.close()
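
get_event_loop() / run_until_complete() / close() is the pre-Python-3.7 loop lifecycle; the same thing is now usually spelled asyncio.run(), which creates and closes the loop itself. A sketch, with _run standing in for the spider's internal coroutine:

import asyncio

async def _run():
    # Placeholder for the spider's internal coroutine.
    await asyncio.sleep(0)

# Equivalent to: loop = asyncio.get_event_loop();
# loop.run_until_complete(_run()); loop.close()
asyncio.run(_run())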
Example #6
    def __init__(self,
                 gen_url=None,
                 parse=None,
                 save=None,
                 config=SimpyderConfig(),
                 name="Simpyder"):
        self.logger = _get_logger("{} - 主线程".format(name))
        self.assemble(gen_url, parse, save)
        self.config = config

        self.QUEUE_LEN = 1000
        self.url_queue = queue.Queue(self.QUEUE_LEN)
        self.item_queue = queue.Queue(self.QUEUE_LEN)
        self.except_queue = queue.Queue(1)
        self.queueLock = threading.Lock()
        self.threads = []
        self.name = name
Example #7
    def __run_save(self):
        logger = _get_logger("{} - worker - SAVE".format(self.name),
                             self.config.LOG_LEVEL)
        while True:
            if not self.item_queue.empty():
                try:
                    item = self.item_queue.get()
                    self._saving = True
                    # Skip items the parser marked as empty or failed
                    if item is None or item is False:
                        continue
                    item = self.save(item)
                except Exception as e:
                    self.logger.exception(e)
                logger.debug(item)
                self.meta['item_count'] += 1
            else:
                self._saving = False
                sleep(1)
Example #8
    def __get_info(self):
        log = _get_logger("{} - worker - INFO".format(self.name), 'INFO')
        history = []
        interval = 5
        while True:
            c_time = datetime.datetime.now()
            history.append(
                (c_time, self.meta['link_count'], self.meta['item_count']))
            # Keep at most one minute of samples (60 / interval == 12)
            if len(history) > 60 / interval:
                history = history[-12:]
            if (c_time - self.meta['start_time']
                ).total_seconds() % interval < 1 and len(history) > 1:
                delta_link = (history[-1][1] - history[0][1]) * 60 / \
                    (history[-1][0] - history[0][0]).total_seconds()
                delta_item = (history[-1][2] - history[0][2]) * 60 / \
                    (history[-1][0] - history[0][0]).total_seconds()
                log.info("Crawling link #{} ({}/min); {} items produced ({}/min)".format(
                    self.meta['link_count'], int(delta_link),
                    self.meta['item_count'], int(delta_item)))
            sleep(1)
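
The rate arithmetic scales the difference between the newest and oldest samples to a per-minute figure. A worked example with hypothetical numbers:

import datetime

# Two samples 25 s apart: (timestamp, link_count, item_count)
t0 = datetime.datetime(2020, 1, 1, 12, 0, 0)
t1 = t0 + datetime.timedelta(seconds=25)
history = [(t0, 100, 40), (t1, 150, 60)]

elapsed = (history[-1][0] - history[0][0]).total_seconds()    # 25.0
delta_link = (history[-1][1] - history[0][1]) * 60 / elapsed  # 120.0 links/min
delta_item = (history[-1][2] - history[0][2]) * 60 / elapsed  # 48.0 items/min
print(int(delta_link), int(delta_item))                       # 120 48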
Example #9
    def __get_info(self):
        log = _get_logger("{} - worker - INFO".format(self.name),
                          self.config.LOG_LEVEL)
        history = []
        interval = 5
        while True:
            c_time = datetime.datetime.now()
            history.append(
                (c_time, self.meta['link_count'], self.meta['item_count']))
            # Keep at most the last 60 samples (one per second)
            if len(history) > 60:
                history = history[-60:]
            if (c_time - self.meta['start_time']
                ).total_seconds() % interval < 1 and len(history) > 1:
                delta_link = (history[-interval + 1][1] - history[0][1]) * 60 / \
                    ((history[-interval + 1][0] - history[0][0]).total_seconds() + 1)
                delta_item = (history[-interval + 1][2] - history[0][2]) * 60 / \
                    ((history[-interval + 1][0] - history[0][0]).total_seconds() + 1)
                if self.config.DOWNLOAD_INTERVAL == 0:
                    load = 100
                else:
                    load = int(
                        (history[-1][1] - history[0][1]) * 60 /
                        (history[-1][0] - history[0][0]).total_seconds() /
                        (60 / (self.config.DOWNLOAD_INTERVAL /
                               self.config.PARSE_THREAD_NUMER)) * 100)
                # Status snapshot for reporting
                result = {
                    'computer_name': socket.gethostname(),
                    'spider_name': self.name,
                    'start_time': self.start_time,
                    'update_time': datetime.datetime.now(),
                    'load': load,
                    'delta_link': delta_link,
                    'delta_item': delta_item
                }
                log.info(
                    "Crawling link #{} ({}/min, load {}%); {} items produced ({}/min)".format(
                        self.meta['link_count'], int(delta_link), load,
                        self.meta['item_count'], int(delta_item)))
            sleep(1)
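
The load figure compares the measured link rate with the theoretical ceiling of 60 / (DOWNLOAD_INTERVAL / PARSE_THREAD_NUMER) links per minute. A worked example with assumed config values:

# Assumed config: 8 parse threads, 2 s download interval
PARSE_THREAD_NUMER = 8
DOWNLOAD_INTERVAL = 2

measured = 180.0  # hypothetical measured links/min
ceiling = 60 / (DOWNLOAD_INTERVAL / PARSE_THREAD_NUMER)  # 240.0 links/min
load = int(measured / ceiling * 100)
print(load)  # 75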
Example #10
    def run(self):
        self.start_time = datetime.datetime.now()
        self._finish = False
        print("""
       _____ _  Author: Jannchie         __         
      / ___/(_)___ ___  ____  __  ______/ /__  _____
      \__ \/ / __ `__ \/ __ \/ / / / __  / _ \/ ___/
     ___/ / / / / / / / /_/ / /_/ / /_/ /  __/ /    
    /____/_/_/ /_/ /_/ .___/\__, /\__,_/\___/_/     
                    /_/    /____/   version: {}      


        """.format(__VERSION__))
        self.__apply_config()

        self.logger.critical("Simpyder ver.{}".format(__VERSION__))
        self.logger.critical("启动爬虫任务")
        meta = {
            'link_count': 0,
            'item_count': 0,
            'thread_number': self.config.PARSE_THREAD_NUMER,
            'download_interval': self.config.DOWNLOAD_INTERVAL
        }
        meta['start_time'] = self.start_time
        self.meta = meta
        info_thread = threading.Thread(target=self.__get_info, name="status thread")
        info_thread.daemon = True
        info_thread.start()
        save_thread = threading.Thread(target=self.__run_save, name="save thread")
        save_thread.daemon = True
        save_thread.start()
        for i in range(self.PARSE_THREAD_NUMER):
            self.threads.append(
                self.ParseThread('{} - worker - No.{}'.format(self.name, i),
                                 self.url_queue, self.queueLock,
                                 self.get_response, self.parse, self.save,
                                 self.except_queue, self.item_queue,
                                 meta, self.config))
        for each_thread in self.threads:
            each_thread.daemon = True
            each_thread.start()
        url_gener = self.gen_url()
        for each_url in url_gener:
            self.queueLock.acquire()
            while self.url_queue.full():
                if self.queueLock.locked():
                    self.logger.debug("Queue full: {}".format(each_url))
                    self.queueLock.release()
                sleep(0.1)
            self.logger.debug("Enqueuing: {}".format(each_url))
            if self.queueLock.locked():
                self.queueLock.release()

            self.queueLock.acquire()
            self.url_queue.put(each_url)
            self.queueLock.release()

        self.logger.info("全部请求完毕,等待解析进程")
        while self.url_queue.empty() == False or self.item_queue.empty(
        ) == False or self._saving == True:
            if self.except_queue.empty() == False:
                except_info = self.except_queue.get()
                self.logger = _get_logger(self.name, self.config.LOG_LEVEL)
                self.logger.error(except_info)
                # for each_thread in self.threads:
                #     each_thread.join()
                break
            pass
        self.logger.critical("全部解析完毕,等待保存进程")
        self._finish = True
        save_thread.join()
        self.logger.critical("合计爬取项目数:{}".format(meta["item_count"]))
        self.logger.critical("合计爬取链接数:{}".format(meta["link_count"]))