Example #1
0
class Spider:
    '''一只小爬虫'''
    def __init__(self):
        self.rsp = Response()
        self.av_queue = queue.Queue()
        self.max_threads = 5

    def run(self, av_queue):
        tt_name = threading.current_thread().getName()
        while not av_queue.empty():
            jsurl, menu_title, movieName = self.av_queue.get()
            m3u8Url = self.rsp.get_m3u8_url(jsurl)
            log.debug("{} 开始爬取视频:{}".format(tt_name, m3u8Url))

            try:
                # log.info(finalUrl)
                M3u8Assembly().download(m3u8Url, menu_title, movieName)
            except Exception as e:
                log.error("爬取失败 {}".format(e))

    def spider_enqueue(self):
        start = datetime.datetime.now().replace(microsecond=0)
        menuList = self.rsp.get_av_menu_bar(self.rsp.get_base_url())
        for menu in menuList[0:2]:
            menu_title = menu.get("title")
            menuUrl = menu.get("href")
            log.info("{} {}".format(menu_title, menuUrl))
            avList = self.rsp.get_av_list_info(menuUrl)

            for av in avList[0:3]:
                url = self.rsp.get_av_url(av.get("href"))
                name = av.get("title")
                log.debug("爬取:{}  {}".format(menu_title, name))
                jsurl = self.rsp.get_js_url(url)
                self.av_queue.put([jsurl, menu_title.strip(), name.strip()])
        return self.av_queue

    def start(self):
        av_queue = self.spider_enqueue()
        threadPools = []
        for i in range(self.max_threads):
            t = threading.Thread(target=self.run,
                                 name='AV-' + str(i),
                                 kwargs={'av_queue': av_queue})
            threadPools.append(t)
        for t in threadPools:
            t.start()
        for t in threadPools:
            t.join()