self.q.put_nowait(url)

    async def crawl(self):
        # schedule max_tasks concurrent worker coroutines on the event loop
        workers = [
            self.loop.create_task(self.work())
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
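
    # The work() coroutine scheduled in crawl() is not part of this excerpt. A
    # minimal sketch of such a worker, assuming the crawler exposes self.q (an
    # asyncio.Queue of URLs) and a fetch(url) coroutine; the names here are
    # illustrative, not taken verbatim from the original class:
    async def work(self):
        try:
            while True:
                url = await self.q.get()      # wait for the next URL to process
                try:
                    await self.fetch(url)     # download and parse the page
                finally:
                    self.q.task_done()        # lets q.join() in crawl() return
        except asyncio.CancelledError:
            pass                              # crawl() cancels the workers on exit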


if __name__ == '__main__':
    logger = create_logging('Lianjia URL', 'logurl.log')
    write_header_csv()
    # Qionghai has no listings
    # Suzhou has no listings
    # Shijiazhuang has no listings
    # Shenyang has no listings
    # Sanya has no listings
    # Wenchang has no listings
    # Wanning has no listings
    # Haikou has no listings
    # Xi'an has no listings
    # Lingshui has no listings
    # Langfang/Yanjiao has no listings
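    # Page templates for Lianjia rental (zufang) listings; the '{}' placeholder is
    # presumably filled with a page number via str.format() before the URLs are enqueued.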
    URLs = [
        'http://bj.lianjia.com/zufang/pg{}/',
        'http://nj.lianjia.com/zufang/pg{}/',
            self.q.task_done()

    def add_url(self, url):
        if url not in self.seen_urls:
            self.seen_urls.add(url)
            self.q.put_nowait(url)

    async def crawl(self):
        # schedule max_tasks concurrent worker coroutines on the event loop
        workers = [
            self.loop.create_task(self.work())
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        await self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()


if __name__ == '__main__':
    logger = create_logging('Douban list', 'loggerlist.log')
    URL = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
    loop = asyncio.get_event_loop()
    crawler = Crawler(max_tasks=10)
    crawler.add_url(URL)
    loop.run_until_complete(crawler.crawl())
    print('Finished in {:.3f} seconds'.format(crawler.t1 - crawler.t0))
    print('Total pages crawled --->', len(crawler.seen_urls))
    crawler.close()
    loop.close()
                await self.fetch(msg.body)     # process the message payload
                msg.ack()                      # acknowledge the message with the broker
                self.queue.task_done()         # balance the matching queue.get()
        except asyncio.CancelledError:
            pass

        except Exception as e:
            self.queue.task_done()
            raise

    def run(self):
        reconnect_task = self.loop.create_task(reconnector(self.queue, 'tags'))
        process_task = [
            self.loop.create_task(self.process_msgs(self.queue))
            for _ in range(self.max_tasks)
        ]
        try:
            self.loop.run_forever()
        except KeyboardInterrupt:
            for task in process_task:
                task.cancel()
            reconnect_task.cancel()
            self.loop.run_until_complete(
                asyncio.gather(*process_task, reconnect_task, return_exceptions=True))
        self.loop.close()
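
    # reconnector() is defined elsewhere in the project; from its use here it is
    # expected to keep the broker connection alive and push incoming messages
    # (objects exposing .body and .ack(), as consumed in process_msgs() above)
    # onto self.queue. A rough sketch of that contract, where connect_and_consume()
    # and connection.is_closed() are hypothetical placeholders for the real
    # client calls, not part of the original code:
    #
    #   async def reconnector(queue, queue_name):
    #       connection = None
    #       try:
    #           while True:
    #               if connection is None or connection.is_closed():
    #                   connection = await connect_and_consume(queue, queue_name)
    #               await asyncio.sleep(1)    # poll the connection periodically
    #       except asyncio.CancelledError:
    #           if connection is not None:
    #               await connection.close()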


if __name__ == "__main__":
    crawl = Crawler(max_tasks=1)
    logger = create_logging('Douban tag', 'loggertag.log')
    crawl.run()
                msg = await queue.get()
                await self.fetch(msg.body)
                msg.ack()
                # await asyncio.sleep(5)  # optional throttle between messages
        except asyncio.CancelledError:
            pass

        except Exception:
            self.queue.task_done()

    def run(self):
        reconnect_task = self.loop.create_task(reconnector(self.queue))
        process_task = [
            self.loop.create_task(self.process_msgs(self.queue))
            for _ in range(self.max_tasks)
        ]
        try:
            self.loop.run_forever()
        except KeyboardInterrupt:
            for task in process_task:
                task.cancel()
            reconnect_task.cancel()
            self.loop.run_until_complete(
                asyncio.gather(*process_task, reconnect_task, return_exceptions=True))
        self.loop.close()


if __name__ == "__main__":
    logger = create_logging('Lianjia ITEM', 'logitem.log')
    crawl = CrawlItem(max_tasks=10)
    crawl.run()
                msg.ack()
                self.queue.task_done()
        except asyncio.CancelledError:
            pass

        except Exception as e:
            self.queue.task_done()
            raise

    def run(self):
        reconnect_task = self.loop.create_task(reconnector(
            self.queue, 'items'))
        process_task = [
            self.loop.create_task(self.process_msgs(self.queue))
            for _ in range(self.max_tasks)
        ]
        try:
            self.loop.run_forever()
        except KeyboardInterrupt:
            for task in process_task:
                task.cancel()
            reconnect_task.cancel()
            self.loop.run_until_complete(
                asyncio.gather(*process_task, reconnect_task, return_exceptions=True))
        self.loop.close()


if __name__ == "__main__":
    crawl = Crawler(max_tasks=10)
    logger = create_logging('Douban item', 'loggeritem.log')
    crawl.run()