"""Command-line entry point: parse options, queue index pages, run the spider."""
import argparse
import asyncio

import aiohttp

import PageParse
from Spider import Request
from Spider import Spider

# Command-line options controlling how much is crawled and where it is saved.
ARGS = argparse.ArgumentParser(description="caoliu spider")
ARGS.add_argument("--pages", action='store', type=int, default=1,
                  help='Limit page to spider')
ARGS.add_argument("--max_tries", action='store', type=int, default=30,
                  help='Limit retries on network errors')
ARGS.add_argument("--root_dir", action='store', default='./download',
                  help='directory store picture and torrent')
ARGS.add_argument("--max_tasks", action='store', type=int, default=20,
                  help='Limit concurrent connections')

# NOTE(review): not referenced anywhere in the visible code (--root_dir is
# what gets passed on); kept in case another module reads it.
ROOT_DIR = "/media/mosaic/软件/git-myspider/cl_spider/source/"

args = ARGS.parse_args()
loop = asyncio.get_event_loop()
spider = Spider(max_tries=args.max_tries, max_tasks=args.max_tasks)
try:
    # Queue index pages 1..args.pages (the end bound passed on is exclusive).
    PageParse.start(spider, 1, args.pages + 1, root_dir=args.root_dir)
    loop.run_until_complete(spider.spider())
finally:
    # Release resources even when the crawl aborts with an exception.
    spider.close()
    # stop() followed by run_forever() makes the loop do one final pass, so
    # callbacks that are already scheduled still run before close().
    loop.stop()
    loop.run_forever()
    loop.close()
# NOTE(review): the two methods below belong to a Request subclass whose
# ``class`` line is outside this view; re-indent them under that header.
def __init__(self, spider, url, path, params=None):
    """Prepare a binary download of *url* whose payload is saved to *path*.

    Args:
        spider: Forwarded unchanged to ``Request.__init__``.
        url: Address to fetch; forwarded to ``Request.__init__``.
        path: File path that ``handle_func`` writes the payload to.
        params: Optional request parameters; forwarded to ``Request.__init__``.
    """
    self.path = path
    Request.__init__(self, spider=spider, url=url, params=params,
                     content_type="binary")


def handle_func(self, content):
    """Write *content* to ``self.path`` and print a completion message.

    Args:
        content: Bytes-like payload (the file is opened in binary mode).
            Presumably the response body handed over by the spider -- confirm
            against the ``Request`` base class.
    """
    with open(self.path, "wb") as f:
        f.write(content)
    print("%s============>下载完成!" % (self.path))


def start(spider, start_page=1, end_page=2, root_dir="./"):
    """Switch to *root_dir* and create one index-page request per page.

    Pages are processed from ``end_page - 1`` down to ``start_page``.

    Args:
        spider: Passed to every ``IndexPageRequest``.
        start_page: First page number (inclusive).
        end_page: Last page number (exclusive).
        root_dir: Working directory for the crawl; created if it is missing.
    """
    print("enter %s" % root_dir)
    # A fresh checkout has no download directory yet; chdir alone would fail.
    os.makedirs(root_dir, exist_ok=True)
    os.chdir(root_dir)
    for i in reversed(range(start_page, end_page)):
        # The instance is not kept here; presumably the request registers
        # itself with the spider on construction -- confirm in Request.
        IndexPageRequest(spider, CL_URL % i)


if __name__ == "__main__":
    # Stand-alone run: crawl the first index page with fixed settings.
    loop = asyncio.get_event_loop()
    spider = Spider(page_num=5, max_tries=30, max_tasks=20, rootDir='./source')
    try:
        request = IndexPageRequest(spider, CL_URL % 1)
        loop.run_until_complete(spider.spider())
    finally:
        # Release resources even when the crawl aborts with an exception.
        spider.close()
        # stop() followed by run_forever() makes the loop do one final pass,
        # so callbacks that are already scheduled still run before close().
        loop.stop()
        loop.run_forever()
        loop.close()