def main():
    """Start the cache server and run its event loop forever.

    Reads command-line flags from the module-level ``args`` (host, port,
    iocp, tls).  When --tls is set, the listening socket is wrapped using
    the test certificate/key shipped in ../tests.
    """
    # Detach any default loop; the explicitly created loop below is passed
    # around by hand instead.
    asyncio.set_event_loop(None)
    if args.iocp:
        # Windows IOCP-based proactor loop.
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
    else:
        loop = asyncio.new_event_loop()
    sslctx = None
    if args.tls:
        import ssl
        # TODO: take cert/key from args as well.
        here = os.path.join(os.path.dirname(__file__), '..', 'tests')
        # NOTE(review): PROTOCOL_SSLv23 is the legacy "negotiate highest
        # supported protocol" constant (a deprecated alias of PROTOCOL_TLS
        # on modern Pythons) — confirm the minimum supported Python version
        # before modernizing.
        sslctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        sslctx.options |= ssl.OP_NO_SSLv2
        sslctx.load_cert_chain(
            certfile=os.path.join(here, 'ssl_cert.pem'),
            keyfile=os.path.join(here, 'ssl_key.pem'))
    cache = Cache(loop)
    # Bind the server; start_server returns a coroutine that yields the
    # server object once the socket is listening.
    task = asyncio.streams.start_server(cache.handle_client,
                                        args.host, args.port,
                                        ssl=sslctx, loop=loop)
    svr = loop.run_until_complete(task)
    for sock in svr.sockets:
        logging.info('socket %s', sock.getsockname())
    try:
        loop.run_forever()
    finally:
        loop.close()
def main():
    """Parse CLI arguments, pick an event loop, run the crawler, and report.

    Crawl roots come either from the command line or, when none are given,
    from one URL popped (blocking) off the Redis list ``queue:urls_to_crawl``.
    """
    args = ARGS.parse_args()

    # Map the verbosity count onto increasingly chatty log levels, clamped.
    verbosity = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=verbosity[min(args.level, len(verbosity) - 1)])

    # Select the event-loop implementation requested on the command line.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    if args.roots:
        roots = {fix_url(root) for root in args.roots}
        s = None
    else:
        # No roots on the command line: pull a queued URL from Redis instead.
        r = redis.StrictRedis(host="localhost", port=6379, db=0)
        data = [r.blpop("queue:urls_to_crawl")]
        roots, scrape_data = init_data(data)
        s = None  # Scraper(scrape_data)

    crawler = crawling.Crawler(
        roots,
        scraper=s,
        data_handler=None,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print("\nInterrupted\n")
    finally:
        reporting.report(crawler)  ########## REPORTING
        crawler.close()
        # Stop/run/close sequence is required for actual aiohttp cleanup.
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.

    When no roots are given on the command line, a single URL is pulled
    (blocking) from the Redis list ``queue:urls_to_crawl`` instead.
    """
    args = ARGS.parse_args()

    # Verbosity count -> log level, clamped to the most verbose entry.
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    # Event-loop selection: IOCP (Windows), select(), or the platform default.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    if not args.roots:
        # NOTE(review): blpop blocks indefinitely when the queue is empty.
        r = redis.StrictRedis(host='localhost', port=6379, db=0)
        data = [r.blpop('queue:urls_to_crawl')]
        # data.append(r.blpop('queue:urls_to_crawl'))
        # data.append(r.blpop('queue:urls_to_crawl'))
        roots, scrape_data = init_data(data)
        s = None  # Scraper(scrape_data)
    else:
        roots = {fix_url(root) for root in args.roots}
        s = None

    crawler = crawling.Crawler(roots,
                               scraper=s,
                               data_handler=None,
                               exclude=args.exclude,
                               strict=args.strict,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks)
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)  ########## REPORTING
        crawler.close()
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, then write the crawl
    report to a timestamped ``<timestamp>.log`` file.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    # Verbosity count -> log level, clamped to the most verbose entry.
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    # Event-loop selection: IOCP (Windows), select(), or the platform default.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
        proxy=args.proxy,
        loop=loop,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        # BUG FIX: the old format '%Y-%m-%d %H:%M:%S' contains ':' which is
        # an illegal filename character on Windows (supported via --iocp).
        now = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
        with open("{}.log".format(now), "w") as f:
            reporting.report(crawler, file=f)
        # BUG FIX: release crawler resources; every sibling entry point calls
        # crawler.close() here, and without it the aiohttp session leaks.
        crawler.close()
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()

    # ERROR=0, WARN=1, ...: smaller index means more severe / less verbose.
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    # The branches below pick different event-loop implementations (IOCP,
    # select, ...); these involve low-level OS socket machinery, so the
    # details are outside the scope of this script.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        # Generally the least efficient option; kept for compatibility.
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()  # the platform-default loop

    roots = {fix_url(root) for root in args.roots}  # args.roots is a list

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()  # flush any pending stderr output
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)  # print crawl results (or write to a file)
        crawler.close()  # close the aiohttp session
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()  # let pending callbacks finish cleanly
        loop.close()  # also removes any installed signal handlers
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.

    The crawl root is currently hard-coded to https://sauna.ru/ and any
    --roots command-line arguments are ignored.
    """
    args = ARGS.parse_args()

    # Verbosity count -> log level, clamped to the most verbose entry.
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    # Event-loop selection: IOCP (Windows), select(), or the platform default.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    # BUG FIX: removed the dead `roots = {fix_url(root) for root in
    # args.roots}` that was immediately overwritten below; it also raised
    # TypeError when --roots was omitted and args.roots defaulted to None.
    roots = {'https://sauna.ru/'}

    crawler = crawling.Crawler(
        roots,
        # Skip per-item pages that only add noise to the crawl.
        exclude='\/(review|news|addFavorite|panorama|comment)',
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
        scrape_nonhtml=False,
    )
    try:
        # Establish the database connection before crawling begins.
        loop.run_until_complete(crawler.dbconnect())
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Parse arguments, choose an event loop, run the crawler, print a report.

    Log output is written to ``crawl.log`` rather than the console.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    # Verbosity count -> log level, clamped at DEBUG.
    log_levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(filename='crawl.log',
                        level=log_levels[min(args.level, len(log_levels) - 1)])

    # Event-loop selection requested on the command line.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}
    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()
        # Stop/run/close sequence is required for actual aiohttp cleanup.
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Parse arguments, set up the event loop, and drive ``run_crawler``.

    The crawl report goes to ``args.out`` when given, otherwise to stdout.
    """
    args = ARGS.parse_args()

    # Verbosity count -> log level, clamped at DEBUG.
    level_table = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=level_table[min(args.level, len(level_table) - 1)])

    # Event-loop selection requested on the command line.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    # Optional report destination; None means run_crawler's default output.
    out_file = open(args.out, 'w') if args.out else None

    roots = {fix_url(root) for root in args.roots}
    try:
        loop.run_until_complete(run_crawler(
            loop=loop,
            roots=roots,
            exclude=args.exclude,
            strict=args.strict,
            max_redirect=args.max_redirect,
            max_tries=args.max_tries,
            max_tasks=args.max_tasks,
            login_url=args.login_url,
            login_data=args.login_data,
            file=out_file))
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        # Stop/run/close sequence lets pending callbacks finish cleanly.
        loop.stop()
        loop.run_forever()
        loop.close()
        if out_file is not None:
            out_file.close()
def main():
    """Entry point: parse args, build the crawler, run it, and report."""
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    # Verbosity count -> log level, clamped at DEBUG.
    verbosity = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=verbosity[min(args.level, len(verbosity) - 1)])

    # Event-loop selection requested on the command line.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    crawler = crawling.Crawler(
        {fix_url(root) for root in args.roots},
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        # close() is awaited here, so it must be driven by the loop.
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.

    With --auth, first performs an EVE Online OAuth login: via the saved
    refresh token when one exists (and --invalid is not set), otherwise via
    an interactive browser flow whose redirect target is served on
    localhost:6789.  The resulting Bearer token is added to the request
    headers passed to the crawler.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    # Shared with module-level helpers (e.g. the auth HTTP handler).
    global config
    global headers

    config = configparser.ConfigParser()
    config.read('client_app.ini')
    headers = {"User-Agent": config['client']['user-agent']}
    # @todo: figure out what to do with these. Currently just for creating the auth URL
    scopes = [
        'publicData', 'characterContactsRead', 'characterFittingsRead',
        'characterLocationRead'
    ]
    if args.auth:
        # NOTE(review): `id` shadows the builtin of the same name.  Also,
        # configparser option lookups are case-insensitive, so 'Key' here
        # and 'key' further down read the same option.
        id = bytes("{}:{}".format(config['client']['Key'],
                                  config['client']['secret']),
                   encoding="utf-8")
        # HTTP Basic credentials + form content type for the token endpoint.
        headers.update({
            "Authorization": b"Basic " + base64.b64encode(id),
            "Content-Type": "application/x-www-form-urlencoded"
        })
        if config['client'].get('refresh', None) and not args.invalid:
            print("Using Refresh token to login")
            # do requests here to get auth/refresh code and stick them in config (save maybe?)
            r = requests.post(
                'https://login.eveonline.com/oauth/token',
                data="grant_type=refresh_token&refresh_token={}".format(
                    config['client']['refresh']),
                headers=headers).json()
            headers.update(
                {"Authorization": "Bearer {}".format(r['access_token'])})
        else:

            def handleLogin(httpd, parts):
                """Callback run by the local auth server on redirect.

                Exchanges the authorization code for tokens, persists the
                refresh token back to client_app.ini, installs the Bearer
                header, then stops the server.
                """
                # do requests here to get auth/refresh code and stick them in config (save maybe?)
                r = requests.post(
                    'https://login.eveonline.com/oauth/token',
                    data="grant_type=authorization_code&code={}".format(
                        parts['code'][0]),
                    headers=headers).json()
                config["client"]["refresh"] = r['refresh_token']
                with open('client_app.ini', 'w') as configfile:
                    config.write(configfile)
                headers.update(
                    {"Authorization": "Bearer {}".format(r['access_token'])})
                httpd.stop()

            # Serve the OAuth redirect target until handleLogin stops it.
            httpd = StoppableHTTPServer(('', 6789), AuthHandler)
            url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format(
                "+".join(scopes), config['client']['key'])
            print("Please go here to authenticate: \n {}".format(url))
            httpd.serve(handleLogin)

    # Verbosity count -> log level, clamped at DEBUG.
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    # Event-loop selection: IOCP (Windows), select(), or the platform default.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
        headers=headers,
        follow_pages=args.follow_pages,
    )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.

    When --auth is passed, an EVE Online OAuth login happens first: a
    stored refresh token is used if available (unless --invalid), else an
    interactive flow is served on localhost:6789; the obtained Bearer token
    is merged into the crawler's request headers.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    # Module-level state shared with the auth HTTP handler machinery.
    global config
    global headers

    config = configparser.ConfigParser()
    config.read('client_app.ini')
    headers = {
        "User-Agent": config['client']['user-agent']
    }
    # @todo: figure out what to do with these. Currently just for creating the auth URL
    scopes = [
        'publicData', 'characterContactsRead', 'characterFittingsRead',
        'characterLocationRead'
    ]
    if args.auth:
        # NOTE(review): `id` shadows the builtin; configparser options are
        # case-insensitive, so 'Key' and the later 'key' are the same option.
        id = bytes("{}:{}".format(config['client']['Key'],
                                  config['client']['secret']),
                   encoding="utf-8")
        # HTTP Basic credentials + form content type for the token endpoint.
        headers.update({
            "Authorization": b"Basic " + base64.b64encode(id),
            "Content-Type": "application/x-www-form-urlencoded"
        })
        if config['client'].get('refresh', None) and not args.invalid:
            print("Using Refresh token to login")
            # do requests here to get auth/refresh code and stick them in config (save maybe?)
            r = requests.post('https://login.eveonline.com/oauth/token',
                              data="grant_type=refresh_token&refresh_token={}".format(config['client']['refresh']),
                              headers=headers).json()
            headers.update({"Authorization": "Bearer {}".format(r['access_token'])})
        else:

            def handleLogin(httpd, parts):
                """Exchange the auth code for tokens, persist the refresh
                token to client_app.ini, set the Bearer header, stop server."""
                # do requests here to get auth/refresh code and stick them in config (save maybe?)
                r = requests.post('https://login.eveonline.com/oauth/token',
                                  data="grant_type=authorization_code&code={}".format(parts['code'][0]),
                                  headers=headers).json()
                config["client"]["refresh"] = r['refresh_token']
                with open('client_app.ini', 'w') as configfile:
                    config.write(configfile)
                headers.update({"Authorization": "Bearer {}".format(r['access_token'])})
                httpd.stop()

            # Run a stoppable local HTTP server as the OAuth redirect target.
            httpd = StoppableHTTPServer(('', 6789), AuthHandler)
            url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format("+".join(scopes), config['client']['key'])
            print("Please go here to authenticate: \n {}".format(url))
            httpd.serve(handleLogin)

    # Verbosity count -> log level, clamped at DEBUG.
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels)-1)])

    # Event-loop selection: IOCP (Windows), select(), or the platform default.
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(roots,
                               exclude=args.exclude,
                               strict=args.strict,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               headers=headers,
                               follow_pages=args.follow_pages,
                               )
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporting.report(crawler)
        crawler.close()
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
def main():
    """Seed the crawler's task queue from ``source_urls`` and run it.

    Each (url_template, page_count) pair expands into one start-page task
    per page number.  All templates are currently commented out, so by
    default nothing new is queued and the crawler processes whatever tasks
    already exist.
    """
    setup_log(logging.INFO,
              os.path.join(os.path.abspath('.'), 'logs', 'look_ua.log'))

    # (url template, number of pages); uncomment a line to queue it.
    source_urls = [
        # ('https://www.look.com.ua/love/page/{}/', 42),
        # ('https://www.look.com.ua/spring/page/{}/', 94),
        # ('https://www.look.com.ua/autumn/page/{}/', 99),
        # ('https://www.look.com.ua/hi-tech/page/{}/', 114),
        # ('https://www.look.com.ua/summer/page/{}/', 119),
        # ('https://www.look.com.ua/newyear/page/{}/', 156),
        # ('https://www.look.com.ua/men/page/{}/', 157),
        # ('https://www.look.com.ua/holidays/page/{}/', 159),
        # ('https://www.look.com.ua/creative/page/{}/', 168),
        # ('https://www.look.com.ua/winter/page/{}/', 172),
        # ('https://www.look.com.ua/situation/page/{}/', 172),
        # ('https://www.look.com.ua/music/page/{}/', 184),
        # ('https://www.look.com.ua/food/page/{}/', 211),
        # ('https://www.look.com.ua/weapon/page/{}/', 217),
        # ('https://www.look.com.ua/aviation/page/{}/', 261),
        # ('https://www.look.com.ua/textures/page/{}/', 267),
        # ('https://www.look.com.ua/minimalism/page/{}/', 278),
        # ('https://www.look.com.ua/movies/page/{}/', 280),
        # ('https://www.look.com.ua/3d/page/{}/', 286),
        # ('https://www.look.com.ua/abstraction/page/{}/', 293),
        # ('https://www.look.com.ua/space/page/{}/', 302),
        # ('https://www.look.com.ua/sport/page/{}/', 307),
        # ('https://www.look.com.ua/mood/page/{}/', 422),
        # ('https://www.look.com.ua/flowers/page/{}/', 595),
        # ('https://www.look.com.ua/macro/page/{}/', 636),
        # ('https://www.look.com.ua/travel/page/{}/', 674),
        # ('https://www.look.com.ua/fantasy/page/{}/', 687),
        # ('https://www.look.com.ua/anime/page/{}/', 694),
        # ('https://www.look.com.ua/games/page/{}/', 720),
        # ('https://www.look.com.ua/other/page/{}/', 778),
        # ('https://www.look.com.ua/animals/page/{}/', 1103),
        # ('https://www.look.com.ua/landscape/page/{}/', 1140),
        # ('https://www.look.com.ua/nature/page/{}/', 1142),
        # ('https://www.look.com.ua/auto/page/{}/', 1559),
        # ('https://www.look.com.ua/girls/page/{}/', 9266),
    ]

    # BUG FIX: ProactorEventLoop exists only on Windows; the original used it
    # unconditionally and crashed elsewhere.  Mirror the platform check that
    # the sibling run() entry point already performs.
    if sys.platform == 'win32':
        loop = ProactorEventLoop()
    else:
        loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)

    crawler = Crawler(max_tries=5, max_tasks=30)
    for url_template, page_count in source_urls:
        for page in range(1, page_count + 1):
            json_data = {
                'url': url_template.format(page),
            }
            crawler.insert_task(crawler.start_page_key, json.dumps(json_data))
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        logging.warning('\nInterrupted\n')
    finally:
        loop.run_until_complete(crawler.close())
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()
        loop.close()
def run():
    """Queue page-list tasks for look.com.ua and drive the crawler.

    Each (url_template, page_count) pair expands into one 'text' task per
    page number, pushed onto the 'look_ua' Redis-backed queue.
    """
    setup_log(logging.INFO,
              os.path.join(os.path.abspath('.'), 'logs', 'look_ua.log'))

    # (url template, number of pages); uncomment a line to crawl it.
    source_urls = [
        ('https://www.look.com.ua/love/page/{}/', 42),
        # ('https://www.look.com.ua/spring/page/{}/', 94),
        # ('https://www.look.com.ua/autumn/page/{}/', 99),
        # ('https://www.look.com.ua/hi-tech/page/{}/', 114),
        # ('https://www.look.com.ua/summer/page/{}/', 119),
        # ('https://www.look.com.ua/newyear/page/{}/', 156),
        # ('https://www.look.com.ua/men/page/{}/', 157),
        # ('https://www.look.com.ua/holidays/page/{}/', 159),
        # ('https://www.look.com.ua/creative/page/{}/', 168),
        # ('https://www.look.com.ua/winter/page/{}/', 172),
        # ('https://www.look.com.ua/situation/page/{}/', 172),
        # ('https://www.look.com.ua/music/page/{}/', 184),
        # ('https://www.look.com.ua/food/page/{}/', 211),
        # ('https://www.look.com.ua/weapon/page/{}/', 217),
        # ('https://www.look.com.ua/aviation/page/{}/', 261),
        # ('https://www.look.com.ua/textures/page/{}/', 267),
        # ('https://www.look.com.ua/minimalism/page/{}/', 278),
        # ('https://www.look.com.ua/movies/page/{}/', 280),
        # ('https://www.look.com.ua/3d/page/{}/', 286),
        # ('https://www.look.com.ua/abstraction/page/{}/', 293),
        # ('https://www.look.com.ua/space/page/{}/', 302),
        # ('https://www.look.com.ua/sport/page/{}/', 307),
        # ('https://www.look.com.ua/mood/page/{}/', 422),
        # ('https://www.look.com.ua/flowers/page/{}/', 595),
        # ('https://www.look.com.ua/macro/page/{}/', 636),
        # ('https://www.look.com.ua/travel/page/{}/', 674),
        # ('https://www.look.com.ua/fantasy/page/{}/', 687),
        # ('https://www.look.com.ua/anime/page/{}/', 694),
        # ('https://www.look.com.ua/games/page/{}/', 720),
        # ('https://www.look.com.ua/other/page/{}/', 778),
        # ('https://www.look.com.ua/animals/page/{}/', 1103),
        # ('https://www.look.com.ua/landscape/page/{}/', 1140),
        # ('https://www.look.com.ua/nature/page/{}/', 1142),
        # ('https://www.look.com.ua/auto/page/{}/', 1559),
        # ('https://www.look.com.ua/girls/page/{}/', 9266),
    ]

    # ProactorEventLoop is Windows-only; use the platform default elsewhere.
    if sys.platform == 'win32':
        loop = ProactorEventLoop()
    else:
        loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)

    redis_key = 'look_ua'
    crawler = Crawler(redis_key, max_tasks=1000, store_path='D:\\download\\')

    # Expand every template into one queued task per page.
    for template, page_count in source_urls:
        for page_no in range(1, page_count + 1):
            task = {
                'url': template.format(page_no),
                'type_': 'text',
                'operate_func': 'parse_detail_task',
            }
            crawler.insert_task(json.dumps(task))

    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        logging.warning('\nInterrupted\n')
    finally:
        loop.run_until_complete(crawler.close())
        # Stop/run/close sequence lets pending callbacks finish cleanly.
        loop.stop()
        loop.run_forever()
        loop.close()