def main():
    asyncio.set_event_loop(None)
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
    else:
        loop = asyncio.new_event_loop()
    sslctx = None
    if args.tls:
        import ssl
        # TODO: take cert/key from args as well.
        here = os.path.join(os.path.dirname(__file__), '..', 'tests')
        sslctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        sslctx.options |= ssl.OP_NO_SSLv2
        sslctx.load_cert_chain(
            certfile=os.path.join(here, 'ssl_cert.pem'),
            keyfile=os.path.join(here, 'ssl_key.pem'))
    cache = Cache(loop)
    task = asyncio.streams.start_server(cache.handle_client,
                                        args.host, args.port,
                                        ssl=sslctx, loop=loop)
    svr = loop.run_until_complete(task)
    for sock in svr.sockets:
        logging.info('socket %s', sock.getsockname())
    loop.run_forever()
def main():
    if '--iocp' in sys.argv:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        set_event_loop(loop)
    else:
        loop = get_event_loop()
    try:
        body = loop.run_until_complete(fetch(sys.argv[1], '-v' in sys.argv))
    finally:
        loop.close()
    sys.stdout.buffer.write(body)
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() if not args.roots: r = redis.StrictRedis(host='localhost', port=6379, db=0) data = [r.blpop('queue:urls_to_crawl')] # data.append(r.blpop('queue:urls_to_crawl')) # data.append(r.blpop('queue:urls_to_crawl')) roots, scrape_data = init_data(data) s = None #Scraper(scrape_data) else: roots = {fix_url(root) for root in args.roots} s = None crawler = crawling.Crawler(roots, scraper=s, data_handler=None, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) ########## REPORTING crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main():
    global args
    args = ARGS.parse_args()
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        set_event_loop(loop)
    else:
        loop = get_event_loop()
    try:
        loop.run_until_complete(start(loop, args.host, args.port))
    finally:
        loop.close()
def test(self):
    """ Play PyChess-PyChess 1 min variant games """
    if sys.platform == "win32":
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    loop = asyncio.get_event_loop()
    loop.set_debug(enabled=True)

    for vari in PYCHESS_VARIANTS:
        variant = variants[vari]

        def coro():
            self.p0 = yield from discoverer.initEngine(self.engine, WHITE, False)
            self.p1 = yield from discoverer.initEngine(self.engine, BLACK, False)
        loop.run_until_complete(coro())

        def optionsCallback(engine):
            engine.setOptionVariant(variant)
            engine.setOptionStrength(1, False)
            engine.setOptionTime(60, 0, 0)
        self.p0.connect("readyForOptions", optionsCallback)
        self.p1.connect("readyForOptions", optionsCallback)

        def coro(variant):
            self.game = GameModel(TimeModel(60, 0), variant)
            self.game.setPlayers([self.p0, self.p1])

            def on_game_end(game, state, event):
                event.set()
            event = asyncio.Event()
            self.game.connect("game_ended", on_game_end, event)

            self.p0.prestart()
            self.p1.prestart()

            if self.game.variant.need_initial_board:
                for player in self.game.players:
                    player.setOptionInitialBoard(self.game)
            print(variant.name)

            self.game.start()

            yield from event.wait()

            pgn = StringIO()
            print(save(pgn, self.game))

            self.assertIsNone(self.p0.invalid_move)
            self.assertIsNone(self.p1.invalid_move)

        loop.run_until_complete(coro(variant))
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, proxy=args.proxy, loop=loop, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: now = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S') with open("{}.log".format(now), "w") as f: reporting.report(crawler, file=f) # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() if not args.roots: r = redis.StrictRedis(host="localhost", port=6379, db=0) data = [r.blpop("queue:urls_to_crawl")] # data.append(r.blpop('queue:urls_to_crawl')) # data.append(r.blpop('queue:urls_to_crawl')) roots, scrape_data = init_data(data) s = None # Scraper(scrape_data) else: roots = {fix_url(root) for root in args.roots} s = None crawler = crawling.Crawler( roots, scraper=s, data_handler=None, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print("\nInterrupted\n") finally: reporting.report(crawler) ########## REPORTING crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] # ERROR=0 WARN=1 越小越严重 logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) # 以下条件语句内区分了不同的循环方式,IOCP,select等,涉及系统底层socket操作,代码层面略。 if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: # 效率较低 loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # 默认循环方式 roots = {fix_url(root) for root in args.roots} # args.roots is a list crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() # 清理内存 print('\nInterrupted\n') finally: reporting.report(crawler) # 打印爬取结果,或输出结果到文件 crawler.close() # aiohttp loop close # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() # clean up process loop.close() # 移除signal处理器
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() print(args) if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels)-1)]) #实际未打印结果 if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} #创建crawler实例时,实现了add_url操作 crawler = Crawler(roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup #loop.stop() #loop.run_forever() loop.close()
def main():
    '''
    Main program.
    Parse arguments, set up event loop, run crawler, print report.
    '''
    args = ARGS.parse_args()

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    if args.out:
        f = open(args.out, 'w')
    else:
        f = None

    roots = {fix_url(root) for root in args.roots}

    try:
        loop.run_until_complete(
            run_crawler(loop=loop,
                        roots=roots,
                        exclude=args.exclude,
                        strict=args.strict,
                        max_redirect=args.max_redirect,
                        max_tries=args.max_tries,
                        max_tasks=args.max_tasks,
                        login_url=args.login_url,
                        login_data=args.login_data,
                        file=f))
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        loop.stop()
        loop.run_forever()
        loop.close()
        if f is not None:
            f.close()
def get_loop(select=False):
    if os.name == 'nt':
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
    elif select:
        loop = asyncio.SelectorEventLoop()
    else:
        loop = asyncio.get_event_loop()
    return loop
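# A minimal usage sketch (not part of the snippets above): how a helper like
# get_loop() is typically consumed -- pick the loop once, install it, run the
# top-level coroutine, and always close the loop in a finally block. The work()
# coroutine below is a hypothetical placeholder, not code from the examples.
import asyncio
import os


def get_loop(select=False):
    # Same selection logic as the helper above: IOCP proactor on Windows,
    # selector loop when requested, otherwise the default loop.
    if os.name == 'nt':
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
    elif select:
        loop = asyncio.SelectorEventLoop()
    else:
        loop = asyncio.get_event_loop()
    return loop


async def work():
    # hypothetical stand-in for real work (fetching, crawling, ...)
    await asyncio.sleep(0.1)
    return 'done'


if __name__ == '__main__':
    loop = get_loop()
    asyncio.set_event_loop(loop)  # so library code calling get_event_loop() sees the same loop
    try:
        print(loop.run_until_complete(work()))
    finally:
        loop.close()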
def main():
    asyncio.set_event_loop(None)
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
    else:
        loop = asyncio.new_event_loop()
    sslctx = None
    if args.tls:
        sslctx = test_utils.dummy_ssl_context()
    cache = CacheClient(args.host, args.port, sslctx=sslctx, loop=loop)
    try:
        loop.run_until_complete(
            asyncio.gather(
                *[testing(i, cache, loop) for i in range(args.ntasks)],
                loop=loop))
    finally:
        loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return log = Logger(args.level) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = Crawler(log, roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, max_pool=args.max_pool, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: crawler.report() crawler.close() loop.close()
def main(loop=None):
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    out_loop = True if loop else False

    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    root_path = args.roots[0]
    router = routeing.get_router()
    logging.debug('resume the loop')
    router.resume(loop)
    router.add_root_path(root_path)
    logging.debug(f'set up router with root_path {root_path}')

    if not loop:
        if args.iocp:
            from asyncio.windows_events import ProactorEventLoop
            loop = ProactorEventLoop()
            asyncio.set_event_loop(loop)
        elif args.select:
            loop = asyncio.SelectorEventLoop()
            asyncio.set_event_loop(loop)
        else:
            loop = asyncio.get_event_loop()
        loop.set_exception_handler(exception_handler)

    roots = {fix_url(root) for root in args.roots}

    crawler = crawling.Crawler(roots,
                               exclude=args.exclude,
                               strict=args.strict,
                               max_redirect=args.max_redirect,
                               max_tries=args.max_tries,
                               max_tasks=args.max_tasks,
                               loop=loop,
                               )
    if out_loop:
        asyncio.ensure_future(crawler.crawl(), loop=loop)
        asyncio.ensure_future(do_async_report(router, crawler), loop=loop)
    else:
        try:
            loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
        except KeyboardInterrupt:
            # sys.stderr.flush()
            pass
        finally:
            reporting.report(crawler)
            print('ALL DONE NOW')
def main():
    global args
    args = ARGS.parse_args()
    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        set_event_loop(loop)
    else:
        loop = get_event_loop()
    try:
        loop.run_until_complete(start(loop, args))
    finally:
        loop.close()
def main(): "Parse arguments, set up event loop, run crawler and print a report." levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] if args["--quiet"]: logging.basicConfig(level=levels[0]) else: logging.basicConfig(level=levels[int(args["--verbose"])]) # Not sure how to set --strict to True by default with docopts. So this is # where we handle strict vs lenient. if args["--lenient"]: args["--strict"] = False else: args["--strict"] = True if args["--iocp"]: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args["--select"]: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() # Set comprehension to avoid redundancy. roots = {fix_url(root) for root in args["<root>"]} # Instantiating the crawler with our arguments. crawler = crawling.Crawler(roots, exclude=args["--exclude"], strict=args["--strict"], max_redirect=int(args["--max-redirect"]), max_tries=int(args["--max-tries"]), max_tasks=int(args["--max-tasks"]), max_pool=int(args["--max-pool"]) ) # "And this is where the magic happens." try: loop.run_until_complete(crawler.crawl()) except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() loop.close()
def setUp(self):
    if sys.platform == "win32":
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    self.loop = asyncio.get_event_loop()
    self.loop.set_debug(enabled=True)

    widgets = uistuff.GladeWidgets("PyChess.glade")
    gamewidget.setWidgets(widgets)
    perspective_manager.set_widgets(widgets)

    self.welcome_persp = Welcome()
    perspective_manager.add_perspective(self.welcome_persp)

    self.games_persp = Games()
    perspective_manager.add_perspective(self.games_persp)
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return log = Logger(args.level) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = Crawler( log, roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, max_pool=args.max_pool, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: crawler.report() crawler.close() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels)-1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler(roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() loop.close()
    # get and print lines from stdout, stderr
    timeout = None
    while registered:
        done, pending = yield from asyncio.wait(
            registered, timeout=timeout,
            return_when=asyncio.FIRST_COMPLETED)
        if not done:
            break
        for f in done:
            stream = registered.pop(f)
            res = f.result()
            print(name[stream], res.decode('ascii').rstrip())
            if res != b'':
                registered[asyncio.Task(stream.readline())] = stream
        timeout = 0.0

    stdout_transport.close()
    stderr_transport.close()


if __name__ == '__main__':
    if sys.platform == 'win32':
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main(loop))
    finally:
        loop.close()
def run():
    setup_log(logging.INFO, os.path.join(os.path.abspath('.'), 'logs', 'look_ua.log'))
    source_urls = [
        ('https://www.look.com.ua/love/page/{}/', 42),
        # ('https://www.look.com.ua/spring/page/{}/', 94),
        # ('https://www.look.com.ua/autumn/page/{}/', 99),
        # ('https://www.look.com.ua/hi-tech/page/{}/', 114),
        # ('https://www.look.com.ua/summer/page/{}/', 119),
        # ('https://www.look.com.ua/newyear/page/{}/', 156),
        # ('https://www.look.com.ua/men/page/{}/', 157),
        # ('https://www.look.com.ua/holidays/page/{}/', 159),
        # ('https://www.look.com.ua/creative/page/{}/', 168),
        # ('https://www.look.com.ua/winter/page/{}/', 172),
        # ('https://www.look.com.ua/situation/page/{}/', 172),
        # ('https://www.look.com.ua/music/page/{}/', 184),
        # ('https://www.look.com.ua/food/page/{}/', 211),
        # ('https://www.look.com.ua/weapon/page/{}/', 217),
        # ('https://www.look.com.ua/aviation/page/{}/', 261),
        # ('https://www.look.com.ua/textures/page/{}/', 267),
        # ('https://www.look.com.ua/minimalism/page/{}/', 278),
        # ('https://www.look.com.ua/movies/page/{}/', 280),
        # ('https://www.look.com.ua/3d/page/{}/', 286),
        # ('https://www.look.com.ua/abstraction/page/{}/', 293),
        # ('https://www.look.com.ua/space/page/{}/', 302),
        # ('https://www.look.com.ua/sport/page/{}/', 307),
        # ('https://www.look.com.ua/mood/page/{}/', 422),
        # ('https://www.look.com.ua/flowers/page/{}/', 595),
        # ('https://www.look.com.ua/macro/page/{}/', 636),
        # ('https://www.look.com.ua/travel/page/{}/', 674),
        # ('https://www.look.com.ua/fantasy/page/{}/', 687),
        # ('https://www.look.com.ua/anime/page/{}/', 694),
        # ('https://www.look.com.ua/games/page/{}/', 720),
        # ('https://www.look.com.ua/other/page/{}/', 778),
        # ('https://www.look.com.ua/animals/page/{}/', 1103),
        # ('https://www.look.com.ua/landscape/page/{}/', 1140),
        # ('https://www.look.com.ua/nature/page/{}/', 1142),
        # ('https://www.look.com.ua/auto/page/{}/', 1559),
        # ('https://www.look.com.ua/girls/page/{}/', 9266),
    ]

    if sys.platform == 'win32':
        loop = ProactorEventLoop()
    else:
        loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)

    redis_key = 'look_ua'
    crawler = Crawler(redis_key, max_tasks=1000, store_path='D:\\download\\')
    for info in source_urls:
        for i in range(1, info[1] + 1):
            json_data = {
                'url': info[0].format(i),
                'type_': 'text',
                'operate_func': 'parse_detail_task',
            }
            crawler.insert_task(json.dumps(json_data))
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        logging.warning('\nInterrupted\n')
    finally:
        loop.run_until_complete(crawler.close())
        loop.stop()
        loop.run_forever()
        loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return global config global headers config = configparser.ConfigParser() config.read('client_app.ini') headers = {"User-Agent": config['client']['user-agent']} # @todo: figure out what to do with these. Currently just for creating the auth URL scopes = [ 'publicData', 'characterContactsRead', 'characterFittingsRead', 'characterLocationRead' ] if args.auth: id = bytes("{}:{}".format(config['client']['Key'], config['client']['secret']), encoding="utf-8") headers.update({ "Authorization": b"Basic " + base64.b64encode(id), "Content-Type": "application/x-www-form-urlencoded" }) if config['client'].get('refresh', None) and not args.invalid: print("Using Refresh token to login") # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=refresh_token&refresh_token={}".format( config['client']['refresh']), headers=headers).json() headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) else: def handleLogin(httpd, parts): # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post( 'https://login.eveonline.com/oauth/token', data="grant_type=authorization_code&code={}".format( parts['code'][0]), headers=headers).json() config["client"]["refresh"] = r['refresh_token'] with open('client_app.ini', 'w') as configfile: config.write(configfile) headers.update( {"Authorization": "Bearer {}".format(r['access_token'])}) httpd.stop() httpd = StoppableHTTPServer(('', 6789), AuthHandler) url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format( "+".join(scopes), config['client']['key']) print("Please go here to authenticate: \n {}".format(url)) httpd.serve(handleLogin) levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler( roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, headers=headers, follow_pages=args.follow_pages, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() if not args.roots: print('Use --help for command line help') return global config global headers config = configparser.ConfigParser() config.read('client_app.ini') headers = { "User-Agent": config['client']['user-agent'] } # @todo: figure out what to do with these. Currently just for creating the auth URL scopes = [ 'publicData', 'characterContactsRead', 'characterFittingsRead', 'characterLocationRead' ] if args.auth: id = bytes("{}:{}".format(config['client']['Key'], config['client']['secret']), encoding="utf-8") headers.update({ "Authorization": b"Basic " + base64.b64encode(id), "Content-Type": "application/x-www-form-urlencoded" }) if config['client'].get('refresh', None) and not args.invalid: print("Using Refresh token to login") # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post('https://login.eveonline.com/oauth/token', data="grant_type=refresh_token&refresh_token={}".format(config['client']['refresh']), headers=headers).json() headers.update({"Authorization": "Bearer {}".format(r['access_token'])}) else: def handleLogin(httpd, parts): # do requests here to get auth/refresh code and stick them in config (save maybe?) r = requests.post('https://login.eveonline.com/oauth/token', data="grant_type=authorization_code&code={}".format(parts['code'][0]), headers=headers).json() config["client"]["refresh"] = r['refresh_token'] with open('client_app.ini', 'w') as configfile: config.write(configfile) headers.update({"Authorization": "Bearer {}".format(r['access_token'])}) httpd.stop() httpd = StoppableHTTPServer(('', 6789), AuthHandler) url = "https://login.eveonline.com/oauth/authorize/?response_type=code&scope={}&redirect_uri=http://localhost:6789/&client_id={}".format("+".join(scopes), config['client']['key']) print("Please go here to authenticate: \n {}".format(url)) httpd.serve(handleLogin) levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels)-1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} crawler = crawling.Crawler(roots, exclude=args.exclude, strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, headers=headers, follow_pages=args.follow_pages, ) try: loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main(): """Main program. Parse arguments, set up event loop, run crawler, print report. """ args = ARGS.parse_args() # if not args.roots: # print('Use --help for command line help') # return levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG] logging.basicConfig(level=levels[min(args.level, len(levels) - 1)]) if args.iocp: from asyncio.windows_events import ProactorEventLoop loop = ProactorEventLoop() asyncio.set_event_loop(loop) elif args.select: loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) else: loop = asyncio.get_event_loop() roots = {fix_url(root) for root in args.roots} roots = {'https://sauna.ru/'} crawler = crawling.Crawler( roots, exclude='\/(review|news|addFavorite|panorama|comment)', strict=args.strict, max_redirect=args.max_redirect, max_tries=args.max_tries, max_tasks=args.max_tasks, scrape_nonhtml=False, ) try: loop.run_until_complete(crawler.dbconnect()) loop.run_until_complete(crawler.crawl()) # Crawler gonna crawl. except KeyboardInterrupt: sys.stderr.flush() print('\nInterrupted\n') finally: reporting.report(crawler) crawler.close() # next two lines are required for actual aiohttp resource cleanup loop.stop() loop.run_forever() loop.close()
def main():
    setup_log(logging.INFO, os.path.join(os.path.abspath('.'), 'logs', 'look_ua.log'))
    source_urls = [
        # ('https://www.look.com.ua/love/page/{}/', 42),
        # ('https://www.look.com.ua/spring/page/{}/', 94),
        # ('https://www.look.com.ua/autumn/page/{}/', 99),
        # ('https://www.look.com.ua/hi-tech/page/{}/', 114),
        # ('https://www.look.com.ua/summer/page/{}/', 119),
        # ('https://www.look.com.ua/newyear/page/{}/', 156),
        # ('https://www.look.com.ua/men/page/{}/', 157),
        # ('https://www.look.com.ua/holidays/page/{}/', 159),
        # ('https://www.look.com.ua/creative/page/{}/', 168),
        # ('https://www.look.com.ua/winter/page/{}/', 172),
        # ('https://www.look.com.ua/situation/page/{}/', 172),
        # ('https://www.look.com.ua/music/page/{}/', 184),
        # ('https://www.look.com.ua/food/page/{}/', 211),
        # ('https://www.look.com.ua/weapon/page/{}/', 217),
        # ('https://www.look.com.ua/aviation/page/{}/', 261),
        # ('https://www.look.com.ua/textures/page/{}/', 267),
        # ('https://www.look.com.ua/minimalism/page/{}/', 278),
        # ('https://www.look.com.ua/movies/page/{}/', 280),
        # ('https://www.look.com.ua/3d/page/{}/', 286),
        # ('https://www.look.com.ua/abstraction/page/{}/', 293),
        # ('https://www.look.com.ua/space/page/{}/', 302),
        # ('https://www.look.com.ua/sport/page/{}/', 307),
        # ('https://www.look.com.ua/mood/page/{}/', 422),
        # ('https://www.look.com.ua/flowers/page/{}/', 595),
        # ('https://www.look.com.ua/macro/page/{}/', 636),
        # ('https://www.look.com.ua/travel/page/{}/', 674),
        # ('https://www.look.com.ua/fantasy/page/{}/', 687),
        # ('https://www.look.com.ua/anime/page/{}/', 694),
        # ('https://www.look.com.ua/games/page/{}/', 720),
        # ('https://www.look.com.ua/other/page/{}/', 778),
        # ('https://www.look.com.ua/animals/page/{}/', 1103),
        # ('https://www.look.com.ua/landscape/page/{}/', 1140),
        # ('https://www.look.com.ua/nature/page/{}/', 1142),
        # ('https://www.look.com.ua/auto/page/{}/', 1559),
        # ('https://www.look.com.ua/girls/page/{}/', 9266),
    ]

    # loop = asyncio.get_event_loop()
    loop = ProactorEventLoop()
    asyncio.set_event_loop(loop)

    crawler = Crawler(max_tries=5, max_tasks=30)
    for info in source_urls:
        for i in range(1, info[1] + 1):
            json_data = {
                'url': info[0].format(i),
            }
            crawler.insert_task(crawler.start_page_key, json.dumps(json_data))
    try:
        loop.run_until_complete(crawler.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        logging.warning('\nInterrupted\n')
    finally:
        loop.run_until_complete(crawler.close())

        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()

        loop.close()