def test_classify(self):
    # Genuine Googlebot IPs presenting a Googlebot user agent must be classified as verified bots
    for ip in ('66.249.66.145', '66.249.66.143', '66.249.66.149', '66.249.66.147'):
        for ua in ('Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                   'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'):
            assert crawlers.evaluate(ip, ua) == crawlers.VERIFIED_BOT
    # A non-Google IP presenting a Googlebot user agent must be flagged as potentially malicious
    for ip in ('128.101.175.19',):
        for ua in ('Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                   'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'):
            assert crawlers.evaluate(ip, ua) == crawlers.POTENTIAL_MALICIOUS_BOT
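# Illustrative sketch only: the test above implies that crawlers.evaluate() verifies a
# Googlebot user agent against its source IP. A common approach (and the one Google
# documents for verifying Googlebot) is a reverse DNS lookup followed by a forward
# confirmation. This standalone helper is an assumption about that technique, not the
# actual implementation behind crawlers.evaluate().
import socket

def _verify_crawler_ip(ip, allowed_suffixes=('.googlebot.com', '.google.com')):
    """Return True if the IP reverse-resolves to an allowed domain and that hostname
    resolves back to the same IP (forward-confirmed reverse DNS)."""
    try:
        hostname = socket.gethostbyaddr(ip)[0]
    except socket.herror:
        return False
    if not hostname.endswith(allowed_suffixes):
        return False
    try:
        return socket.gethostbyname(hostname) == ip
    except socket.gaierror:
        return False

# Consistent with the assertions above, _verify_crawler_ip('66.249.66.145') is expected
# to succeed while _verify_crawler_ip('128.101.175.19') is expected to fail.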
def header_whitelist():
    """
    If the request has an access token stored in the session, the client sent back the
    session cookie set by a previous request that was answered by ADS Core. The client
    is well behaved (it stores the cookies set by ADS Core) and thus we do not want to
    rate limit it. Rate limits only exist to protect us from bootstrapping thousands of
    access tokens in the database.
    """
    if 'auth' in session:
        return True
    else:
        user_agent = request.headers.get('User-Agent')
        remote_ip = get_remote_address()
        #### For testing purposes:
        #user_agent = "Googlebot"
        #remote_ip = "66.249.66.1" # crawl-66-249-66-1.googlebot.com.
        #user_agent = "DuckDuckBot"
        #remote_ip = "50.16.241.117"
        #remote_ip = "127.0.0.1"
        evaluation = crawlers.evaluate(remote_ip, user_agent)
        if evaluation in (crawlers.VERIFIED_BOT, crawlers.UNVERIFIABLE_BOT, crawlers.POTENTIAL_MALICIOUS_BOT):
            return True
    return False
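# A minimal sketch (assuming Flask-Limiter is the rate-limiting extension in use and
# that `app` is the Flask application defined elsewhere in this module) of how
# header_whitelist() could be registered so whitelisted requests bypass rate limiting.
# The constructor call follows the pre-3.x Flask-Limiter signature and the default
# limit shown is an arbitrary placeholder, not the real configuration.
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

limiter = Limiter(app, key_func=get_remote_address, default_limits=["500 per day"])

# Any request for which header_whitelist() returns True skips all rate limits
limiter.request_filter(header_whitelist)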
def ratelimit_handler(e):
    user_agent = request.headers.get('User-Agent')
    remote_ip = get_remote_address()
    evaluation = crawlers.evaluate(remote_ip, user_agent)
    if evaluation == crawlers.VERIFIED_BOT:
        app.logger.info("Rate limited a request classified as 'VERIFIED_BOT'")
    elif evaluation == crawlers.UNVERIFIABLE_BOT:
        app.logger.info("Rate limited a request classified as 'UNVERIFIABLE_BOT'")
    elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
        app.logger.info("Rate limited a request classified as 'POTENTIAL_MALICIOUS_BOT'")
    elif evaluation == crawlers.POTENTIAL_USER:
        app.logger.info("Rate limited a request classified as 'POTENTIAL_USER'")
    else:  # None
        app.logger.info("Rate limited a request not classified: '%s' - '%s'", remote_ip, user_agent)
    form = ModernForm()
    return render_template('429.html', environment=current_app.config['ENVIRONMENT'],
                           base_url=app.config['SERVER_BASE_URL'], request_path=request.path[1:],
                           form=form, code=429), 429
def ratelimit_handler(e):
    if e.description.endswith('per 1 day'):
        # ADS Core limit hit (to limit too many bootstraps)
        remote_ip = get_remote_address()
        description = "We have received too many requests from your IP ({}).".format(remote_ip)
    else:
        # API rate limit hit
        description = e.description
    user_agent = request.headers.get('User-Agent')
    remote_ip = get_remote_address()
    evaluation = crawlers.evaluate(remote_ip, user_agent)
    if evaluation == crawlers.VERIFIED_BOT:
        app.logger.info("Rate limited a request classified as 'VERIFIED_BOT'")
    elif evaluation == crawlers.UNVERIFIABLE_BOT:
        app.logger.info("Rate limited a request classified as 'UNVERIFIABLE_BOT'")
    elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
        app.logger.info("Rate limited a request classified as 'POTENTIAL_MALICIOUS_BOT'")
    elif evaluation == crawlers.POTENTIAL_USER:
        app.logger.info("Rate limited a request classified as 'POTENTIAL_USER'")
    else:  # None
        app.logger.info("Rate limited a request not classified: '%s' - '%s'", remote_ip, user_agent)
    form = ModernForm()
    return render_template('429.html', environment=current_app.config['ENVIRONMENT'],
                           base_url=app.config['SERVER_BASE_URL'], request_path=request.path[1:],
                           form=form, code=429, description=description), 429
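# A short sketch of how ratelimit_handler() is typically attached: when a limit is
# exceeded, Flask-Limiter aborts with HTTP 429 and Flask dispatches the resulting
# exception (the `e` argument above) to the handler registered for that status code.
# Registering it this way is an assumption about the wiring; the app could equally use
# the @app.errorhandler(429) decorator directly above the function.
app.register_error_handler(429, ratelimit_handler)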
def before_request():
    """
    Store API anonymous cookie in session or, if it already exists, check if it has expired
    """
    if request.path in ('/ready', '/alive'):
        # Do not bootstrap readiness/liveness probes
        return
    g.request_start_time = time.time()
    g.request_time = lambda: "{:.3f}s".format((time.time() - g.request_start_time))
    if 'cookies' not in session:
        session['cookies'] = {}
    if request.cookies.get('session'):
        # Re-use BBB session; if it is valid, the same BBB token will be returned by bootstrap,
        # thus if the user was authenticated, it will use the user token
        session['cookies']['session'] = request.cookies.get('session')
    if 'auth' not in session or is_expired(session['auth']):
        user_agent = request.headers.get('User-Agent')
        remote_ip = get_remote_address()
        #### For testing purposes:
        #user_agent = "Googlebot"
        #remote_ip = "66.249.66.1" # crawl-66-249-66-1.googlebot.com.
        #user_agent = "DuckDuckBot"
        #remote_ip = "50.16.241.117"
        #remote_ip = "127.0.0.1"
        evaluation = crawlers.evaluate(remote_ip, user_agent)
        if evaluation == crawlers.VERIFIED_BOT:
            # Extremely high rate limit
            session['auth'] = {'access_token': app.config['VERIFIED_BOTS_ACCESS_TOKEN'],
                               'expire_in': "2050-01-01T00:00:00", 'bot': True}
        elif evaluation == crawlers.UNVERIFIABLE_BOT:
            # Slightly higher rate limit
            session['auth'] = {'access_token': app.config['UNVERIFIABLE_BOTS_ACCESS_TOKEN'],
                               'expire_in': "2050-01-01T00:00:00", 'bot': True}
        elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
            # Rate limits as a regular user with the advantage that there is no bootstrap
            session['auth'] = {'access_token': app.config['MALICIOUS_BOTS_ACCESS_TOKEN'],
                               'expire_in': "2050-01-01T00:00:00", 'bot': True}
        else:
            session['auth'] = api.bootstrap()
def before_request():
    """
    Store API anonymous cookie in session or, if it already exists, check if it has expired
    """
    if request.path in ('/ready', '/alive'):
        # Do not bootstrap readiness/liveness probes
        return
    g.request_start_time = time.time()
    g.request_time = lambda: "{:.3f}s".format((time.time() - g.request_start_time))
    if 'cookies' not in session:
        session['cookies'] = {}
    if 'auth' not in session or is_expired(session['auth']):
        user_agent = request.headers.get('User-Agent')
        remote_ip = get_remote_address()
        #### For testing purposes:
        #user_agent = "Googlebot"
        #remote_ip = "66.249.66.1" # crawl-66-249-66-1.googlebot.com.
        #user_agent = "DuckDuckBot"
        #remote_ip = "50.16.241.117"
        #remote_ip = "127.0.0.1"
        evaluation = crawlers.evaluate(remote_ip, user_agent)
        if evaluation == crawlers.VERIFIED_BOT:
            # Extremely high rate limit
            RequestsManager.init(auth={'access_token': app.config['VERIFIED_BOTS_ACCESS_TOKEN'],
                                       'expire_in': "2050-01-01T00:00:00", 'bot': True},
                                 cookies={})
        elif evaluation == crawlers.UNVERIFIABLE_BOT:
            # Slightly higher rate limit
            RequestsManager.init(auth={'access_token': app.config['UNVERIFIABLE_BOTS_ACCESS_TOKEN'],
                                       'expire_in': "2050-01-01T00:00:00", 'bot': True},
                                 cookies={})
        elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
            # Rate limits as a regular user with the advantage that there is no bootstrap
            RequestsManager.init(auth={'access_token': app.config['MALICIOUS_BOTS_ACCESS_TOKEN'],
                                       'expire_in': "2050-01-01T00:00:00", 'bot': True},
                                 cookies={})
    if not RequestsManager.is_initialized():
        if request.cookies.get('session'):
            # - Re-use BBB session; if it is valid, the same BBB token will be returned by bootstrap,
            #   thus if the user was authenticated, it will use the user token
            # - Always bootstrap, otherwise the browser may end up logged in with different
            #   users in BBB and core
            # - Ignore any previous bootstrapped access token
            RequestsManager.init(auth={}, cookies={'session': request.cookies.get('session')})
        elif 'auth' not in session:
            # No BBB or core session, API will bootstrap
            RequestsManager.init(auth={}, cookies={})
        else:
            # We have a core session and no BBB session, this is the only situation
            # where the API will not bootstrap
            RequestsManager.init(auth=session['auth'], cookies={})
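# Hypothetical sketch of the is_expired() helper used by before_request() above. The real
# implementation is not shown in this module; this version simply assumes that 'expire_in'
# holds an ISO-8601 timestamp (as in the bot auth dicts, e.g. "2050-01-01T00:00:00") and
# compares it against the current time.
from datetime import datetime

def is_expired(auth):
    """Return True if the auth dict's 'expire_in' timestamp is in the past."""
    try:
        expire_in = datetime.fromisoformat(auth['expire_in'])
    except (KeyError, ValueError):
        # Missing or unparsable expiration: treat as expired so a new token is bootstrapped
        return True
    return expire_in <= datetime.now()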