Example #1
    def test_classify(self):
        for ip in ('66.249.66.145', '66.249.66.143', '66.249.66.149', '66.249.66.147'):
            for ua in ('Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                       'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'):
                assert crawlers.evaluate(ip, ua) == crawlers.VERIFIED_BOT

        for ip in ('128.101.175.19',):
            for ua in ('Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                       'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'):
                assert crawlers.evaluate(ip, ua) == crawlers.POTENTIAL_MALICIOUS_BOT
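
The expected classifications above hint at how crawlers.evaluate works: the 66.249.66.x addresses reverse-resolve to crawl-*.googlebot.com, while 128.101.175.19 does not, so a Googlebot User-Agent coming from it is flagged as a potential impostor. Below is a minimal sketch of Google's documented reverse/forward DNS verification that evaluate presumably performs (the helper name is an assumption, not from the source):

import socket

def _verify_googlebot(ip):
    # Reverse/forward DNS check recommended by Google: the PTR record must
    # end in googlebot.com or google.com, and that hostname must resolve
    # back to the original IP.
    try:
        hostname = socket.gethostbyaddr(ip)[0]
        if not hostname.endswith(('.googlebot.com', '.google.com')):
            return False
        return ip in socket.gethostbyname_ex(hostname)[2]
    except socket.error:
        return False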
Example #2
def header_whitelist():
    """
    If the request has an access token stored in the session, it means that the
    request sent a session cookie in the headers stored as requested by a previous
    request that was answered by ADS Core. The client is well behaving, storing
    cookies set by ADS Core and thus, we do not want to rate limit them.

    Rate limits are only to protect us from bootstrapping thousands of access
    tokens in the database.
    """
    if 'auth' in session:
        return True
    else:
        user_agent = request.headers.get('User-Agent')
        remote_ip = get_remote_address()
        #### For testing purposes:
        #user_agent = "Googlebot"
        #remote_ip = "66.249.66.1" # crawl-66-249-66-1.googlebot.com.
        #user_agent = "DuckDuckBot"
        #remote_ip = "50.16.241.117"
        #remote_ip = "127.0.0.1"
        evaluation = crawlers.evaluate(remote_ip, user_agent)

        if evaluation in (crawlers.VERIFIED_BOT, crawlers.UNVERIFIABLE_BOT, crawlers.POTENTIAL_MALICIOUS_BOT):
            return True
    return False
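
get_remote_address is Flask-Limiter's helper for extracting the client IP, and a predicate like header_whitelist is normally registered as a request filter so that matching requests bypass rate limiting entirely. A registration sketch, assuming Flask-Limiter is the limiter in use here:

from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

limiter = Limiter(key_func=get_remote_address)
limiter.init_app(app)

# Requests for which header_whitelist() returns True skip every rate limit.
limiter.request_filter(header_whitelist)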
Example #3
def ratelimit_handler(e):
    user_agent = request.headers.get('User-Agent')
    remote_ip = get_remote_address()
    evaluation = crawlers.evaluate(remote_ip, user_agent)
    if evaluation == crawlers.VERIFIED_BOT:
        app.logger.info("Rate limited a request classified as 'VERIFIED_BOT'")
    elif evaluation == crawlers.UNVERIFIABLE_BOT:
        app.logger.info(
            "Rate limited a request classified as 'UNVERIFIABLE_BOT'")
    elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
        app.logger.info(
            "Rate limited a request classified as 'POTENTIAL_MALICIOUS_BOT'")
    elif evaluation == crawlers.POTENTIAL_USER:
        app.logger.info(
            "Rate limited a request classified as 'POTENTIAL_USER'")
    else:
        # evaluation is None: the request could not be classified
        app.logger.info("Rate limited a request not classified: '%s' - '%s'",
                        remote_ip, user_agent)
    form = ModernForm()
    return render_template('429.html',
                           environment=current_app.config['ENVIRONMENT'],
                           base_url=app.config['SERVER_BASE_URL'],
                           request_path=request.path[1:],
                           form=form,
                           code=429), 429
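
The handler renders the 429 page itself, which suggests it is attached to Flask's error handling for that status code. A one-line registration sketch, assuming standard Flask wiring (the registration is not shown in these examples):

app.register_error_handler(429, ratelimit_handler)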
Example #4
def ratelimit_handler(e):
    user_agent = request.headers.get('User-Agent')
    remote_ip = get_remote_address()
    if e.description.endswith('per 1 day'):
        # ADS Core limit hit (protects against bootstrapping too many tokens)
        description = "We have received too many requests from your IP ({}).".format(
            remote_ip)
    else:
        # API rate limit hit
        description = e.description
    evaluation = crawlers.evaluate(remote_ip, user_agent)
    if evaluation == crawlers.VERIFIED_BOT:
        app.logger.info("Rate limited a request classified as 'VERIFIED_BOT'")
    elif evaluation == crawlers.UNVERIFIABLE_BOT:
        app.logger.info(
            "Rate limited a request classified as 'UNVERIFIABLE_BOT'")
    elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
        app.logger.info(
            "Rate limited a request classified as 'POTENTIAL_MALICIOUS_BOT'")
    elif evaluation == crawlers.POTENTIAL_USER:
        app.logger.info(
            "Rate limited a request classified as 'POTENTIAL_USER'")
    else:
        # evaluation is None: the request could not be classified
        app.logger.info("Rate limited a request not classified: '%s' - '%s'",
                        remote_ip, user_agent)
    form = ModernForm()
    return render_template('429.html',
                           environment=current_app.config['ENVIRONMENT'],
                           base_url=app.config['SERVER_BASE_URL'],
                           request_path=request.path[1:],
                           form=form,
                           code=429,
                           description=description), 429
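
The endswith('per 1 day') test keys off the canonical string form the limits library gives a daily limit: a declaration like "50 per day" stringifies to "50 per 1 day" in the exception's description. A sketch of a Flask-Limiter declaration that would trip this branch (the count of 50 and the endpoint are assumptions, not from the source):

@app.route('/bootstrap')
@limiter.limit("50 per day")  # e.description reads "50 per 1 day" when exceeded
def bootstrap():
    ...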
Example #5
def before_request():
    """
    Store API anonymous cookie in session or if it exists, check if it has expired
    """
    if request.path in ('/ready', '/alive'):
        # Do not bootstrap readiness/liveness probes
        return
    g.request_start_time = time.time()
    g.request_time = lambda: "{:.3f}s".format(
        (time.time() - g.request_start_time))
    if 'cookies' not in session:
        session['cookies'] = {}
    if request.cookies.get('session'):
        # Re-use the BBB session: if it is valid, bootstrap will return the
        # same BBB token, so an authenticated user keeps their user token
        session['cookies']['session'] = request.cookies.get('session')
    if 'auth' not in session or is_expired(session['auth']):
        user_agent = request.headers.get('User-Agent')
        remote_ip = get_remote_address()
        #### For testing purposes:
        #user_agent = "Googlebot"
        #remote_ip = "66.249.66.1" # crawl-66-249-66-1.googlebot.com.
        #user_agent = "DuckDuckBot"
        #remote_ip = "50.16.241.117"
        #remote_ip = "127.0.0.1"
        evaluation = crawlers.evaluate(remote_ip, user_agent)
        if evaluation == crawlers.VERIFIED_BOT:
            # Extremely high rate limit
            session['auth'] = {
                'access_token': app.config['VERIFIED_BOTS_ACCESS_TOKEN'],
                'expire_in': "2050-01-01T00:00:00",
                'bot': True
            }
        elif evaluation == crawlers.UNVERIFIABLE_BOT:
            # Slightly higher rate limit
            session['auth'] = {
                'access_token': app.config['UNVERIFIABLE_BOTS_ACCESS_TOKEN'],
                'expire_in': "2050-01-01T00:00:00",
                'bot': True
            }
        elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
            # Rate limited as a regular user, with the advantage that no bootstrap is needed
            session['auth'] = {
                'access_token': app.config['MALICIOUS_BOTS_ACCESS_TOKEN'],
                'expire_in': "2050-01-01T00:00:00",
                'bot': True
            }
        else:
            session['auth'] = api.bootstrap()
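
is_expired is referenced but not defined in these examples; given that expire_in is stored as an ISO-8601 timestamp, it presumably parses that value and compares it with the current time. A hypothetical sketch:

from datetime import datetime

def is_expired(auth):
    # Hypothetical helper, not from the source: parse the ISO-8601
    # 'expire_in' timestamp stored at bootstrap and compare it with now.
    expires_at = datetime.strptime(auth['expire_in'][:19], "%Y-%m-%dT%H:%M:%S")
    return expires_at <= datetime.utcnow()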
Example #6
def before_request():
    """
    Store API anonymous cookie in session or if it exists, check if it has expired
    """
    if request.path in ('/ready', '/alive'):
        # Do not bootstrap readiness/liveness probes
        return
    g.request_start_time = time.time()
    g.request_time = lambda: "{:.3f}s".format((time.time() - g.request_start_time))
    if 'cookies' not in session:
        session['cookies'] = {}

    if 'auth' not in session or is_expired(session['auth']):
        user_agent = request.headers.get('User-Agent')
        remote_ip = get_remote_address()
        #### For testing purposes:
        #user_agent = "Googlebot"
        #remote_ip = "66.249.66.1" # crawl-66-249-66-1.googlebot.com.
        #user_agent = "DuckDuckBot"
        #remote_ip = "50.16.241.117"
        #remote_ip = "127.0.0.1"
        evaluation = crawlers.evaluate(remote_ip, user_agent)
        if evaluation == crawlers.VERIFIED_BOT:
            # Extremely high rate limit
            RequestsManager.init(auth={'access_token': app.config['VERIFIED_BOTS_ACCESS_TOKEN'],
                                       'expire_in': "2050-01-01T00:00:00",
                                       'bot': True}, cookies={})
        elif evaluation == crawlers.UNVERIFIABLE_BOT:
            # Slightly higher rate limit
            RequestsManager.init(auth={'access_token': app.config['UNVERIFIABLE_BOTS_ACCESS_TOKEN'],
                                       'expire_in': "2050-01-01T00:00:00",
                                       'bot': True}, cookies={})
        elif evaluation == crawlers.POTENTIAL_MALICIOUS_BOT:
            # Rate limited as a regular user, with the advantage that no bootstrap is needed
            RequestsManager.init(auth={'access_token': app.config['MALICIOUS_BOTS_ACCESS_TOKEN'],
                                       'expire_in': "2050-01-01T00:00:00",
                                       'bot': True}, cookies={})

    if not RequestsManager.is_initialized():
        if request.cookies.get('session'):
            # - Re-use the BBB session: if it is valid, bootstrap will return
            #   the same BBB token, so an authenticated user keeps their user token
            # - Always bootstrap; otherwise the browser may end up logged in
            #   with different users in BBB and core
            # - Ignore any previously bootstrapped access token
            RequestsManager.init(auth={}, cookies={'session': request.cookies.get('session')})
        elif 'auth' not in session:
            # No BBB or core session, API will bootstrap
            RequestsManager.init(auth={}, cookies={})
        else:
            # We have a core session and no BBB session; this is the only
            # situation in which the API will not bootstrap
            RequestsManager.init(auth=session['auth'], cookies={})
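
RequestsManager is not defined in these examples; the usage above shows only an init(auth=..., cookies=...) call and an is_initialized() check, consistent with a per-request holder for the chosen credentials. A hypothetical sketch built on flask.g (the storage choice is an assumption):

from flask import g

class RequestsManager:
    @staticmethod
    def init(auth, cookies):
        # Stash the selected auth/cookies for the lifetime of this request.
        g.requests_manager = {'auth': auth, 'cookies': cookies}

    @staticmethod
    def is_initialized():
        return hasattr(g, 'requests_manager')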