def filter_logs(log_file):
    """Yield the client IP for each valid Firefox download in *log_file*.

    A line qualifies when it parses into five whitespace-separated
    fields, its status code equals ``conf.LOG_HTTP_STATUS``, and its
    request field matches ``firefox_re``.  Each IP is yielded at most
    ``conf.IP_RATE_LIMIT_MAX`` times per file; excess hits are counted
    on the ``bart.ratelimit`` stat and skipped.
    """
    log.debug('Filtering logs in {}'.format(log_file))
    with log_file.open() as handle:
        hits_per_ip = Counter()
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) != 5:  # wrong format
                continue
            _, stamp, addr, status, request = fields
            if status != conf.LOG_HTTP_STATUS:
                log.debug(
                    'Not the status code we want: {}'.format(status))
                continue
            if not firefox_re.search(request):
                log.debug(
                    'Not the Firefox we\'re looking for: {}'.format(request))
                continue
            if hits_per_ip[addr] >= conf.IP_RATE_LIMIT_MAX:
                log.info('Skipped {} due to rate limit'.format(addr))
                statsd.incr('bart.ratelimit')
                continue
            hits_per_ip[addr] += 1
            yield addr
def filter_logs(log_file):
    """Extract valid downloads from logs and yield the client IPs.

    Each log line is split into ``(_, timestamp, ip, status_code,
    request)``.  Lines are kept only when the status code equals
    ``conf.LOG_HTTP_STATUS`` and the request field matches
    ``firefox_re``; every IP is yielded at most
    ``conf.IP_RATE_LIMIT_MAX`` times per file (excess hits increment
    the ``bart.ratelimit`` stat and are skipped).
    """
    log.debug('Filtering logs in {}'.format(log_file))
    with log_file.open() as fh:
        ip_counter = Counter()
        for line in fh:
            try:
                # maxsplit=4 keeps a request field that itself contains
                # whitespace (e.g. "GET /path HTTP/1.1") in one piece;
                # a bare split() would raise ValueError on such lines
                # and silently drop them as malformed.
                _, timestamp, ip, status_code, request = (
                    line.strip().split(None, 4))
            except ValueError:  # wrong format
                continue
            if status_code != conf.LOG_HTTP_STATUS:
                log.debug('Not the status code we want: {}'.format(status_code))
                continue
            if not firefox_re.search(request):
                log.debug('Not the Firefox we\'re looking for: {}'.format(request))
                continue
            if ip_counter[ip] >= conf.IP_RATE_LIMIT_MAX:
                log.info('Skipped {} due to rate limit'.format(ip))
                statsd.incr('bart.ratelimit')
                continue
            ip_counter[ip] += 1
            yield ip
def rate_limit_ip(ip):
    """Return True when *ip* is rate limited, else record the call.

    Reads the call count for *ip* from ``rate_limiter``; once it reaches
    ``conf.IP_RATE_LIMIT_MAX`` the IP is reported as limited, otherwise
    the count is incremented and stored back.
    """
    calls = rate_limiter.get(ip, 0)
    # calls starts at 0, so with a positive limit this flat check is
    # equivalent to the old nested `if calls:` guard.
    if calls >= conf.IP_RATE_LIMIT_MAX:
        log.debug('Rate limited {}'.format(ip))
        # Bug fix: the bare 0.5 was passed positionally as statsd's
        # `count` argument, incrementing the counter by half an event.
        # A 50% sample must go through the `rate` keyword.
        statsd.incr('lisa.ratelimit', rate=0.5)
        return True
    rate_limiter.put(ip, calls + 1)
    return False
def main():
    """Consume IP log entries from the Redis queue and process them.

    Blocks on the ``rkeys.IPLOGS`` list, parses each entry as
    ``"<record type>,<ip>"``, and for IPs that pass the rate limiter
    resolves a geo record and feeds it to the map/share processors.
    Loops until the module-level ``KILLED`` flag is set.

    Returns:
        0 on clean shutdown, 1 on a Redis error.
    """
    counter = 0
    timer = statsd.timer('lisa.process_ip', rate=0.01)  # 1% sample rate
    while True:
        if KILLED:
            log.info('Shutdown successful')
            return 0
        try:
            # blocking pop; ip_info[1] below is the payload, so this
            # presumably returns a (list name, value) pair — confirm
            # against the redis client in use
            ip_info = redis.brpop(rkeys.IPLOGS)
        except RedisError as e:
            log.error('Error with Redis: {}'.format(e))
            return 1
        # don't start above redis call as it will block to wait
        timer.start()
        log.debug('Got log data: ' + ip_info[1])
        try:
            rtype, ip = ip_info[1].split(',')
        except ValueError:
            # malformed queue entry; NOTE(review): the timer started
            # above is never stopped on this path (and on the
            # rate-limited path below), so no timing is reported for
            # skipped items
            continue
        timestamp = get_epoch_minute()
        if rate_limit_ip(ip, timestamp):
            continue
        record = geo.get(ip)
        if record:
            # everything goes for total count and map
            process_map(record, timestamp)
            # only shares get more processing
            if rtype != data_types.DOWNLOAD:
                process_share(record, rtype)
        timer.stop()
        statsd.incr('lisa.process_ip', rate=0.01)  # 1% sample rate
        if args.verbose:
            sys.stdout.write('.')
            sys.stdout.flush()
        # using a counter and if statement here instead of the
        # `rate` param on the gauge to avoid getting the length
        # of the Redis list every time.
        counter += 1
        if counter >= 1000:
            counter = 0
            statsd.gauge('queue.geoip', redis.llen(rkeys.IPLOGS))
def throw_at_lisa(log_file):
    """Put IPs on a queue in redis for Lisa to process.

    Pushes one ``"0,<ip>"`` entry per filtered IP onto ``rkeys.IPLOGS``
    via a single pipeline, then reports the total on the
    ``bart.ips_processed`` stat.

    Raises:
        IOError: if *log_file* does not exist.
    """
    log.debug('Throwing {} at Lisa'.format(log_file))
    if not log_file.exists():
        raise IOError('Log file not found: {}'.format(log_file))
    pipe = redis.pipeline()
    total = 0
    # enumerate keeps the running count in lockstep with the pushes;
    # `total` stays 0 when the generator yields nothing
    for total, ip in enumerate(filter_logs(log_file), start=1):
        pipe.lpush(rkeys.IPLOGS, '0,' + ip)
    pipe.execute()
    statsd.incr('bart.ips_processed', total)
def rate_limit_ip(ip, timestamp):
    """Return True when *ip* has hit its limit for *timestamp*, else count it.

    Hits are tracked in a Redis key scoped to the (ip, timestamp) pair;
    below ``conf.IP_RATE_LIMIT_MAX`` the counter is incremented with a
    60-second expiry and False is returned.
    """
    key = 'ratelimit:{}:{}'.format(ip, timestamp)
    raw = redis.get(key)
    hits = int(raw) if raw else 0
    if hits < conf.IP_RATE_LIMIT_MAX:
        # count this call; the expiry keeps the scoped key from lingering
        pipe = redis.pipeline()
        pipe.incr(key)
        pipe.expire(key, 60)
        pipe.execute()
        return False
    log.warning('Rate limited {}'.format(ip))
    statsd.incr('lisa.ratelimit')
    return True
def main():
    """Consume IP log entries from Redis and process them via a pipeline.

    Pops entries of the form ``"<record type>,<ip>"`` from
    ``rkeys.IPLOGS`` — non-blocking ``rpop`` in benchmark mode,
    blocking ``brpop`` otherwise — and for IPs that pass the rate
    limiter feeds the geo record to the map/share processors, which
    buffer their writes on the shared pipeline.  Loops until the
    module-level ``KILLED`` flag is set or, in benchmark mode, until
    the queue is drained.

    Returns:
        0 on clean shutdown or drained benchmark queue, 1 on Redis error.
    """
    global counter
    timer = statsd.timer('lisa.process_ip', rate=0.01)  # 1% sample rate
    pipe = redis.pipeline()
    while True:
        if KILLED:
            # flush buffered writes before exiting
            pipe.execute()
            log.info('Shutdown successful')
            return 0
        try:
            if args.benchmark:
                # non-blocking: returns None once the queue is empty
                ip_info = redis.rpop(rkeys.IPLOGS)
            else:
                # blocking pop; [1] takes the payload from the
                # (list name, value) pair — confirm against the client
                ip_info = redis.brpop(rkeys.IPLOGS)[1]
        except RedisError as e:
            log.error('Error with Redis: {}'.format(e))
            pipe.execute()
            return 1
        if ip_info is None:
            # benchmark run is over
            pipe.execute()
            return 0
        # don't start above redis call as it will block to wait
        timer.start()
        log.debug('Got log data: ' + ip_info)
        try:
            rtype, ip = ip_info.split(',')
        except ValueError:
            # malformed entry; NOTE(review): the timer started above is
            # never stopped on this path or the rate-limited one, so no
            # timing is reported for skipped items
            continue
        timestamp = get_epoch_minute()
        if rate_limit_ip(ip):
            continue
        record = geo.get(ip)
        if record:
            # everything goes for total count and map
            process_map(record, timestamp, pipe)
            # only shares get more processing
            if rtype != data_types.DOWNLOAD:
                process_share(record, rtype, pipe)
        timer.stop()
        statsd.incr('lisa.process_ip', rate=0.01)  # 1% sample rate
        if args.verbose:
            sys.stdout.write('.')
            sys.stdout.flush()
        # using a counter and if statement here instead of the
        # `rate` param on the gauge to avoid getting the length
        # of the Redis list every time.
        counter += 1
        if args.benchmark:
            # flush every 1000 items; counter keeps growing (modulo test)
            if not counter % 1000:
                pipe.execute()
        else:
            # flush, reset the counter, and sample the queue depth
            if counter >= 1000:
                pipe.execute()
                counter = 0
                statsd.gauge('queue.geoip', redis.llen(rkeys.IPLOGS))