def main(multiplier=10):
    """Benchmark ``parse`` against a generated temporary log file.

    Every log line yielded by ``get_all_combinations()`` is written
    ``multiplier`` times to a temporary file. The file is then reopened and
    handed to ``parse``; the elapsed time and the resulting operations per
    second are printed.

    Args:
        multiplier: Number of times each generated log line is repeated in
            the benchmark file.
    """
    operations = 0
    file_name = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as f:
            file_name = f.name
            for log_line, _values in get_all_combinations():
                for _ in range(multiplier):
                    operations += 1
                    f.write(log_line)

        start_time = time.time()
        with open(file_name) as f:
            parse(f)
        total_time = time.time() - start_time
    finally:
        # The file was created with delete=False, so it must be removed here.
        # Doing it in an outer ``finally`` also covers failures while the
        # file is being generated, and runs only after the handle is closed.
        if file_name is not None:
            os.unlink(file_name)

    # A very fast run on a coarse clock can report zero elapsed time.
    rate = operations / total_time if total_time else float('inf')

    # Single parenthesised argument: valid as a Python 2 print statement and
    # as a Python 3 print() call.
    print("Total time: %.2fs Operations/s: %.2f (total %d operations)" % (
        total_time,
        rate,
        operations,
    ))
def main(multiplier=10):
    """Benchmark ``parse`` against a generated temporary log file.

    Every log line yielded by ``get_all_combinations()`` is written
    ``multiplier`` times to a temporary file. The file is then reopened and
    handed to ``parse``; the elapsed time and the resulting operations per
    second are printed.

    Args:
        multiplier: Number of times each generated log line is repeated in
            the benchmark file.
    """
    operations = 0
    file_name = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as f:
            file_name = f.name
            for log_line, _values in get_all_combinations():
                for _ in range(multiplier):
                    operations += 1
                    f.write(log_line)

        start_time = time.time()
        with open(file_name) as f:
            parse(f)
        total_time = time.time() - start_time
    finally:
        # The file was created with delete=False, so it must be removed here.
        # Doing it in an outer ``finally`` also covers failures while the
        # file is being generated, and runs only after the handle is closed.
        if file_name is not None:
            os.unlink(file_name)

    # A very fast run on a coarse clock can report zero elapsed time.
    rate = operations / total_time if total_time else float('inf')

    # Single parenthesised argument: valid as a Python 2 print statement and
    # as a Python 3 print() call.
    print("Total time: %.2fs Operations/s: %.2f (total %d operations)" % (
        total_time,
        rate,
        operations,
    ))
def test_can_get_status_code(self):
    """LOG_LINE parses to an entry with status '200' that is not aborted."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.status_code).to_equal('200')
    expect(first.aborted).to_be_false()
def test_can_get_request_id(self):
    """The first entry parsed from LOG_LINE exposes the expected request id."""
    wanted = '9fSyxPBMgh0D7BFVPg1snTTm1Agq-Xcrq6gVZF_6vCfRr96WkBtiZQ=='

    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].request_id).to_equal(wanted)
def test_can_get_status_code(self):
    """LOG_LINE parses to an entry with status '200' that is not aborted."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.status_code).to_equal('200')
    expect(first.aborted).to_be_false()
def test_can_get_cookies(self):
    """COOKIE_LOG_LINE parses to an entry carrying the raw cookie string."""
    wanted = 'session-token="some-session"; x-main=some-main-string;'

    entries = parse(COOKIE_LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.cookies).not_to_be_null()
    expect(first.cookies).to_equal(wanted)
def test_can_parse_line(self):
    """Parsing LOG_LINE gives a non-null, non-empty result of Response items."""
    entries = parse(LOG_LINE)

    expect(entries).not_to_be_null()
    expect(entries).not_to_be_empty()
    expect(entries[0]).to_be_instance_of(Response)
def transform(event, context):
    """Convert a gzipped CloudFront log object in S3 into a CSV object.

    The object named by the first record of ``event`` is downloaded,
    gunzipped and parsed. ``format_data`` is applied to every parsed entry
    (together with a shared GeoIP reader) on a three-thread pool, and the
    resulting rows are written as CSV to the bucket named by the
    ``TRANSFORMED_BUCKET`` environment variable, under the source key with
    ``.gz`` replaced by ``.csv``.

    Args:
        event: S3 notification event; only ``event['Records'][0]`` is read.
        context: Unused.
    """
    logging.info(json.dumps({'event': event}))
    # The return value is not used here; the call is kept in case it has
    # side effects (its definition is not visible from this block).
    get_correlation_id(event=event)

    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = unquote_plus(s3_record['object']['key'])

    s3 = boto3.client('s3')
    s3_resource = boto3.resource('s3')

    response = s3.get_object(Bucket=bucket, Key=key)
    data = response['Body'].read()
    # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    data = zlib.decompress(data, 16 + zlib.MAX_WBITS).decode('UTF-8')

    reader = geoip2.database.Reader('./GeoLite2-City.mmdb')
    try:
        parsed = cloudfront_log_parser.parse(data)
        logging.debug(
            json.dumps({
                'message': 'lines in log',
                'length': len(parsed)
            }))

        pool = ThreadPool(3)
        try:
            return_object = pool.starmap(
                format_data, zip(parsed, itertools.repeat(reader)))
        finally:
            # Always shut the workers down, even when a task raised.
            pool.close()
            pool.join()
    finally:
        # Release the GeoIP database even when parsing or formatting fails.
        reader.close()

    logging.debug(
        json.dumps({
            'message': 'multithreaded return object',
            'object': '{}'.format(return_object)
        }))

    fieldnames = [
        'ip_address', 'day_of_week', 'hour_of_day', 'minute_of_hour', 'edge',
        'response_size', 'http_method', 'cloudfront_host', 'path',
        'status_code', 'status_code_group', 'aborted', 'referrer',
        'user_agent', 'browser_family', 'browser_version', 'os_family',
        'os_version', 'device', 'is_mobile', 'is_tablet', 'is_pc',
        'is_touch_capable', 'is_bot', 'querystring', 'edge_result_type',
        'request_host', 'request_protocol', 'request_size',
        'response_duration', 'ssl_protocol', 'ssl_cypher',
        'edge_response_result_type', 'country', 'city', 'latitude',
        'longitude'
    ]
    output = io.StringIO()
    writer = csv.DictWriter(output, fieldnames=fieldnames,
                            quoting=csv.QUOTE_NONNUMERIC)
    writer.writeheader()

    for row in return_object:
        logging.debug(
            json.dumps({
                'message': 'row in return_object',
                'row': row
            }))
        writer.writerow(row)

    s3_resource.Object(
        os.environ['TRANSFORMED_BUCKET'],
        key.replace(".gz", ".csv")).put(Body=output.getvalue())

    logging.info(json.dumps({'message': 'Done!'}))
def test_can_parse_line(self):
    """Parsing LOG_LINE gives a non-null, non-empty result of Response items."""
    entries = parse(LOG_LINE)

    expect(entries).not_to_be_null()
    expect(entries).not_to_be_empty()
    expect(entries[0]).to_be_instance_of(Response)
def test_can_get_aborted_request(self):
    """ABORTED_LOG_LINE parses to an entry with status '000' flagged aborted."""
    entries = parse(ABORTED_LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.status_code).to_equal('000')
    expect(first.aborted).to_be_true()
def find_request_id(profile, bucket, request_id):
    """Search the CloudFront logs in ``bucket`` for a given request id.

    Every object under the ``cloudfront-log`` prefix is downloaded,
    gunzipped and parsed, walking the listing in reverse order. On the first
    entry whose ``request_id`` matches, a summary row is printed and the
    process exits with status 0. If nothing matches, the function returns
    ``None`` without printing anything.

    Args:
        profile: AWS profile name used to build the boto3 session.
        bucket: Name of the S3 bucket holding the logs.
        request_id: Request id to look for.

    Raises:
        SystemExit: With code 0 as soon as a matching entry is found.
    """
    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')

    # A single list_objects_v2 call returns at most one page of keys, and a
    # page with no matches has no 'Contents' entry at all; paginate and
    # default to an empty list so both cases are handled.
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix='cloudfront-log')
    logs = [
        obj['Key']
        for page in pages
        for obj in page.get('Contents', [])
    ]
    logs.reverse()

    for log in logs:
        s3obj = s3_client.get_object(Bucket=bucket, Key=log)
        with gzip.open(s3obj['Body'], 'rt') as f:
            responses = parse(f.readlines())

        for r in responses:
            if r.request_id != request_id:
                continue
            puts("Found!")
            puts(
                columns(
                    [colored.green(r.http_method), 6],
                    [colored.red(r.status_code), 6],
                    [colored.red(r.edge_result_type), 10],
                    [colored.yellow(r.request_id), 56],
                    [colored.magenta(str(r.timestamp)), 20],
                    [colored.blue(r.path), None],
                ))
            # Equivalent to exit(0), without relying on the ``site`` module
            # having injected the ``exit`` helper.
            raise SystemExit(0)
def test_can_get_aborted_request(self):
    """ABORTED_LOG_LINE parses to an entry with status '000' flagged aborted."""
    entries = parse(ABORTED_LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.status_code).to_equal('000')
    expect(first.aborted).to_be_true()
def test_can_get_request_id(self):
    """The first entry parsed from LOG_LINE exposes the expected request id."""
    wanted = '9fSyxPBMgh0D7BFVPg1snTTm1Agq-Xcrq6gVZF_6vCfRr96WkBtiZQ=='

    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].request_id).to_equal(wanted)
def test_can_get_cookies(self):
    """COOKIE_LOG_LINE parses to an entry carrying the raw cookie string."""
    wanted = 'session-token="some-session"; x-main=some-main-string;'

    entries = parse(COOKIE_LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.cookies).not_to_be_null()
    expect(first.cookies).to_equal(wanted)
def test_can_get_information_on_edge(self):
    """LOG_LINE's entry has edge data: city Miami, number 50, a latitude."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.edge).to_be_true()
    expect(first.edge['city']).to_equal('Miami')
    expect(first.edge['number']).to_equal(50)
    expect(first.edge['reference']['latitude']).to_equal(25.7931995)
def test_can_get_information_on_edge(self):
    """LOG_LINE's entry has edge data: city Miami, number 50, a latitude."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    first = entries[0]
    expect(first.edge).to_be_true()
    expect(first.edge['city']).to_equal('Miami')
    expect(first.edge['number']).to_equal(50)
    expect(first.edge['reference']['latitude']).to_equal(25.7931995)
def test_can_get_datetime(self):
    """The entry parsed from LOG_LINE is timestamped 2015-07-28 11:28:40."""
    entries = parse(LOG_LINE)
    wanted = datetime(2015, 7, 28, 11, 28, 40)

    expect(entries).not_to_be_empty()
    expect(entries[0].timestamp).to_equal(wanted)
def test_can_get_path(self):
    """The entry parsed from LOG_LINE exposes the full request path."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    wanted_path = (
        '/sample-user-id/gnQ93w5t5BwDe8Je7OUa/tOiP6Y_L1xKUIEfURwwiSIVprFA%253D/200x150/http%253A/extra.globo.com/'
        'incoming/16823873-03c-cf8/w640h360-PROP/Romario.jpg'
    )
    expect(entries[0].path).to_equal(wanted_path)
def test_can_get_path(self):
    """The entry parsed from LOG_LINE exposes the full request path."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    wanted_path = (
        '/sample-user-id/gnQ93w5t5BwDe8Je7OUa/tOiP6Y_L1xKUIEfURwwiSIVprFA%253D/200x150/http%253A/extra.globo.com/'
        'incoming/16823873-03c-cf8/w640h360-PROP/Romario.jpg'
    )
    expect(entries[0].path).to_equal(wanted_path)
def test_can_parse_file_like(self):
    """``parse`` accepts a file-like object and returns one entry per log line.

    The input has two ``#`` header lines followed by LOG_LINE and
    SSL_LOG_LINE; exactly two entries are expected back.
    """
    fields_header = (
        '#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) '
        'cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query '
        'cs(Cookie) x-edge-result-type x-edge-request-id x-host-header '
        'cs-protocol cs-bytes time-taken x-forwarded-for ssl-protocol '
        'ssl-cipher x-edge-response-result-type'
    )
    # Join with explicit newlines so each header and each log entry sits on
    # its own line regardless of how this source file is formatted.
    items = '\n'.join([
        '#Version: 1.0',
        fields_header,
        LOG_LINE,
        SSL_LOG_LINE,
    ])

    log_file = StringIO(items)
    result = parse(log_file)

    expect(result).not_to_be_empty()
    expect(result).to_length(2)
def test_can_parse_file_like(self):
    """``parse`` accepts a file-like object and returns one entry per log line.

    The input has two ``#`` header lines followed by LOG_LINE and
    SSL_LOG_LINE; exactly two entries are expected back.
    """
    fields_header = (
        '#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) '
        'cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query '
        'cs(Cookie) x-edge-result-type x-edge-request-id x-host-header '
        'cs-protocol cs-bytes time-taken x-forwarded-for ssl-protocol '
        'ssl-cipher x-edge-response-result-type'
    )
    # Join with explicit newlines so each header and each log entry sits on
    # its own line regardless of how this source file is formatted.
    items = '\n'.join([
        '#Version: 1.0',
        fields_header,
        LOG_LINE,
        SSL_LOG_LINE,
    ])

    log_file = StringIO(items)
    result = parse(log_file)

    expect(result).not_to_be_empty()
    expect(result).to_length(2)
def test_can_get_datetime(self):
    """The entry parsed from LOG_LINE is timestamped 2015-07-28 11:28:40."""
    entries = parse(LOG_LINE)
    wanted = datetime(2015, 7, 28, 11, 28, 40)

    expect(entries).not_to_be_empty()
    expect(entries[0].timestamp).to_equal(wanted)
def test_can_parse_user_agent(self):
    """LOG_LINE's entry exposes the raw user agent and its derived fields.

    Checks browser and OS family/version, device name, and the
    mobile/tablet/pc/touch/bot flags for the iPhone Safari agent string.
    """
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    wanted_ua = (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 '
        'Mobile/9B179 Safari/7534.48.3'
    )

    first = entries[0]
    expect(first.user_agent).to_equal(wanted_ua)

    # Browser / OS / device details derived from the agent string.
    expect(first.browser_family).to_equal('Mobile Safari')
    expect(first.browser_version).to_equal('5.1')
    expect(first.os_family).to_equal('iOS')
    expect(first.os_version).to_equal('5.1')
    expect(first.device).to_equal('iPhone')

    # Device-class flags.
    expect(first.is_mobile).to_be_true()
    expect(first.is_tablet).to_be_false()
    expect(first.is_pc).to_be_false()
    expect(first.is_touch_capable).to_be_true()
    expect(first.is_bot).to_be_false()
def test_can_parse_user_agent(self):
    """LOG_LINE's entry exposes the raw user agent and its derived fields.

    Checks browser and OS family/version, device name, and the
    mobile/tablet/pc/touch/bot flags for the iPhone Safari agent string.
    """
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()

    wanted_ua = (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 '
        'Mobile/9B179 Safari/7534.48.3'
    )

    first = entries[0]
    expect(first.user_agent).to_equal(wanted_ua)

    # Browser / OS / device details derived from the agent string.
    expect(first.browser_family).to_equal('Mobile Safari')
    expect(first.browser_version).to_equal('5.1')
    expect(first.os_family).to_equal('iOS')
    expect(first.os_version).to_equal('5.1')
    expect(first.device).to_equal('iPhone')

    # Device-class flags.
    expect(first.is_mobile).to_be_true()
    expect(first.is_tablet).to_be_false()
    expect(first.is_pc).to_be_false()
    expect(first.is_touch_capable).to_be_true()
    expect(first.is_bot).to_be_false()
def opencfl_log(profile, bucket, distribution, prefix, year, month, day):
    """Let the user pick one CloudFront log from S3 and print its entries.

    Lists the objects in ``bucket`` whose key starts with
    ``<prefix>/<distribution>``, optionally narrowed by date, prompts for one
    of them, then downloads, gunzips and parses it and prints one table row
    per entry.

    Args:
        profile: AWS profile name used to build the boto3 session.
        bucket: Name of the S3 bucket holding the logs.
        distribution: Distribution id that log file names start with.
        prefix: Key prefix (folder) the logs are stored under.
        year: Optional year filter.
        month: Optional month filter; only applied when ``year`` is given.
        day: Optional day filter; only applied when ``month`` is given.

    Raises:
        SystemExit: With code 0 if the prompt is interrupted with Ctrl-C.
    """
    session = boto3.Session(profile_name=profile)
    s3_client = session.client('s3')

    # Build "<prefix>/<distribution>[.YYYY[-MM[-DD]]]". Month and day are
    # zero-padded because CloudFront names its files with two-digit fields;
    # the filter is hierarchical since a month without a year (or a day
    # without a month) cannot form a valid key prefix.
    prefix = prefix + '/' + distribution
    if year:
        prefix += '.' + str(year)
        if month:
            prefix += '-' + str(month).zfill(2)
            if day:
                prefix += '-' + str(day).zfill(2)

    files = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
    # 'Contents' is absent from the response when no key matches.
    options = [k['Key'] for k in files.get('Contents', [])]
    if not options:
        puts(colored.red('No logs found under prefix %s' % prefix))
        return

    try:
        # Show only the file name, whatever the depth of the key prefix.
        choice = prompt.options('Pick log to view',
                                [o.rsplit('/', 1)[-1] for o in options])
    except KeyboardInterrupt:
        # Equivalent to quit(0), without relying on the ``site`` helper.
        raise SystemExit(0)

    # The selection is used as a 1-based index into ``options``.
    s3obj = s3_client.get_object(Bucket=bucket, Key=options[choice - 1])
    with gzip.open(s3obj['Body'], 'rt') as f:
        responses = parse(f.readlines())

    puts(
        columns(
            [colored.yellow('method'), 6],
            [colored.yellow('status'), 6],
            [colored.yellow('result'), 10],
            [colored.yellow('timestamp'), 20],
            [colored.yellow('request_id'), 56],
            [colored.yellow('path'), None],
        )
    )
    for r in responses:
        puts(
            columns(
                [colored.green(r.http_method), 6],
                [colored.red(r.status_code), 6],
                [colored.red(r.edge_result_type), 10],
                [colored.cyan(arrow.get(r.timestamp).to('US/Eastern').humanize()), 20],
                [colored.yellow(r.request_id), 56],
                [colored.blue(r.path), None],
            )
        )
def test_can_get_cloudfront_host(self):
    """The entry parsed from LOG_LINE reports the expected CloudFront host."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].cloudfront_host).to_equal('d3n18mvc4wxsim.cloudfront.net')
def test_can_get_referrer(self):
    """The entry parsed from LOG_LINE reports 'http://facebook.com/' as referrer."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].referrer).to_equal('http://facebook.com/')
def test_can_get_user_ip_using_c_ip(self):
    """The entry parsed from LOG_LINE reports 179.34.7.52 as its IP address."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ip_address).to_equal('179.34.7.52')
def test_can_get_edge_response_result(self):
    """LOG_LINE's entry has an edge response result of Response.Result.Miss."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].edge_response_result_type).to_equal(Response.Result.Miss)
def test_querystring_is_null_when_hyphen(self):
    """The entry parsed from LOG_LINE has a null querystring."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].querystring).to_be_null()
def test_can_get_ip_from_x_forwarded_for(self):
    """The entry parsed from XF_LOG_LINE reports 179.34.7.54 as its IP address."""
    entries = parse(XF_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ip_address).to_equal('179.34.7.54')
def test_ssl_cypher_is_null_when_hyphen(self):
    """The entry parsed from LOG_LINE has a null ssl_cypher."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ssl_cypher).to_be_null()
def test_can_get_ip_from_x_forwarded_for(self):
    """The entry parsed from XF_LOG_LINE reports 179.34.7.54 as its IP address."""
    entries = parse(XF_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ip_address).to_equal('179.34.7.54')
def test_can_get_referrer(self):
    """The entry parsed from LOG_LINE reports 'http://facebook.com/' as referrer."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].referrer).to_equal('http://facebook.com/')
def test_ssl_cypher_is_null_when_hyphen(self):
    """The entry parsed from LOG_LINE has a null ssl_cypher."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ssl_cypher).to_be_null()
def test_can_get_ssl_protocol(self):
    """The entry parsed from SSL_LOG_LINE reports 'SSLv3' as its SSL protocol."""
    entries = parse(SSL_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ssl_protocol).to_equal('SSLv3')
def test_can_get_ssl_cypher(self):
    """The entry parsed from SSL_LOG_LINE reports 'AES256-SHA' as its cypher."""
    entries = parse(SSL_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ssl_cypher).to_equal('AES256-SHA')
def test_can_get_edge_response_result(self):
    """LOG_LINE's entry has an edge response result of Response.Result.Miss."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].edge_response_result_type).to_equal(Response.Result.Miss)
def test_can_get_http_method(self):
    """The entry parsed from LOG_LINE reports 'GET' as its HTTP method."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].http_method).to_equal('GET')
def test_can_get_user_ip_using_c_ip(self):
    """The entry parsed from LOG_LINE reports 179.34.7.52 as its IP address."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ip_address).to_equal('179.34.7.52')
def test_can_get_response_duration(self):
    """The entry parsed from LOG_LINE reports a response duration of 0.086."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].response_duration).to_equal(0.086)
def test_can_get_response_duration(self):
    """The entry parsed from LOG_LINE reports a response duration of 0.086."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].response_duration).to_equal(0.086)
def test_can_get_querystring(self):
    """The entry parsed from QS_LOG_LINE reports 'a=1&b=2' as its querystring."""
    entries = parse(QS_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].querystring).to_equal('a=1&b=2')
def test_can_get_ssl_protocol(self):
    """The entry parsed from SSL_LOG_LINE reports 'SSLv3' as its SSL protocol."""
    entries = parse(SSL_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ssl_protocol).to_equal('SSLv3')
def test_querystring_is_null_when_hyphen(self):
    """The entry parsed from LOG_LINE has a null querystring."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].querystring).to_be_null()
def test_can_get_ssl_cypher(self):
    """The entry parsed from SSL_LOG_LINE reports 'AES256-SHA' as its cypher."""
    entries = parse(SSL_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].ssl_cypher).to_equal('AES256-SHA')
def test_can_get_request_size(self):
    """The entry parsed from LOG_LINE reports a request size of 228."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].request_size).to_equal(228)
def test_cookies_are_null_when_hyphen(self):
    """The entry parsed from LOG_LINE has null cookies."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].cookies).to_be_null()
def test_cookies_are_null_when_hyphen(self):
    """The entry parsed from LOG_LINE has null cookies."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].cookies).to_be_null()
def test_can_get_querystring(self):
    """The entry parsed from QS_LOG_LINE reports 'a=1&b=2' as its querystring."""
    entries = parse(QS_LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].querystring).to_equal('a=1&b=2')
def test_can_get_request_protocol(self):
    """The entry parsed from LOG_LINE reports 'http' as its request protocol."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].request_protocol).to_equal('http')
def test_can_get_number_of_bytes_on_response(self):
    """The entry parsed from LOG_LINE reports a response size of 12330."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].response_size).to_equal(12330)
def test_can_get_request_protocol(self):
    """The entry parsed from LOG_LINE reports 'http' as its request protocol."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].request_protocol).to_equal('http')
def test_can_get_http_method(self):
    """The entry parsed from LOG_LINE reports 'GET' as its HTTP method."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].http_method).to_equal('GET')
def test_can_get_request_size(self):
    """The entry parsed from LOG_LINE reports a request size of 228."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].request_size).to_equal(228)
def test_can_get_cloudfront_host(self):
    """The entry parsed from LOG_LINE reports the expected CloudFront host."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].cloudfront_host).to_equal('d3n18mvc4wxsim.cloudfront.net')
def test_can_get_number_of_bytes_on_response(self):
    """The entry parsed from LOG_LINE reports a response size of 12330."""
    entries = parse(LOG_LINE)
    expect(entries).not_to_be_empty()
    expect(entries[0].response_size).to_equal(12330)