def test_multiline_message(self):
    # One timestamped record followed by two continuation lines; the
    # expected result is a single record whose message spans all three.
    raw_log = StringIO(
        "2015-08-22 00:47:35,323 INFO org.apache.hadoop.mapreduce.Job"
        " (main): Counters: 54\r\n"
        " File System Counters\r\n"
        " FILE: Number of bytes read=83\r\n"
    )

    # strip \r's, no trailing \n
    expected_message = (
        "Counters: 54\n"
        " File System Counters\n"
        " FILE: Number of bytes read=83"
    )
    expected = [
        {
            "timestamp": "2015-08-22 00:47:35,323",
            "level": "INFO",
            "logger": "org.apache.hadoop.mapreduce.Job",
            "thread": "main",
            "message": expected_message,
        }
    ]

    parsed = list(_parse_hadoop_log_lines(raw_log))
    self.assertEqual(parsed, expected)
def test_non_log_lines(self):
    # Two leading lines that are not in log format, one real log record,
    # and a final line that is again not in log format.
    raw_log = StringIO(
        'foo\n'
        'bar\n'
        '15/12/11 13:26:08 ERROR streaming.StreamJob:'
        ' Error Launching job :'
        ' Output directory already exists\n'
        'Streaming Command Failed!')

    # ignore leading non-log lines
    expected = [
        {
            'timestamp': '15/12/11 13:26:08',
            'level': 'ERROR',
            'logger': 'streaming.StreamJob',
            'thread': None,
            # no way to know that Streaming Command Failed! wasn't part
            # of a multi-line message
            'message': ('Error Launching job :'
                        ' Output directory already exists\n'
                        'Streaming Command Failed!'),
        }
    ]

    with no_handlers_for_logger('mrjob.logs.parse'):
        # capture whatever the parser logs while it runs
        captured = StringIO()
        log_to_stream('mrjob.logs.parse', captured)

        parsed = list(_parse_hadoop_log_lines(raw_log))
        self.assertEqual(parsed, expected)

        # should be one warning for each leading non-log line
        warning_count = len(captured.getvalue().splitlines())
        self.assertEqual(warning_count, 2)
def test_log_lines(self):
    # Two complete log lines, each expected to become its own record
    # with no thread.
    raw_log = StringIO(
        "15/12/11 13:26:07 INFO client.RMProxy:"
        " Connecting to ResourceManager at /0.0.0.0:8032\n"
        "15/12/11 13:26:08 ERROR streaming.StreamJob:"
        " Error Launching job :"
        " Output directory already exists\n"
    )

    info_record = {
        "timestamp": "15/12/11 13:26:07",
        "level": "INFO",
        "logger": "client.RMProxy",
        "thread": None,
        "message": "Connecting to ResourceManager at /0.0.0.0:8032",
    }
    error_record = {
        "timestamp": "15/12/11 13:26:08",
        "level": "ERROR",
        "logger": "streaming.StreamJob",
        "thread": None,
        "message": (
            "Error Launching job :"
            " Output directory already exists"
        ),
    }

    parsed = list(_parse_hadoop_log_lines(raw_log))
    self.assertEqual(parsed, [info_record, error_record])
def test_non_log_lines(self):
    # The input mixes lines that are not in log format (two before the
    # real record, one after it) with a single ERROR record.
    log_text = StringIO(
        "foo\n"
        "bar\n"
        "15/12/11 13:26:08 ERROR streaming.StreamJob:"
        " Error Launching job :"
        " Output directory already exists\n"
        "Streaming Command Failed!"
    )

    # no way to know that Streaming Command Failed! wasn't part
    # of a multi-line message
    expected_message = (
        "Error Launching job :"
        " Output directory already exists\n"
        "Streaming Command Failed!"
    )
    # ignore leading non-log lines
    expected_records = [
        {
            "timestamp": "15/12/11 13:26:08",
            "level": "ERROR",
            "logger": "streaming.StreamJob",
            "thread": None,
            "message": expected_message,
        }
    ]

    with no_handlers_for_logger("mrjob.logs.parse"):
        # route the parser's log output into a buffer we can inspect
        log_buffer = StringIO()
        log_to_stream("mrjob.logs.parse", log_buffer)

        records = list(_parse_hadoop_log_lines(log_text))
        self.assertEqual(records, expected_records)

        # should be one warning for each leading non-log line
        warnings_logged = log_buffer.getvalue().splitlines()
        self.assertEqual(len(warnings_logged), 2)
def test_trailing_carriage_return(self):
    # A single log line terminated by \r\n; the expected message carries
    # neither the \r nor the \n.
    raw_log = StringIO(
        '15/12/11 13:26:07 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\r\n')

    expected = [
        {
            'timestamp': '15/12/11 13:26:07',
            'level': 'INFO',
            'logger': 'client.RMProxy',
            'thread': None,
            'message': 'Connecting to ResourceManager at /0.0.0.0:8032',
        }
    ]

    parsed = list(_parse_hadoop_log_lines(raw_log))
    self.assertEqual(parsed, expected)
def test_thread(self):
    # A log line with a parenthesized name before the colon; the
    # expected record reports that name as the thread.
    raw_log = StringIO(
        '2015-08-22 00:46:18,411 INFO amazon.emr.metrics.MetricsSaver'
        ' (main): Thread 1 created MetricsLockFreeSaver 1\n')

    expected = [
        {
            'timestamp': '2015-08-22 00:46:18,411',
            'level': 'INFO',
            'logger': 'amazon.emr.metrics.MetricsSaver',
            'thread': 'main',
            'message': 'Thread 1 created MetricsLockFreeSaver 1',
        }
    ]

    parsed = list(_parse_hadoop_log_lines(raw_log))
    self.assertEqual(parsed, expected)
def test_trailing_carriage_return(self):
    # The only input line ends in \r\n; the expected message has the
    # line ending removed entirely.
    log_text = StringIO(
        '15/12/11 13:26:07 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\r\n')

    expected_record = {
        'timestamp': '15/12/11 13:26:07',
        'level': 'INFO',
        'logger': 'client.RMProxy',
        'thread': None,
        'message': 'Connecting to ResourceManager at /0.0.0.0:8032',
    }

    records = list(_parse_hadoop_log_lines(log_text))
    self.assertEqual(records, [expected_record])
def test_trailing_carriage_return(self):
    # Feed one \r\n-terminated log line and check that the parsed
    # message stops right after the last visible character.
    source = StringIO(
        "15/12/11 13:26:07 INFO client.RMProxy:"
        " Connecting to ResourceManager at /0.0.0.0:8032\r\n"
    )

    wanted = [
        {
            "timestamp": "15/12/11 13:26:07",
            "level": "INFO",
            "logger": "client.RMProxy",
            "thread": None,
            "message": "Connecting to ResourceManager at /0.0.0.0:8032",
        }
    ]

    actual = list(_parse_hadoop_log_lines(source))
    self.assertEqual(actual, wanted)
def test_thread(self):
    # The input line carries "(main)" between the logger name and the
    # colon; the expected record has thread='main'.
    log_text = StringIO(
        '2015-08-22 00:46:18,411 INFO amazon.emr.metrics.MetricsSaver'
        ' (main): Thread 1 created MetricsLockFreeSaver 1\n')

    expected_record = {
        'timestamp': '2015-08-22 00:46:18,411',
        'level': 'INFO',
        'logger': 'amazon.emr.metrics.MetricsSaver',
        'thread': 'main',
        'message': 'Thread 1 created MetricsLockFreeSaver 1',
    }

    records = list(_parse_hadoop_log_lines(log_text))
    self.assertEqual(records, [expected_record])
def test_thread(self):
    # Check that a "(main)" marker ahead of the colon ends up in the
    # record's thread field rather than in the logger or the message.
    source = StringIO(
        "2015-08-22 00:46:18,411 INFO amazon.emr.metrics.MetricsSaver"
        " (main): Thread 1 created MetricsLockFreeSaver 1\n"
    )

    wanted = [
        {
            "timestamp": "2015-08-22 00:46:18,411",
            "level": "INFO",
            "logger": "amazon.emr.metrics.MetricsSaver",
            "thread": "main",
            "message": "Thread 1 created MetricsLockFreeSaver 1",
        }
    ]

    actual = list(_parse_hadoop_log_lines(source))
    self.assertEqual(actual, wanted)
def test_log_lines(self):
    # An INFO line followed by an ERROR line; each is expected to come
    # back as a separate record, in input order, with thread=None.
    raw_log = StringIO(
        '15/12/11 13:26:07 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\n'
        '15/12/11 13:26:08 ERROR streaming.StreamJob:'
        ' Error Launching job :'
        ' Output directory already exists\n')

    expected = [
        {
            'timestamp': '15/12/11 13:26:07',
            'level': 'INFO',
            'logger': 'client.RMProxy',
            'thread': None,
            'message': 'Connecting to ResourceManager at /0.0.0.0:8032',
        },
        {
            'timestamp': '15/12/11 13:26:08',
            'level': 'ERROR',
            'logger': 'streaming.StreamJob',
            'thread': None,
            'message': ('Error Launching job :'
                        ' Output directory already exists'),
        },
    ]

    parsed = list(_parse_hadoop_log_lines(raw_log))
    self.assertEqual(parsed, expected)
def test_multiline_message(self):
    # A timestamped line plus two lines without a timestamp, all ending
    # in \r\n; the expected output is one record with a 3-line message.
    log_text = StringIO(
        '2015-08-22 00:47:35,323 INFO org.apache.hadoop.mapreduce.Job'
        ' (main): Counters: 54\r\n'
        ' File System Counters\r\n'
        ' FILE: Number of bytes read=83\r\n')

    # strip \r's, no trailing \n
    expected_message = ('Counters: 54\n'
                        ' File System Counters\n'
                        ' FILE: Number of bytes read=83')
    expected_record = {
        'timestamp': '2015-08-22 00:47:35,323',
        'level': 'INFO',
        'logger': 'org.apache.hadoop.mapreduce.Job',
        'thread': 'main',
        'message': expected_message,
    }

    records = list(_parse_hadoop_log_lines(log_text))
    self.assertEqual(records, [expected_record])
def test_log_lines(self):
    # Plain case: two well-formed, newline-terminated log lines should
    # produce exactly two records.
    source = StringIO(
        '15/12/11 13:26:07 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\n'
        '15/12/11 13:26:08 ERROR streaming.StreamJob:'
        ' Error Launching job :'
        ' Output directory already exists\n')

    first = {
        'timestamp': '15/12/11 13:26:07',
        'level': 'INFO',
        'logger': 'client.RMProxy',
        'thread': None,
        'message': 'Connecting to ResourceManager at /0.0.0.0:8032',
    }
    second = {
        'timestamp': '15/12/11 13:26:08',
        'level': 'ERROR',
        'logger': 'streaming.StreamJob',
        'thread': None,
        'message': ('Error Launching job :'
                    ' Output directory already exists'),
    }

    actual = list(_parse_hadoop_log_lines(source))
    self.assertEqual(actual, [first, second])
def test_empty(self):
    # no input lines should yield no records
    no_lines = []
    parsed = list(_parse_hadoop_log_lines(no_lines))
    self.assertEqual(parsed, [])