def setUp(self):
    """Create an ElasticsearchTaskHandler, a mocked Elasticsearch client with a
    single indexed log document, and a running TaskInstance to read logs for.

    NOTE(review): duplicate of TestElasticsearchTaskHandler.setUp below —
    presumably a leftover fragment; confirm and deduplicate.
    """
    super().setUp()
    self.local_log_location = 'local/log/location'
    self.filename_template = '{try_number}.log'
    self.log_id_template = '{dag_id}-{task_id}-{execution_date}-{try_number}'
    self.end_of_log_mark = 'end_of_log\n'
    self.write_stdout = False
    self.json_format = False
    self.json_fields = 'asctime,filename,lineno,levelname,message,exc_text'
    self.es_task_handler = ElasticsearchTaskHandler(
        self.local_log_location,
        self.filename_template,
        self.log_id_template,
        self.end_of_log_mark,
        self.write_stdout,
        self.json_format,
        self.json_fields,
    )

    self.es = elasticsearch.Elasticsearch(  # pylint: disable=invalid-name
        hosts=[{'host': 'localhost', 'port': 9200}]
    )
    self.index_name = 'test_index'
    self.doc_type = 'log'
    self.test_message = 'some random stuff'
    # Seed one log document whose log_id matches the rendered template for self.ti.
    self.body = {'message': self.test_message, 'log_id': self.LOG_ID, 'offset': 1}
    self.es.index(index=self.index_name, doc_type=self.doc_type, body=self.body, id=1)

    self.dag = DAG(self.DAG_ID, start_date=self.EXECUTION_DATE)
    task = DummyOperator(task_id=self.TASK_ID, dag=self.dag)
    self.ti = TaskInstance(task=task, execution_date=self.EXECUTION_DATE)
    self.ti.try_number = 1
    self.ti.state = State.RUNNING
    self.addCleanup(self.dag.clear)
def test_get_external_log_url(self, es_frontend, expected_url):
    """The frontend template should be rendered into the expected external log URL.

    Parameterized: ``es_frontend`` is the configured frontend template,
    ``expected_url`` the URL the handler should produce for ``self.ti``.
    """
    es_task_handler = ElasticsearchTaskHandler(
        self.local_log_location,
        self.filename_template,
        self.log_id_template,
        self.end_of_log_mark,
        self.write_stdout,
        self.json_format,
        self.json_fields,
        frontend=es_frontend,
    )
    url = es_task_handler.get_external_log_url(self.ti, self.ti.try_number)
    assert expected_url == url
def test_render_log_id(self):
    """_render_log_id should yield the same id from both the plain str.format
    template and the equivalent jinja template."""
    expected_log_id = (
        'dag_for_testing_file_task_handler-'
        'task_for_testing_file_log_handler-2016-01-01T00:00:00+00:00-1'
    )
    log_id = self.es_task_handler._render_log_id(self.ti, 1)
    # Plain asserts for consistency with the rest of this test suite
    # (the other tests use bare assert, not unittest assertEqual).
    assert expected_log_id == log_id

    # Switch to use jinja template.
    self.es_task_handler = ElasticsearchTaskHandler(
        self.local_log_location,
        self.filename_template,
        '{{ ti.dag_id }}-{{ ti.task_id }}-{{ ts }}-{{ try_number }}',
        self.end_of_log_mark,
        self.write_stdout,
        self.json_format,
        self.json_fields,
    )
    log_id = self.es_task_handler._render_log_id(self.ti, 1)
    assert expected_log_id == log_id
def test_client_with_config(self):
    """The [elasticsearch_configs] config section should be read as-is and be
    accepted by the handler via ``es_kwargs`` without raising."""
    es_conf = dict(conf.getsection("elasticsearch_configs"))
    expected_dict = {
        "use_ssl": False,
        "verify_certs": True,
    }
    assert es_conf == expected_dict
    # ensure creating with configs does not fail
    ElasticsearchTaskHandler(
        self.local_log_location,
        self.filename_template,
        self.log_id_template,
        self.end_of_log_mark,
        self.write_stdout,
        self.json_format,
        self.json_fields,
        es_kwargs=es_conf,
    )
class TestElasticsearchTaskHandler(unittest.TestCase):
    """Tests for ElasticsearchTaskHandler, backed by an elasticmock client.

    Fix applied: ``test_read_with_json_format`` previously indexed its document
    with ``id=id`` — passing the Python ``id`` builtin as the document id — now
    corrected to the literal ``1`` used everywhere else in this suite.
    """

    DAG_ID = 'dag_for_testing_file_task_handler'
    TASK_ID = 'task_for_testing_file_log_handler'
    EXECUTION_DATE = datetime(2016, 1, 1)
    # Rendered form of log_id_template for the task instance created in setUp.
    LOG_ID = f'{DAG_ID}-{TASK_ID}-2016-01-01T00:00:00+00:00-1'

    @elasticmock
    def setUp(self):
        super().setUp()
        self.local_log_location = 'local/log/location'
        self.filename_template = '{try_number}.log'
        self.log_id_template = '{dag_id}-{task_id}-{execution_date}-{try_number}'
        self.end_of_log_mark = 'end_of_log\n'
        self.write_stdout = False
        self.json_format = False
        self.json_fields = 'asctime,filename,lineno,levelname,message,exc_text'
        self.es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            self.log_id_template,
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
        )

        self.es = elasticsearch.Elasticsearch(  # pylint: disable=invalid-name
            hosts=[{'host': 'localhost', 'port': 9200}]
        )
        self.index_name = 'test_index'
        self.doc_type = 'log'
        self.test_message = 'some random stuff'
        # Seed one log document whose log_id matches LOG_ID for self.ti.
        self.body = {'message': self.test_message, 'log_id': self.LOG_ID, 'offset': 1}
        self.es.index(index=self.index_name, doc_type=self.doc_type, body=self.body, id=1)

        self.dag = DAG(self.DAG_ID, start_date=self.EXECUTION_DATE)
        task = DummyOperator(task_id=self.TASK_ID, dag=self.dag)
        self.ti = TaskInstance(task=task, execution_date=self.EXECUTION_DATE)
        self.ti.try_number = 1
        self.ti.state = State.RUNNING
        self.addCleanup(self.dag.clear)

    def tearDown(self):
        # Remove the top-level directory of the local log location, if created.
        shutil.rmtree(self.local_log_location.split(os.path.sep)[0], ignore_errors=True)

    def test_client(self):
        assert isinstance(self.es_task_handler.client, elasticsearch.Elasticsearch)

    def test_client_with_config(self):
        es_conf = dict(conf.getsection("elasticsearch_configs"))
        expected_dict = {
            "use_ssl": False,
            "verify_certs": True,
        }
        assert es_conf == expected_dict
        # ensure creating with configs does not fail
        ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            self.log_id_template,
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
            es_kwargs=es_conf,
        )

    def test_read(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {'offset': 0, 'last_log_timestamp': str(ts), 'end_of_log': False}
        )

        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert len(logs[0]) == 1
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_with_match_phrase_query(self):
        # A log_id with the same tokens in a different order must NOT match.
        similar_log_id = '{task_id}-{dag_id}-2016-01-01T00:00:00+00:00-1'.format(
            dag_id=TestElasticsearchTaskHandler.DAG_ID, task_id=TestElasticsearchTaskHandler.TASK_ID
        )
        another_test_message = 'another message'

        another_body = {'message': another_test_message, 'log_id': similar_log_id, 'offset': 1}
        self.es.index(index=self.index_name, doc_type=self.doc_type, body=another_body, id=1)

        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {'offset': '0', 'last_log_timestamp': str(ts), 'end_of_log': False, 'max_offset': 2}
        )
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert self.test_message == logs[0][0][-1]
        assert another_test_message != logs[0]

        assert not metadatas[0]['end_of_log']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_with_none_metadata(self):
        logs, metadatas = self.es_task_handler.read(self.ti, 1)
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) < pendulum.now()

    def test_read_nonexistent_log(self):
        ts = pendulum.now()
        # In ElasticMock, search is going to return all documents with matching index
        # and doc_type regardless of match filters, so we delete the log entry instead
        # of making a new TaskInstance to query.
        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)

        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {'offset': 0, 'last_log_timestamp': str(ts), 'end_of_log': False}
        )
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert not metadatas[0]['end_of_log']
        assert '0' == metadatas[0]['offset']
        # last_log_timestamp won't change if no log lines read.
        assert timezone.parse(metadatas[0]['last_log_timestamp']) == ts

    def test_read_with_empty_metadata(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(self.ti, 1, {})
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        # offset should be initialized to 0 if not provided.
        assert '1' == metadatas[0]['offset']
        # last_log_timestamp will be initialized using log reading time
        # if not last_log_timestamp is provided.
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

        # case where offset is missing but metadata not empty.
        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)
        logs, metadatas = self.es_task_handler.read(self.ti, 1, {'end_of_log': False})
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert not metadatas[0]['end_of_log']
        # offset should be initialized to 0 if not provided.
        assert '0' == metadatas[0]['offset']
        # last_log_timestamp will be initialized using log reading time
        # if not last_log_timestamp is provided.
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_timeout(self):
        ts = pendulum.now().subtract(minutes=5)

        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)

        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {'offset': 0, 'last_log_timestamp': str(ts), 'end_of_log': False}
        )
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert metadatas[0]['end_of_log']
        # offset should be initialized to 0 if not provided.
        assert '0' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) == ts

    def test_read_as_download_logs(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti,
            1,
            {'offset': 0, 'last_log_timestamp': str(ts), 'download_logs': True, 'end_of_log': False},
        )
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert len(logs[0]) == 1
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        assert metadatas[0]['download_logs']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_raises(self):
        with mock.patch.object(self.es_task_handler.log, 'exception') as mock_exception:
            with mock.patch("elasticsearch_dsl.Search.execute") as mock_execute:
                mock_execute.side_effect = Exception('Failed to read')
                logs, metadatas = self.es_task_handler.read(self.ti, 1)
            assert mock_exception.call_count == 1
            args, kwargs = mock_exception.call_args
            assert "Could not read log with log_id:" in args[0]

        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert not metadatas[0]['end_of_log']
        assert '0' == metadatas[0]['offset']

    def test_set_context(self):
        self.es_task_handler.set_context(self.ti)
        assert self.es_task_handler.mark_end_on_close

    def test_set_context_w_json_format_and_write_stdout(self):
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.es_task_handler.formatter = formatter
        self.es_task_handler.write_stdout = True
        self.es_task_handler.json_format = True
        self.es_task_handler.set_context(self.ti)

    def test_read_with_json_format(self):
        ts = pendulum.now()
        formatter = logging.Formatter(
            '[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s - %(exc_text)s'
        )
        self.es_task_handler.formatter = formatter
        self.es_task_handler.json_format = True

        self.body = {
            'message': self.test_message,
            'log_id': f'{self.DAG_ID}-{self.TASK_ID}-2016_01_01T00_00_00_000000-1',
            'offset': 1,
            'asctime': '2020-12-24 19:25:00,962',
            'filename': 'taskinstance.py',
            'lineno': 851,
            'levelname': 'INFO',
        }
        self.es_task_handler.set_context(self.ti)
        # Fixed: was `id=id`, passing the builtin `id` function as document id.
        self.es.index(index=self.index_name, doc_type=self.doc_type, body=self.body, id=1)

        logs, _ = self.es_task_handler.read(
            self.ti, 1, {'offset': 0, 'last_log_timestamp': str(ts), 'end_of_log': False}
        )
        assert "[2020-12-24 19:25:00,962] {taskinstance.py:851} INFO - some random stuff - " == logs[0][0][1]

    def test_close(self):
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.es_task_handler.formatter = formatter

        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
            os.path.join(self.local_log_location, self.filename_template.format(try_number=1))
        ) as log_file:
            # end_of_log_mark may contain characters like '\n' which is needed to
            # have the log uploaded but will not be stored in elasticsearch.
            # so apply the strip() to log_file.read()
            log_line = log_file.read().strip()
            assert self.end_of_log_mark.strip() == log_line
        assert self.es_task_handler.closed

    def test_close_no_mark_end(self):
        self.ti.raw = True
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
            os.path.join(self.local_log_location, self.filename_template.format(try_number=1))
        ) as log_file:
            assert self.end_of_log_mark not in log_file.read()
        assert self.es_task_handler.closed

    def test_close_closed(self):
        self.es_task_handler.closed = True
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
            os.path.join(self.local_log_location, self.filename_template.format(try_number=1))
        ) as log_file:
            assert 0 == len(log_file.read())

    def test_close_with_no_handler(self):
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler = None
        self.es_task_handler.close()
        with open(
            os.path.join(self.local_log_location, self.filename_template.format(try_number=1))
        ) as log_file:
            assert 0 == len(log_file.read())
        assert self.es_task_handler.closed

    def test_close_with_no_stream(self):
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler.stream = None
        self.es_task_handler.close()
        with open(
            os.path.join(self.local_log_location, self.filename_template.format(try_number=1))
        ) as log_file:
            assert self.end_of_log_mark in log_file.read()
        assert self.es_task_handler.closed

        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler.stream.close()
        self.es_task_handler.close()
        with open(
            os.path.join(self.local_log_location, self.filename_template.format(try_number=1))
        ) as log_file:
            assert self.end_of_log_mark in log_file.read()
        assert self.es_task_handler.closed

    def test_render_log_id(self):
        expected_log_id = (
            'dag_for_testing_file_task_handler-'
            'task_for_testing_file_log_handler-2016-01-01T00:00:00+00:00-1'
        )
        log_id = self.es_task_handler._render_log_id(self.ti, 1)
        assert expected_log_id == log_id

        # Switch to use jinja template.
        self.es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            '{{ ti.dag_id }}-{{ ti.task_id }}-{{ ts }}-{{ try_number }}',
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
        )
        log_id = self.es_task_handler._render_log_id(self.ti, 1)
        assert expected_log_id == log_id

    def test_clean_execution_date(self):
        clean_execution_date = self.es_task_handler._clean_execution_date(datetime(2016, 7, 8, 9, 10, 11, 12))
        assert '2016_07_08T09_10_11_000012' == clean_execution_date

    @parameterized.expand(
        [
            # Common case
            ('localhost:5601/{log_id}', 'https://localhost:5601/' + quote(LOG_ID.replace('T', ' '))),
            # Ignore template if "{log_id}"" is missing in the URL
            ('localhost:5601', 'https://localhost:5601'),
        ]
    )
    def test_get_external_log_url(self, es_frontend, expected_url):
        es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            self.log_id_template,
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
            frontend=es_frontend,
        )
        url = es_task_handler.get_external_log_url(self.ti, self.ti.try_number)
        assert expected_url == url