def test_filesystem_event_log_storage_run_corrupted_bad_data():
    with tempfile.TemporaryDirectory() as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        SqlEventLogStorageMetadata.create_all(create_engine(storage.conn_string_for_shard("foo")))
        with storage.run_connection("foo") as conn:
            event_insert = (
                SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
                    run_id="foo", event="{bar}", dagster_event_type=None, timestamp=None
                )
            )
            conn.execute(event_insert)
        with pytest.raises(DagsterEventLogInvalidForRun):
            storage.get_logs_for_run("foo")

        SqlEventLogStorageMetadata.create_all(create_engine(storage.conn_string_for_shard("bar")))
        with storage.run_connection("bar") as conn:
            event_insert = (
                SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
                    run_id="bar", event="3", dagster_event_type=None, timestamp=None
                )
            )
            conn.execute(event_insert)
        with pytest.raises(DagsterEventLogInvalidForRun):
            storage.get_logs_for_run("bar")
def test_filesystem_event_log_storage_run_corrupted_bad_data():
    with seven.TemporaryDirectory() as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        SqlEventLogStorageMetadata.create_all(create_engine(storage.conn_string_for_run_id('foo')))
        with storage.connect('foo') as conn:
            event_insert = SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
                run_id='foo', event='{bar}', dagster_event_type=None, timestamp=None
            )
            conn.execute(event_insert)
        with pytest.raises(DagsterEventLogInvalidForRun):
            storage.get_logs_for_run('foo')

        SqlEventLogStorageMetadata.create_all(create_engine(storage.conn_string_for_run_id('bar')))
        with storage.connect('bar') as conn:
            event_insert = SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
                run_id='bar', event='3', dagster_event_type=None, timestamp=None
            )
            conn.execute(event_insert)
        with pytest.raises(DagsterEventLogInvalidForRun):
            storage.get_logs_for_run('bar')
def test_fs_stores():
    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info('easy')
            return 'easy'

        easy()

    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = SqliteEventLogStorage(temp_dir)
        compute_log_manager = LocalComputeLogManager(temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=compute_log_manager,
        )
        result = execute_pipeline(simple, instance=instance)

        assert run_store.has_run(result.run_id)
        assert run_store.get_run_by_id(result.run_id).status == PipelineRunStatus.SUCCESS
        assert DagsterEventType.PIPELINE_SUCCESS in [
            event.dagster_event.event_type
            for event in event_store.get_logs_for_run(result.run_id)
            if event.is_dagster_event
        ]
        stats = event_store.get_stats_for_run(result.run_id)
        assert stats.steps_succeeded == 1
        assert stats.end_time is not None
def test_filesystem_event_log_storage_run_corrupted():
    with seven.TemporaryDirectory() as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        with open(storage.filepath_for_run_id('foo'), 'w') as fd:
            fd.write('some nonsense')
        with pytest.raises(EventLogInvalidForRun) as exc:
            storage.get_logs_for_run('foo')
        assert exc.value.run_id == 'foo'
def cmd(self, exceptions, tmpdir_path):
    storage = SqliteEventLogStorage(tmpdir_path)
    try:
        storage.get_logs_for_run_by_log_id("foo")
    except Exception as exc:  # pylint: disable=broad-except
        exceptions.put(exc)
        exc_info = sys.exc_info()
        traceback.print_tb(exc_info[2])
def test_filesystem_event_log_storage_run_corrupted():
    with seven.TemporaryDirectory() as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        # The connection string begins with "sqlite:///", so slicing off the
        # first ten characters recovers the filesystem path of the db file.
        with open(os.path.abspath(storage.conn_string_for_run_id('foo')[10:]), 'w') as fd:
            fd.write('some nonsense')
        with pytest.raises(sqlalchemy.exc.DatabaseError):
            storage.get_logs_for_run('foo')
def cmd(exceptions, tmpdir_path):
    storage = SqliteEventLogStorage(tmpdir_path)
    try:
        with storage.connect('foo'):
            pass
    except Exception as exc:  # pylint: disable=broad-except
        exceptions.put(exc)
        exc_info = sys.exc_info()
        traceback.print_tb(exc_info[2])
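
# A minimal sketch of how a helper like `cmd` above is typically driven: it is
# spawned in a separate process with a shared queue, and the parent asserts that
# the child reported no exception. The multiprocessing wiring and the function
# name below are assumptions for illustration, not taken from the source.
def run_cmd_in_subprocess_sketch(tmpdir_path):
    import multiprocessing

    exceptions = multiprocessing.Queue()
    proc = multiprocessing.Process(target=cmd, args=(exceptions, tmpdir_path))
    proc.start()
    proc.join()
    # Any exception put on the queue by the child indicates a cross-process
    # storage access failure.
    assert exceptions.empty()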
def event_log_storage(self):  # pylint: disable=arguments-differ
    # Make the temp dir in the cwd, since the default temp roots have issues
    # with filesystem-notification-based event log watching.
    with tempfile.TemporaryDirectory(dir=os.getcwd()) as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        try:
            yield storage
        finally:
            storage.dispose()
def test_fs_stores(): @pipeline def simple(): @solid def easy(context): context.log.info("easy") return "easy" easy() with tempfile.TemporaryDirectory() as temp_dir: with environ({"DAGSTER_HOME": temp_dir}): run_store = SqliteRunStorage.from_local(temp_dir) event_store = SqliteEventLogStorage(temp_dir) compute_log_manager = LocalComputeLogManager(temp_dir) instance = DagsterInstance( instance_type=InstanceType.PERSISTENT, local_artifact_storage=LocalArtifactStorage(temp_dir), run_storage=run_store, event_storage=event_store, compute_log_manager=compute_log_manager, run_coordinator=DefaultRunCoordinator(), run_launcher=DefaultRunLauncher(), ref=InstanceRef.from_dir(temp_dir), settings={"telemetry": { "enabled": False }}, ) result = execute_pipeline(simple, instance=instance) assert run_store.has_run(result.run_id) assert run_store.get_run_by_id( result.run_id).status == PipelineRunStatus.SUCCESS assert DagsterEventType.PIPELINE_SUCCESS in [ event.dagster_event.event_type for event in event_store.get_logs_for_run(result.run_id) if event.is_dagster_event ] stats = event_store.get_stats_for_run(result.run_id) assert stats.steps_succeeded == 1 assert stats.end_time is not None
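
# The `environ` helper used above temporarily sets environment variables and
# restores the prior values on exit. A minimal sketch of the idea, assuming
# string values; the real utility lives outside this snippet.
import contextlib
import os


@contextlib.contextmanager
def environ_sketch(env):
    previous = {key: os.environ.get(key) for key in env}
    os.environ.update(env)
    try:
        yield
    finally:
        # Restore each variable, removing any that did not exist before.
        for key, value in previous.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value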
def test_filesystem_event_log_storage_run_corrupted_bad_data():
    with seven.TemporaryDirectory() as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        with storage._connect('foo') as conn:  # pylint: disable=protected-access
            conn.cursor().execute(CREATE_EVENT_LOG_SQL)
            conn.cursor().execute(INSERT_EVENT_SQL, ('{bar}', None, None))
        with pytest.raises(EventLogInvalidForRun) as exc:
            storage.get_logs_for_run('foo')
        assert exc.value.run_id == 'foo'

        with storage._connect('bar') as conn:  # pylint: disable=protected-access
            conn.cursor().execute(CREATE_EVENT_LOG_SQL)
            conn.cursor().execute(INSERT_EVENT_SQL, ('3', None, None))
        with pytest.raises(EventLogInvalidForRun) as exc:
            storage.get_logs_for_run('bar')
        assert exc.value.run_id == 'bar'
def broken_compute_log_manager_instance(fail_on_setup=False, fail_on_teardown=False):
    with tempfile.TemporaryDirectory() as temp_dir:
        with environ({"DAGSTER_HOME": temp_dir}):
            yield DagsterInstance(
                instance_type=InstanceType.PERSISTENT,
                local_artifact_storage=LocalArtifactStorage(temp_dir),
                run_storage=SqliteRunStorage.from_local(temp_dir),
                event_storage=SqliteEventLogStorage(temp_dir),
                compute_log_manager=BrokenComputeLogManager(
                    fail_on_setup=fail_on_setup, fail_on_teardown=fail_on_teardown
                ),
                run_coordinator=DefaultRunCoordinator(),
                run_launcher=DefaultRunLauncher(),
                ref=InstanceRef.from_dir(temp_dir),
            )
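
# A hedged usage sketch for the generator above (assumed to be wrapped with
# @contextmanager elsewhere): run a trivial job against the instance and check
# that a compute log manager failing on setup does not raise out of the run.
# The op, job, and assertion here are illustrative assumptions, not from the source.
@op
def trivial_op_sketch():
    return 1


@job
def trivial_job_sketch():
    trivial_op_sketch()


def test_broken_compute_log_manager_setup_sketch():
    with broken_compute_log_manager_instance(fail_on_setup=True) as instance:
        result = trivial_job_sketch.execute_in_process(instance=instance, raise_on_error=False)
        # The run is created and completes despite the broken log manager.
        assert result.run_id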
def test_filesystem_event_log_storage_store_events_and_wipe():
    with seven.TemporaryDirectory() as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        assert len(storage.get_logs_for_run('foo')) == 0
        storage.store_event(
            DagsterEventRecord(
                None,
                'Message2',
                'debug',
                '',
                'foo',
                time.time(),
                dagster_event=DagsterEvent(
                    DagsterEventType.ENGINE_EVENT.value,
                    'nonce',
                    event_specific_data=EngineEventData.in_process(999),
                ),
            )
        )
        assert len(storage.get_logs_for_run('foo')) == 1
        storage.wipe()
        assert len(storage.get_logs_for_run('foo')) == 0
def test_compute_log_manager_skip_empty_upload(mock_s3_bucket):
    @op
    def easy(context):
        context.log.info("easy")

    @job
    def simple():
        easy()

    with tempfile.TemporaryDirectory() as temp_dir:
        with environ({"DAGSTER_HOME": temp_dir}):
            run_store = SqliteRunStorage.from_local(temp_dir)
            event_store = SqliteEventLogStorage(temp_dir)
            PREFIX = "my_prefix"
            manager = S3ComputeLogManager(
                bucket=mock_s3_bucket.name, prefix=PREFIX, skip_empty_files=True
            )
            instance = DagsterInstance(
                instance_type=InstanceType.PERSISTENT,
                local_artifact_storage=LocalArtifactStorage(temp_dir),
                run_storage=run_store,
                event_storage=event_store,
                compute_log_manager=manager,
                run_coordinator=DefaultRunCoordinator(),
                run_launcher=DefaultRunLauncher(),
                ref=InstanceRef.from_dir(temp_dir),
            )
            result = simple.execute_in_process(instance=instance)

            stderr_object = mock_s3_bucket.Object(
                key=f"{PREFIX}/storage/{result.run_id}/compute_logs/easy.err"
            ).get()
            assert stderr_object

            with pytest.raises(ClientError):
                # stdout is not uploaded because we do not print anything to stdout
                mock_s3_bucket.Object(
                    key=f"{PREFIX}/storage/{result.run_id}/compute_logs/easy.out"
                ).get()
def test_compute_log_manager_with_envvar(gcs_bucket):
    @job
    def simple():
        @op
        def easy(context):
            context.log.info("easy")
            print(HELLO_WORLD)  # pylint: disable=print-call
            return "easy"

        easy()

    with open(os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")) as f:
        with tempfile.TemporaryDirectory() as temp_dir:
            with environ({"ENV_VAR": f.read(), "DAGSTER_HOME": temp_dir}):
                run_store = SqliteRunStorage.from_local(temp_dir)
                event_store = SqliteEventLogStorage(temp_dir)
                manager = GCSComputeLogManager(
                    bucket=gcs_bucket,
                    prefix="my_prefix",
                    local_dir=temp_dir,
                    json_credentials_envvar="ENV_VAR",
                )
                instance = DagsterInstance(
                    instance_type=InstanceType.PERSISTENT,
                    local_artifact_storage=LocalArtifactStorage(temp_dir),
                    run_storage=run_store,
                    event_storage=event_store,
                    compute_log_manager=manager,
                    run_coordinator=DefaultRunCoordinator(),
                    run_launcher=DefaultRunLauncher(),
                    ref=InstanceRef.from_dir(temp_dir),
                )
                result = simple.execute_in_process(instance=instance)

                compute_steps = [
                    event.step_key
                    for event in result.all_node_events
                    if event.event_type == DagsterEventType.STEP_START
                ]
                assert len(compute_steps) == 1
                step_key = compute_steps[0]

                stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
                assert stdout.data == HELLO_WORLD + SEPARATOR

                stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
                for expected in EXPECTED_LOGS:
                    assert expected in stderr.data

                # Check GCS directly
                stderr_gcs = (
                    storage.Client()
                    .bucket(gcs_bucket)
                    .blob(f"my_prefix/storage/{result.run_id}/compute_logs/easy.err")
                    .download_as_bytes()
                    .decode("utf-8")
                )
                for expected in EXPECTED_LOGS:
                    assert expected in stderr_gcs

                # Check download behavior by deleting locally cached logs
                compute_logs_dir = os.path.join(temp_dir, result.run_id, "compute_logs")
                for filename in os.listdir(compute_logs_dir):
                    os.unlink(os.path.join(compute_logs_dir, filename))

                stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
                assert stdout.data == HELLO_WORLD + SEPARATOR

                stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
                for expected in EXPECTED_LOGS:
                    assert expected in stderr.data
def event_log_storage(self):  # pylint: disable=arguments-differ
    with tempfile.TemporaryDirectory() as tmpdir_path:
        yield SqliteEventLogStorage(tmpdir_path)
def test_compute_log_manager(mock_s3_bucket):
    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info("easy")
            print(HELLO_WORLD)  # pylint: disable=print-call
            return "easy"

        easy()

    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = SqliteEventLogStorage(temp_dir)
        manager = S3ComputeLogManager(
            bucket=mock_s3_bucket.name, prefix="my_prefix", local_dir=temp_dir
        )
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=manager,
            run_coordinator=DefaultRunCoordinator(),
            run_launcher=DefaultRunLauncher(),
        )
        result = execute_pipeline(simple, instance=instance)

        compute_steps = [
            event.step_key
            for event in result.step_event_list
            if event.event_type == DagsterEventType.STEP_START
        ]
        assert len(compute_steps) == 1
        step_key = compute_steps[0]

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data

        # Check S3 directly
        s3_object = mock_s3_bucket.Object(
            key="{prefix}/storage/{run_id}/compute_logs/easy.err".format(
                prefix="my_prefix", run_id=result.run_id
            ),
        )
        stderr_s3 = six.ensure_str(s3_object.get()["Body"].read())
        for expected in EXPECTED_LOGS:
            assert expected in stderr_s3

        # Check download behavior by deleting locally cached logs
        compute_logs_dir = os.path.join(temp_dir, result.run_id, "compute_logs")
        for filename in os.listdir(compute_logs_dir):
            os.unlink(os.path.join(compute_logs_dir, filename))

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data
def create_sqlite_run_event_logstorage():
    with seven.TemporaryDirectory() as tmpdir_path:
        yield SqliteEventLogStorage(tmpdir_path)
def test_compute_log_manager(s3_bucket):
    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info('easy')
            print(HELLO_WORLD)
            return 'easy'

        easy()

    # Uses mock S3
    s3 = boto3.client('s3')
    s3.create_bucket(Bucket=s3_bucket)

    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = SqliteEventLogStorage(temp_dir)
        manager = S3ComputeLogManager(bucket=s3_bucket, prefix='my_prefix', local_dir=temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=manager,
            run_launcher=CliApiRunLauncher(),
        )
        result = execute_pipeline(simple, instance=instance)

        compute_steps = [
            event.step_key
            for event in result.step_event_list
            if event.event_type == DagsterEventType.STEP_START
        ]
        assert len(compute_steps) == 1
        step_key = compute_steps[0]

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data

        # Check S3 directly
        s3_object = s3.get_object(
            Bucket=s3_bucket,
            Key='{prefix}/storage/{run_id}/compute_logs/easy.compute.err'.format(
                prefix='my_prefix', run_id=result.run_id
            ),
        )
        stderr_s3 = six.ensure_str(s3_object['Body'].read())
        for expected in EXPECTED_LOGS:
            assert expected in stderr_s3

        # Check download behavior by deleting locally cached logs
        compute_logs_dir = os.path.join(temp_dir, result.run_id, 'compute_logs')
        for filename in os.listdir(compute_logs_dir):
            os.unlink(os.path.join(compute_logs_dir, filename))

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data
def test_compute_log_manager(mock_s3_bucket):
    @op
    def easy(context):
        context.log.info("easy")
        print(HELLO_WORLD)  # pylint: disable=print-call
        return "easy"

    @job
    def simple():
        easy()

    with tempfile.TemporaryDirectory() as temp_dir:
        with environ({"DAGSTER_HOME": temp_dir}):
            run_store = SqliteRunStorage.from_local(temp_dir)
            event_store = SqliteEventLogStorage(temp_dir)
            manager = S3ComputeLogManager(
                bucket=mock_s3_bucket.name, prefix="my_prefix", local_dir=temp_dir
            )
            instance = DagsterInstance(
                instance_type=InstanceType.PERSISTENT,
                local_artifact_storage=LocalArtifactStorage(temp_dir),
                run_storage=run_store,
                event_storage=event_store,
                compute_log_manager=manager,
                run_coordinator=DefaultRunCoordinator(),
                run_launcher=DefaultRunLauncher(),
                ref=InstanceRef.from_dir(temp_dir),
            )
            result = simple.execute_in_process(instance=instance)

            compute_steps = [
                event.step_key
                for event in result.all_node_events
                if event.event_type == DagsterEventType.STEP_START
            ]
            assert len(compute_steps) == 1
            step_key = compute_steps[0]

            stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
            assert stdout.data == HELLO_WORLD + SEPARATOR

            stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
            for expected in EXPECTED_LOGS:
                assert expected in stderr.data

            # Check S3 directly
            s3_object = mock_s3_bucket.Object(
                key=f"my_prefix/storage/{result.run_id}/compute_logs/easy.err"
            )
            stderr_s3 = s3_object.get()["Body"].read().decode("utf-8")
            for expected in EXPECTED_LOGS:
                assert expected in stderr_s3

            # Check download behavior by deleting locally cached logs
            compute_logs_dir = os.path.join(temp_dir, result.run_id, "compute_logs")
            for filename in os.listdir(compute_logs_dir):
                os.unlink(os.path.join(compute_logs_dir, filename))

            stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
            assert stdout.data == HELLO_WORLD + SEPARATOR

            stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
            for expected in EXPECTED_LOGS:
                assert expected in stderr.data
def test_filesystem_event_log_storage_run_not_found():
    with seven.TemporaryDirectory() as tmpdir_path:
        storage = SqliteEventLogStorage(tmpdir_path)
        assert storage.get_logs_for_run('bar') == []
def create_default_sqlite_event_log_instance():
    with tempfile.TemporaryDirectory() as temp_dir:
        asset_storage = SqliteEventLogStorage(temp_dir)
        instance = get_instance(temp_dir, asset_storage)
        yield [instance, asset_storage]
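
# `get_instance` above is assumed to build a DagsterInstance wired to the given
# event log storage, mirroring the construction pattern used elsewhere in this
# file. A minimal sketch; the actual helper is defined outside this snippet.
def get_instance_sketch(temp_dir, event_storage):
    return DagsterInstance(
        instance_type=InstanceType.PERSISTENT,
        local_artifact_storage=LocalArtifactStorage(temp_dir),
        run_storage=SqliteRunStorage.from_local(temp_dir),
        event_storage=event_storage,
        compute_log_manager=LocalComputeLogManager(temp_dir),
        run_coordinator=DefaultRunCoordinator(),
        run_launcher=DefaultRunLauncher(),
        ref=InstanceRef.from_dir(temp_dir),
    )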
def test_compute_log_manager(
    mock_create_blob_client, mock_generate_blob_sas, storage_account, container, credential
):
    mock_generate_blob_sas.return_value = "fake-url"
    fake_client = FakeBlobServiceClient(storage_account)
    mock_create_blob_client.return_value = fake_client

    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info("easy")
            print(HELLO_WORLD)  # pylint: disable=print-call
            return "easy"

        easy()

    with tempfile.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = SqliteEventLogStorage(temp_dir)
        manager = AzureBlobComputeLogManager(
            storage_account=storage_account,
            container=container,
            prefix="my_prefix",
            local_dir=temp_dir,
            secret_key=credential,
        )
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=manager,
            run_coordinator=DefaultRunCoordinator(),
            run_launcher=SyncInMemoryRunLauncher(),
        )
        result = execute_pipeline(simple, instance=instance)

        compute_steps = [
            event.step_key
            for event in result.step_event_list
            if event.event_type == DagsterEventType.STEP_START
        ]
        assert len(compute_steps) == 1
        step_key = compute_steps[0]

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data

        # Check ADLS2 directly
        adls2_object = fake_client.get_blob_client(
            container=container,
            blob="{prefix}/storage/{run_id}/compute_logs/easy.err".format(
                prefix="my_prefix", run_id=result.run_id
            ),
        )
        adls2_stderr = adls2_object.download_blob().readall().decode("utf-8")
        for expected in EXPECTED_LOGS:
            assert expected in adls2_stderr

        # Check download behavior by deleting locally cached logs
        compute_logs_dir = os.path.join(temp_dir, result.run_id, "compute_logs")
        for filename in os.listdir(compute_logs_dir):
            os.unlink(os.path.join(compute_logs_dir, filename))

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data