def test_pull(run_test):
    """Round-trip a bundle through S3: push it, wipe the local context, pull it back."""
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    bucket = s3_resource.Bucket(TEST_BUCKET)

    # Both the bucket and the local context start out empty.
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    # Produce a bundle and push it to the bound remote.
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)
    api.apply(TEST_CONTEXT, RemoteTest)
    bundle = api.get(TEST_CONTEXT, 'remote_test')
    assert bundle.data == 'Hello'
    bundle.commit()
    bundle.push()

    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Recreate the context from scratch and pull the bundle back down.
    api.delete_context(context_name=TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)
    api.pull(TEST_CONTEXT)

    pulled_bundles = api.search(TEST_CONTEXT)
    assert len(pulled_bundles) > 0, 'Pulled bundles down'
    assert pulled_bundles[0].data == 'Hello', 'Bundle contains correct data'

    # Clean up the moto bucket.
    bucket.objects.all().delete()
    bucket.delete()
def test_string_task():
    """A string payload stored via api.Bundle round-trips unchanged."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='string_task', data='output')

    retrieved = api.get(TEST_CONTEXT, 'string_task').data
    assert retrieved == 'output', 'Data did not match output'
    assert type(retrieved) == six.text_type, 'Data is not string'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_float_task():
    """A float payload stored via api.Bundle round-trips unchanged."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='float_task', data=2.5)

    retrieved = api.get(TEST_CONTEXT, 'float_task').data
    assert retrieved == 2.5, 'Data did not match output'
    assert type(retrieved) == float, 'Data is not float'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_list_task():
    """A list payload is stored and comes back as a numpy array with equal contents."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='list_task', data=[1, 2, 3])

    retrieved = api.get(TEST_CONTEXT, 'list_task').data
    assert np.array_equal(retrieved, [1, 2, 3]), 'Data did not match output'
    assert type(retrieved) == np.ndarray, 'Data is not list'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_dependant_tasks():
    """Applying task 'C' also materializes its upstream dependencies (3 bundles total)."""
    # NOTE(review): "dependant" is a typo for "dependent", but renaming the test
    # would change how it is selected/reported, so the name is left as-is.
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'C')

    result = api.get(TEST_CONTEXT, 'c').data
    assert result == 6, 'Data did not match output'
    assert type(result) == int, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 3, 'Three bundles should be present'
def test_int_task():
    """IntTask produces a single bundle holding the int 1."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, IntTask)

    result = api.get(TEST_CONTEXT, 'int_task').data
    assert result == 1, 'Data did not match output'
    assert type(result) == int, 'Data is not int'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_dict_task():
    """Running DictTask produces a single dict bundle with the expected payload."""
    # NOTE(review): a second test_dict_task appears later in this file; if both
    # live in the same module, this definition is shadowed and never collected
    # by pytest — confirm and rename one of them.
    setup()
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, DictTask)

    payload = api.get(TEST_CONTEXT, 'dict_task').data
    assert payload == {'hello': ['world']}, 'Data did not match output'
    assert type(payload) == dict, 'Data is not dict'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_dict_task():
    """A dict stored directly through api.Bundle round-trips unchanged."""
    # NOTE(review): an earlier test_dict_task appears in this file; if both
    # live in the same module, this definition shadows it — confirm and
    # rename one of them so both tests actually run.
    setup()
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    original = {'hello': ['world']}
    _ = api.Bundle(TEST_CONTEXT, name='dict_task', data=original)

    retrieved = api.get(TEST_CONTEXT, 'dict_task').data
    assert retrieved == {'hello': ['world']}, 'Data did not match output'
    assert type(retrieved) == dict, 'Data is not dict'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_non_managed_local():
    """NonManagedLocal leaves its output file on the local filesystem."""
    api.delete_context(TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, NonManagedLocal)
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'

    print(api.cat(TEST_CONTEXT, 'b1'))

    local_path = api.search(TEST_CONTEXT, human_name='b1')[0].data['file'][0]
    assert os.path.exists(local_path), \
        'Local file should be present in bundle'
def test_df_task():
    """A DataFrame payload round-trips equal and typed as pd.DataFrame."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    frame = pd.DataFrame()
    frame['a'] = [1, 2, 3]
    _ = api.Bundle(TEST_CONTEXT, name='df_task', data=frame)

    retrieved = api.get(TEST_CONTEXT, 'df_task').data
    assert frame.equals(retrieved), 'Data did not match output'
    assert type(retrieved) == pd.DataFrame, 'Data is not df'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_file_task():
    """FileTask emits a path whose file content is the string '5'."""
    # NOTE(review): another test_file_task appears later in this file; if both
    # live in the same module, this definition is shadowed and never collected
    # by pytest — confirm and rename one of them.
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, FileTask)

    output_path = api.get(TEST_CONTEXT, 'file_task').data
    with open(output_path) as f:
        contents = f.read()

    assert contents == '5', 'Data did not match output'
    assert type(output_path) == str, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_single_task():
    """Task 'A' runs once; re-applying it unchanged does not create a new bundle."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'A')
    result = api.get(TEST_CONTEXT, 'a').data
    assert result == 2, 'Data did not match output'
    assert type(result) == int, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'

    # Re-running the identical task should be a no-op for bundle count.
    api.apply(TEST_CONTEXT, 'A')
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_push(run_test):
    """Committing and pushing a bundle lands its objects in the S3 bucket."""
    # NOTE(review): another test_push appears later in this file; if both live
    # in the same module, one definition shadows the other — confirm.
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3', region_name='us-east-1')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    bucket = s3_resource.Bucket(TEST_BUCKET)

    # Bucket and context both start empty.
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    # Create a bundle, commit, and push it to the bound remote.
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)
    _ = api.Bundle(TEST_CONTEXT, name='remote_test', data='Hello')
    bundle = api.get(TEST_CONTEXT, 'remote_test')
    assert bundle.data == 'Hello'
    bundle.commit()
    bundle.push()

    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Clean up the moto bucket.
    bucket.objects.all().delete()
    bucket.delete()
def test_remote_push_managed_s3():
    """ManagedS3 with incremental_push writes output to the remote bucket, not locally."""
    api.delete_context(TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)

    # Set up moto-backed S3 and verify the bucket starts empty.
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'

    # Bind the remote and run the pipeline with incremental push enabled.
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)
    api.apply(TEST_CONTEXT, ManagedS3, incremental_push=True)

    assert not os.path.exists(api.search(TEST_CONTEXT, human_name='b4')[0].data['file'][0]), \
        'Managed S3 file should not be copied to local'

    # The pipeline's output file should exist under some key in the bucket.
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    basenames = [obj['Key'].split('/')[-1] for obj in listing['Contents']]
    for output_file in ['test.parquet']:
        assert output_file in basenames, 'Pipeline should have pushed file'
def test_bundle_push_delocalize():
    """Push bundles individually with delocalize=True and verify their file
    references become s3:// paths; then pull with localize and check md5s.
    """
    s3_client = _setup()

    bundles = {}
    for i in range(3):
        name = "shark{}".format(i)
        bundles[name] = create_local_file_bundle(name)
        bundles[name].push(delocalize=True)

    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Check for delocalization: every data reference should now point at S3.
    for b in bundles.values():
        for i, f in enumerate(b.data):
            assert f.startswith("s3://")

    # Might as well test the push and pull worked: wipe, pull with localize,
    # and compare each file's md5 against the checksum stored in the tags.
    api.rm(TEST_CONTEXT, rm_all=True)
    api.pull(TEST_CONTEXT, localize=True)
    found_bundles = {b.name: b for b in api.search(TEST_CONTEXT)}
    for n, b in found_bundles.items():
        for i, f in enumerate(b.data):
            assert md5_file(f) == b.tags["f{}".format(i)]

    api.delete_context(TEST_CONTEXT)
def test_fast_push():
    """api.push the whole context, then pull with localize and verify md5 tags."""
    s3_client = _setup()

    bundles = {}
    for i in range(3):
        name = "shark{}".format(i)
        bundles[name] = create_local_file_bundle(name)
        # need this to put the bundles in the remote b/c of moto mp issues
        # Moto keeps s3 xfers in memory, so multiprocessing will succeed
        # But when the subprocess exits, the files will disappear
        # This is basically useless in a test.
        bundles[name].push()

    api.push(TEST_CONTEXT)  # push and remote all data, then pull and localize.

    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Wipe the context, pull everything back localized, and compare each
    # file's md5 against the checksum recorded in the bundle tags.
    api.rm(TEST_CONTEXT, rm_all=True)
    api.pull(TEST_CONTEXT, localize=True)
    found_bundles = {b.name: b for b in api.search(TEST_CONTEXT)}
    for n, b in found_bundles.items():
        for i, f in enumerate(b.data):
            assert md5_file(f) == b.tags["f{}".format(i)]

    api.delete_context(TEST_CONTEXT)
def test_push():
    """Apply RemoteTest, commit and push its bundle, and verify the S3 contents."""
    api.context(context_name=TEST_CONTEXT)
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    bucket = s3_resource.Bucket(TEST_BUCKET)

    # Bucket and context both start empty.
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL, force=True)
    api.apply(TEST_CONTEXT, 'RemoteTest')
    bundle = api.get(TEST_CONTEXT, 'remote_test')
    assert bundle.data == 'Hello'
    bundle.commit()
    bundle.push()

    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Clean up the moto bucket and the context.
    bucket.objects.all().delete()
    bucket.delete()
    api.delete_context(context_name=TEST_CONTEXT)
def test_task_with_parameter():
    """Different parameter values produce distinct bundles for the same task."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'B', params={'n': 10})
    result = api.get(TEST_CONTEXT, 'b').data
    assert result == 20, 'Data did not match output'
    assert type(result) == int, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 1, 'One bundle should be present'

    # A new parameter value forces a re-run and a second bundle.
    api.apply(TEST_CONTEXT, 'B', params={'n': 20})
    result = api.get(TEST_CONTEXT, 'b').data
    assert result == 40, 'Data did not match output'
    assert type(result) == int, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 2, 'Two bundles should be present'
def test_remote_no_push_non_managed_s3():
    """With a remote bound, NonManagedS3 output is copied to the remote rather
    than kept on the local filesystem."""
    api.delete_context(TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)

    # Set up moto-backed S3 and verify both buckets start empty.
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    s3_resource.create_bucket(Bucket=TEST_BUCKET_OTHER)
    for bucket_name in (TEST_BUCKET, TEST_BUCKET_OTHER):
        listing = s3_client.list_objects(Bucket=bucket_name)
        assert 'Contents' not in listing, 'Bucket should be empty'

    # Bind remote context
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)

    # Apply
    api.apply(TEST_CONTEXT, NonManagedS3)
    print(api.cat(TEST_CONTEXT, 'b2'))

    # Local context should not contain file if a remote exists.
    b = api.search(TEST_CONTEXT, human_name='b2')[0]
    assert not os.path.exists(b.data['file'][0]), \
        'Non Managed S3 file w/ remote should be copied to remote'
    assert b.data['file'][0].startswith("s3://")
def test_independent_context():
    """Bundles created in one context are invisible to another context."""
    context_1_name = '__test_context_1__'
    context_2_name = '__test_context_2__'
    api.context(context_1_name)
    api.context(context_2_name)

    api.apply(context_1_name, ContextTest)
    assert len(api.search(context_1_name)) == 1, 'Only one bundle should be in context one'
    assert len(api.search(context_2_name)) == 0, 'Context two should be empty'

    # Tear down both contexts and confirm they are gone.
    api.delete_context(context_name=context_1_name)
    api.delete_context(context_name=context_2_name)
    assert context_1_name not in api.ls_contexts(), 'Contexts should be removed'
    assert context_2_name not in api.ls_contexts(), 'Contexts should be removed'
def test_child_task_with_parameter():
    """Parameterized parent task 'C' reruns its dependency chain when params change."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'C', params={'n': 10})
    result = api.get(TEST_CONTEXT, 'c').data
    assert result == 22, 'Data did not match output'
    assert type(result) == int, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 3, 'Three bundles should be present'

    # A new parameter value reruns the parameterized part of the chain.
    api.apply(TEST_CONTEXT, 'C', params={'n': 20})
    result = api.get(TEST_CONTEXT, 'c').data
    assert result == 42, 'Data did not match output'
    assert type(result) == int, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 5, 'Five bundles should be present'
def test_no_remote_no_push_managed_s3():
    """Applying ManagedS3 with no remote bound to the context must raise.

    Without a remote there is nowhere for managed S3 output to live, so
    api.apply is expected to fail.
    """
    api.delete_context(TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    # The unused `as e` binding was removed.  Exception is deliberately broad
    # here; the exact type raised is project-defined — narrow it if known.
    with pytest.raises(Exception):
        api.apply(TEST_CONTEXT, ManagedS3)
def test_file_task():
    """A file added through the Bundle context manager round-trips on disk."""
    # NOTE(review): an earlier test_file_task appears in this file; if both
    # live in the same module, this definition shadows it — confirm and
    # rename one of them so both tests actually run.
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    with api.Bundle(TEST_CONTEXT, name='file_task') as b:
        f1 = b.get_file("test.txt")
        with open(f1, mode='w') as f:
            f.write('5')
        b.add_data(f1)

    output_path = api.get(TEST_CONTEXT, 'file_task').data
    with open(output_path) as f:
        contents = f.read()

    assert contents == '5', 'Data did not match output'
    assert type(output_path) == str, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_A2_A3(run_test):
    """
    2.) Run A, Run A, should re-use
    3.) Run A, Run A*, should re-run
    """
    result = api.apply(TEST_CONTEXT, A)
    assert result['did_work'] is True
    first_A_uuid = api.get(TEST_CONTEXT, 'A').uuid

    # Identical re-run: no work done, same bundle re-used.
    result = api.apply(TEST_CONTEXT, A)
    assert result['did_work'] is False
    second_A_uuid = api.get(TEST_CONTEXT, 'A').uuid
    assert first_A_uuid == second_A_uuid
    # Bug fix: compare counts with ==, not `is`.  Identity on int literals is
    # a CPython small-int caching detail and a SyntaxWarning on Python 3.8+.
    assert len(api.search(TEST_CONTEXT, 'A')) == 1

    # Mod args, should re-run
    result = api.apply(TEST_CONTEXT, A, params={'a': 2, 'b': 3})
    assert result['did_work'] is True
    next_A_uuid = api.get(TEST_CONTEXT, 'A').uuid
    assert next_A_uuid != second_A_uuid
    assert len(api.search(TEST_CONTEXT, 'A')) == 2
def test_no_remote_push_non_managed_s3():
    """Without a remote, NonManagedS3 output is copied to the local filesystem."""
    api.delete_context(TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)

    # Set up moto-backed S3 and verify both buckets start empty.
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    s3_resource.create_bucket(Bucket=TEST_BUCKET_OTHER)
    for bucket_name in (TEST_BUCKET, TEST_BUCKET_OTHER):
        listing = s3_client.list_objects(Bucket=bucket_name)
        assert 'Contents' not in listing, 'Bucket should be empty'

    api.apply(TEST_CONTEXT, NonManagedS3, incremental_push=True)
    print(api.cat(TEST_CONTEXT, 'b2'))

    assert len(api.search(TEST_CONTEXT)) == 1, 'One bundle should be present'
    assert os.path.exists(api.search(TEST_CONTEXT, human_name='b2')[0].data['file'][0]), \
        'Non Managed S3 file should be copied to local'
def test_add_with_treat_as_bundle():
    """Push bundles a/b/c to the remote, wipe the context, pull metadata only,
    then rerun with incremental_pull and check which outputs localize."""
    api.delete_context(TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)

    # Setup moto s3 resources
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)

    # Make sure bucket is empty
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'

    # Bind remote context
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL, force=True)

    # Run test pipeline
    api.apply(TEST_CONTEXT, CIP)

    # Push bundles to remote
    for bundle_name in ('a', 'b', 'c'):
        assert api.get(TEST_CONTEXT, bundle_name) is not None, 'Bundle should exist'
        api.commit(TEST_CONTEXT, bundle_name)
        api.push(TEST_CONTEXT, bundle_name)

    # Blow away context and recreate
    api.delete_context(TEST_CONTEXT)
    assert TEST_CONTEXT not in api.ls_contexts()
    api.context(context_name=TEST_CONTEXT)
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL, force=True)
    assert api.search(TEST_CONTEXT) == [], 'Context should be empty'

    # Pull bundles from remote
    api.pull(TEST_CONTEXT)

    # Make sure all bundle meta data comes down but data remains in S3
    for bundle_name in ('a', 'b', 'c'):
        bundle = api.get(TEST_CONTEXT, bundle_name)
        assert bundle is not None, 'Bundle should exist'
        assert bundle.data['file'][0].startswith('s3://'), 'Data should be in S3'

    # Rerun pipeline
    api.apply(TEST_CONTEXT, BIP, params={'n': 100}, incremental_pull=True)

    # Make sure all bundles exist. Bundles a and b should have local paths
    for bundle_name in ('a', 'b', 'c'):
        bundle = api.get(TEST_CONTEXT, bundle_name)
        assert bundle is not None, 'Bundle should exist'
        data_path = bundle.data['file'][0]
        if bundle_name in ('a', 'b'):
            assert not data_path.startswith('s3://'), 'Data should be local'
        else:
            assert data_path.startswith('s3://'), 'Data should be in S3'

    api.delete_context(TEST_CONTEXT)