def test_ABC7(run_test):
    """
    7.) Run A->B->C, Run A*->B. Run A->B->C, nothing should run

    ``B.pipe_requires`` is temporarily replaced with a version that adds a
    dependency named 'a' on ``APrime``.  The original method is restored in
    a ``finally`` clause so that a failing assertion cannot leave ``B``
    patched for whatever runs afterwards.

    Args:
        run_test: test fixture (defined outside this block)

    Returns:
        None
    """
    result = api.apply(TEST_CONTEXT, C)
    assert result['success'] is True
    assert result['did_work'] is True
    B_uuid = api.get(TEST_CONTEXT, 'B').uuid

    def custom_B_requires(self):
        self.add_dependency('a', APrime, params={})

    old_requires = B.pipe_requires
    B.pipe_requires = custom_B_requires
    try:
        result = api.apply(TEST_CONTEXT, B)
        assert result['success'] is True
        assert result['did_work'] is True
        assert B_uuid != api.get(TEST_CONTEXT, 'B').uuid  # should have a new B
    finally:
        # Always undo the monkeypatch, even when an assertion above fails.
        B.pipe_requires = old_requires

    # With the original requires back in place, re-applying C must be a no-op.
    result = api.apply(TEST_CONTEXT, C)
    assert result['success'] is True
    assert result['did_work'] is False
def test(run_test):
    """
    Exercise one task that produces a bundle and another task that requires it.

    1.) Run DataMaker, which runs PreMaker
    2.) Assert that PreMaker's bundle exists, then remove it
    3.) Run Root_1, which needs DataMaker (external dep) and PreMaker
    4.) Assert that PreMaker re-ran (new uuid) and that Root_1 produced a bundle
    """
    api.context(TEST_CONTEXT)

    # Steps 1 and 2: produce the bundles, remember PreMaker's uuid, remove it.
    api.apply(TEST_CONTEXT, DataMaker, params={'int_array': [1000, 2000, 3000]})
    pre_maker = api.get(TEST_CONTEXT, 'PreMaker')
    assert pre_maker is not None
    pm_uuid = pre_maker.uuid
    pre_maker.rm()

    # Steps 3 and 4: Root_1 must bring PreMaker back under a different uuid.
    api.apply(TEST_CONTEXT, Root_1)
    rebuilt_pre_maker = api.get(TEST_CONTEXT, 'PreMaker')
    assert rebuilt_pre_maker is not None
    assert rebuilt_pre_maker.uuid != pm_uuid

    root_bundle = api.get(TEST_CONTEXT, 'Root_1')
    assert root_bundle is not None

    api.delete_context(TEST_CONTEXT)
def test():
    """
    Purpose of this test is to have one task that produces a bundle.
    And another task that requires it.

    1.) Create external dep -- also creates PreMaker_auf_datamaker
        dsdt apply - - test_external_bundle.DataMaker --int_array '[1000,2000,3000]'

    2.) Remove Premaker_auf_datamaker
        dsdt rm PreMaker_auf_datamaker

    3.) Try to run Root -- it should find DataMaker but not re-create it or PreMaker_auf_datamaker

    The context is created here and is removed in a ``finally`` clause, so a
    failing assertion does not leave it behind.
    """
    api.context(TEST_CONTEXT)
    try:
        api.apply(TEST_CONTEXT, '-', '-', 'DataMaker', params={'int_array': '[1000,2000,3000]'})
        b = api.get(TEST_CONTEXT, 'PreMaker_auf_datamaker')
        assert b is not None
        b.rm()

        api.apply(TEST_CONTEXT, '-', '-', 'Root')
        b = api.get(TEST_CONTEXT, 'PreMaker_auf_root')
        assert b is not None
    finally:
        # Clean up regardless of the outcome of the assertions above.
        api.delete_context(TEST_CONTEXT)
def test_AB6(run_test):
    """
    6.) Run A->B, Re-run A*. Run A*->B, B should re-run.

    ``B.pipe_requires`` is temporarily replaced with a version that adds a
    dependency named 'a' on ``APrime``.  The original method is restored in
    a ``finally`` clause so that a failing assertion cannot leave ``B``
    patched for whatever runs afterwards.

    Args:
        run_test: test fixture (defined outside this block)

    Returns:
        None
    """
    result = api.apply(TEST_CONTEXT, B)
    assert result['success'] is True
    assert result['did_work'] is True
    B_uuid = api.get(TEST_CONTEXT, 'B').uuid

    result = api.apply(TEST_CONTEXT, APrime)
    assert result['success'] is True
    assert result['did_work'] is True
    APrime_uuid = api.get(TEST_CONTEXT, 'APrime').uuid

    def custom_B_requires(self):
        self.add_dependency('a', APrime, params={})

    old_requires = B.pipe_requires
    B.pipe_requires = custom_B_requires
    try:
        result = api.apply(TEST_CONTEXT, B)
        assert result['success'] is True
        assert result['did_work'] is True
        # APrime must be re-used as is, while B must have been rebuilt.
        assert APrime_uuid == api.get(TEST_CONTEXT, 'APrime').uuid
        assert B_uuid != api.get(TEST_CONTEXT, 'B').uuid
    finally:
        # Always undo the monkeypatch, even when an assertion above fails.
        B.pipe_requires = old_requires
def test(run_test):
    """
    Check that apply with force=True and with force_all=True re-runs the
    expected tasks.

    Task A is applied and bundles named 'A' and 'B' are inspected after each
    step.  force should replace only A's bundle, force_all should replace both.
    """

    def _current_uuids():
        # Look up B first, then A, and return them as (B uuid, A uuid).
        b_uuid = api.get(TEST_CONTEXT, 'B').uuid
        a_uuid = api.get(TEST_CONTEXT, 'A').uuid
        return b_uuid, a_uuid

    # NOTE: the emptiness precondition below is disabled in the original test.
    #assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    # First, a plain run.
    api.apply(TEST_CONTEXT, A, params={})
    first_B_uuid, first_A_uuid = _current_uuids()

    # Second, force a re-run of the last task only.
    api.apply(TEST_CONTEXT, A, force=True, params={})
    one_B_uuid, one_A_uuid = _current_uuids()
    assert first_B_uuid == one_B_uuid
    assert first_A_uuid != one_A_uuid

    # Third, force everything to re-run.
    api.apply(TEST_CONTEXT, A, force_all=True, params={})
    all_B_uuid, all_A_uuid = _current_uuids()
    assert all_B_uuid != one_B_uuid
    assert all_A_uuid != one_A_uuid

    # Fourth, make sure a force_all doesn't crash if there is an external bundle.
    api.apply(TEST_CONTEXT, A, force_all=True, params={'set_ext_dep': True})
    final_B_uuid, final_A_uuid = _current_uuids()
    assert final_B_uuid == all_B_uuid
    assert final_A_uuid != all_A_uuid
def bundle_inputs(self):
    """
    Given this pipe, collect the bundles that this task used as input.

    For each task returned by ``self.deps()``: an ``ExternalDepTask`` is
    resolved by uuid through ``api.get`` in this task's local context; any
    other task is resolved through its path cache entry.

    NOTE: Calls task.deps which calls task._requires which calls task.requires()

    Args:
        self (disdat.PipeTask): The pipe task in question

    Returns:
        (dict(str:`disdat.api.Bundle`)): {arg_name: bundle, ...}
    """
    resolved = {}
    for upstream in self.deps():
        if not isinstance(upstream, ExternalDepTask):
            bundle = PathCache.get_path_cache(upstream).bundle
        else:
            bundle = api.get(self.data_context.get_local_name(), None, uuid=upstream.uuid)
        # Every upstream dependency is expected to resolve to a bundle.
        assert bundle is not None
        resolved[upstream.user_arg_name] = bundle
    return resolved
def test_push(run_test):
    """
    Create a bundle locally, commit and push it, and check that the bucket
    then lists at least one object.
    """
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3', region_name='us-east-1')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    bucket = s3_resource.Bucket(TEST_BUCKET)

    # Preconditions: nothing in the bucket, nothing in the context.
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)

    _ = api.Bundle(TEST_CONTEXT, name='remote_test', data='Hello')
    pushed = api.get(TEST_CONTEXT, 'remote_test')
    assert pushed.data == 'Hello'
    pushed.commit()
    pushed.push()

    # After the push the bucket listing must contain objects.
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Empty the bucket and remove it.
    bucket.objects.all().delete()
    bucket.delete()
def test_push():
    """
    Apply 'RemoteTest', commit and push the resulting 'remote_test' bundle,
    and check that the bucket then lists at least one object.

    The context and the bucket are both created by this test.  Their removal
    is done in ``finally`` clauses so that a failing assertion does not leave
    them behind.  The bucket is only emptied once the initial
    "bucket is empty" precondition has passed.
    """
    api.context(context_name=TEST_CONTEXT)
    try:
        s3_client = boto3.client('s3')
        s3_resource = boto3.resource('s3')
        s3_resource.create_bucket(Bucket=TEST_BUCKET)
        bucket = s3_resource.Bucket(TEST_BUCKET)

        objects = s3_client.list_objects(Bucket=TEST_BUCKET)
        assert 'Contents' not in objects, 'Bucket should be empty'

        try:
            assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

            api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL, force=True)
            api.apply(TEST_CONTEXT, 'RemoteTest')
            bundle = api.get(TEST_CONTEXT, 'remote_test')
            assert bundle.data == 'Hello'
            bundle.commit()
            bundle.push()

            objects = s3_client.list_objects(Bucket=TEST_BUCKET)
            assert 'Contents' in objects, 'Bucket should not be empty'
            assert len(objects['Contents']) > 0, 'Bucket should not be empty'
        finally:
            # Empty the bucket and remove it whatever happened above.
            bucket.objects.all().delete()
            bucket.delete()
    finally:
        api.delete_context(context_name=TEST_CONTEXT)
def test_pull(run_test):
    """
    Push a bundle to the remote, recreate the local context, pull, and check
    that the pulled bundle carries the same data.
    """
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)
    bucket = s3_resource.Bucket(TEST_BUCKET)

    # Preconditions: nothing in the bucket, nothing in the context.
    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    # Produce the bundle locally and push it.
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)
    api.apply(TEST_CONTEXT, RemoteTest)
    local_bundle = api.get(TEST_CONTEXT, 'remote_test')
    assert local_bundle.data == 'Hello'
    local_bundle.commit()
    local_bundle.push()

    listing = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Drop the local context, recreate it, re-attach the remote and pull.
    api.delete_context(context_name=TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)
    api.pull(TEST_CONTEXT)

    pulled_bundles = api.search(TEST_CONTEXT)
    assert len(pulled_bundles) > 0, 'Pulled bundles down'
    assert pulled_bundles[0].data == 'Hello', 'Bundle contains correct data'

    # Empty the bucket and remove it.
    bucket.objects.all().delete()
    bucket.delete()
def manual_test_run_aws_batch(run_test, build_container_setup_only):
    """
    Incomplete test.  The container code itself needs to have its S3 access
    mocked out.  Here we are testing manually.

    Runs the pipeline through ``api.run`` with pull and push enabled, removes
    the local bundles, pulls again, and checks bundle 'A'.
    """
    # Setup moto s3 resources (disabled in the original)
    #s3_resource = boto3.resource('s3')
    #s3_resource.create_bucket(Bucket=TEST_BUCKET)

    # Add a remote.  Pull and Push!
    manual_s3_url = 's3://'
    api.remote(TEST_CONTEXT, TEST_CONTEXT, manual_s3_url)

    retval = api.run(
        SETUP_DIR,
        TEST_CONTEXT,
        PIPELINE_CLS,
        remote_context=TEST_CONTEXT,
        remote_s3_url=manual_s3_url,
        pull=True,
        push=True,
    )

    # Blow away everything and pull
    api.rm(TEST_CONTEXT, bundle_name='.*', rm_all=True)
    api.pull(TEST_CONTEXT)

    pulled = api.get(TEST_CONTEXT, 'A')
    assert pulled.data == sum(COMMON_DEFAULT_ARGS)
def deprecated_data_as_bundle_not_csv(tmpdir):
    """
    Check that adding a .txt file with ``treat_file_as_bundle=True`` raises
    an AssertionError and that no bundle is created.

    The context is created here and is removed in a ``finally`` clause, so a
    failing assertion does not leave it behind.

    Args:
        tmpdir: directory in which the temporary .txt file is written
    """
    # Create Context
    api.context(TEST_CONTEXT)
    try:
        # Create test .txt file
        test_txt_path = os.path.join(str(tmpdir), 'test.txt')
        with open(test_txt_path, 'w') as f:
            f.write('this should not create a bundle')

        # Assert the txt file exists
        assert os.path.exists(test_txt_path)

        # Try to add file to the bundle
        with pytest.raises(AssertionError) as ex:
            api.add(TEST_CONTEXT, 'bad_path', test_txt_path, treat_file_as_bundle=True)

        # Assert that the captured exception is an AssertionError
        assert ex.type == AssertionError

        # Make sure bundle does not exist
        assert api.get(TEST_CONTEXT, 'test_file_as_bundle_txt_file') is None, 'Bundle should not exist'
    finally:
        # Clean up regardless of the outcome of the assertions above.
        api.delete_context(TEST_CONTEXT)
def test_task_with_parameter():
    """
    Apply task 'B' with n=10 and then n=20.

    Bundle 'b' is expected to hold the int 20, then the int 40, and each
    apply is expected to add one bundle to the context.
    """
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'B', params={'n': 10})
    data = api.get(TEST_CONTEXT, 'b').data
    assert data == 20, 'Data did not match output'
    # Exact type check on purpose: the bundle must present a plain int.
    assert type(data) == int, 'Data is not int'
    assert len(api.search(TEST_CONTEXT)) == 1, 'One bundle should be present'

    api.apply(TEST_CONTEXT, 'B', params={'n': 20})
    data = api.get(TEST_CONTEXT, 'b').data
    assert data == 40, 'Data did not match output'
    assert type(data) == int, 'Data is not int'
    assert len(api.search(TEST_CONTEXT)) == 2, 'Two bundles should be present'
def create_remote_file_bundle(name):
    """
    Create a bundle with
        a.) an unmanaged s3 path
        b.) a managed s3 path
        c.) a managed s3 path with a directory

    This source file itself is used as the payload for all three paths.
    After the bundle is closed it is fetched again by uuid and committed, and
    the md5 of its first data entry is compared with the md5 of this file.

    Args:
        name: name given to the created bundle
    """
    s3_resource = boto3.resource('s3', region_name='us-east-1')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)

    # Copy a local file to moto s3 bucket
    saved_md5 = md5_file(__file__)
    aws_s3.put_s3_file(__file__, TEST_BUCKET_URL)
    unmanaged_path = os.path.join(TEST_BUCKET_URL, os.path.basename(__file__))

    with api.Bundle(TEST_CONTEXT, name=name) as b:
        managed_file_path = b.get_remote_file('test_s3_file.txt')
        aws_s3.cp_local_to_s3_file(__file__, managed_file_path)

        managed_dir = b.get_remote_directory('vince/klartho')
        managed_dir_file_path = os.path.join(managed_dir, 'test_s3_file.txt')
        aws_s3.cp_local_to_s3_file(__file__, managed_dir_file_path)

        b.add_data([unmanaged_path, managed_file_path, managed_dir_file_path])
        b.add_tags({'info': 'added an s3 file'})
    saved_uuid = b.uuid

    # Re-read the bundle by uuid, commit it, and verify the first file's md5.
    stored = api.get(TEST_CONTEXT, None, uuid=saved_uuid)
    stored.commit()
    md5 = md5_file(stored.data[0])
    print(md5)
    print(saved_md5)
    assert md5 == saved_md5
def test_child_task_with_parameter():
    """
    Apply task 'C' with n=10 and then n=20.

    Bundle 'c' is expected to hold the int 22, then the int 42.  The context
    is expected to hold three bundles after the first apply and five after
    the second.
    """
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'C', params={'n': 10})
    data = api.get(TEST_CONTEXT, 'c').data
    assert data == 22, 'Data did not match output'
    # Exact type check on purpose: the bundle must present a plain int.
    assert type(data) == int, 'Data is not int'
    assert len(api.search(TEST_CONTEXT)) == 3, 'Three bundles should be present'

    api.apply(TEST_CONTEXT, 'C', params={'n': 20})
    data = api.get(TEST_CONTEXT, 'c').data
    assert data == 42, 'Data did not match output'
    assert type(data) == int, 'Data is not int'
    assert len(api.search(TEST_CONTEXT)) == 5, 'Five bundles should be present'
def test_list_task():
    """
    Store the list [1, 2, 3] in a bundle and read it back.

    The test expects the data to come back as a numpy ndarray with equal
    contents, and the context to hold exactly one bundle.
    """
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='list_task', data=[1, 2, 3])
    data = api.get(TEST_CONTEXT, 'list_task').data

    assert np.array_equal(data, [1, 2, 3]), 'Data did not match output'
    # The assertion checks for an ndarray, so the message names that type.
    assert type(data) == np.ndarray, 'Data is not ndarray'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def test_float_task():
    """
    Store the float 2.5 in a bundle and check it is read back as a float
    with the same value, leaving exactly one bundle in the context.
    """
    existing = api.search(TEST_CONTEXT)
    assert len(existing) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='float_task', data=2.5)
    stored_value = api.get(TEST_CONTEXT, 'float_task').data

    assert stored_value == 2.5, 'Data did not match output'
    assert type(stored_value) is float, 'Data is not float'

    remaining = api.search(TEST_CONTEXT)
    assert len(remaining) == 1, 'Only one bundle should be present'
def test_dependant_tasks():
    """
    Apply task 'C' with default parameters.

    Bundle 'c' is expected to hold the int 6 and the context is expected to
    hold three bundles afterwards.
    """
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'C')
    data = api.get(TEST_CONTEXT, 'c').data

    assert data == 6, 'Data did not match output'
    # Exact type check on purpose: the bundle must present a plain int.
    assert type(data) == int, 'Data is not int'
    assert len(api.search(TEST_CONTEXT)) == 3, 'Three bundles should be present'
def test_single_file(tmpdir):
    """
    Add a single .csv file to a bundle, without and then with tags, and check
    that the bundle's data hashes the same as the source file.  Finally
    remove the source file and check the bundle can still be fetched.

    The context is created here and is removed in a ``finally`` clause, so a
    failing assertion does not leave it behind.

    Args:
        tmpdir: directory in which the temporary .csv file is written
    """
    # Create Context
    api.context(TEST_CONTEXT)
    try:
        # Create test .csv file
        test_csv_path = os.path.join(str(tmpdir), 'test.csv')
        df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
        df.to_csv(test_csv_path)

        # Assert the csv file exists
        assert os.path.exists(test_csv_path)

        # Add the file to the bundle
        api.add(TEST_CONTEXT, 'test_single_file', test_csv_path)

        # Retrieve the bundle
        b = api.get(TEST_CONTEXT, 'test_single_file')

        # Assert the bundles contain the same data
        bundle_hash, file_hash = get_hash(b.data), get_hash(test_csv_path)
        assert bundle_hash == file_hash, 'Hashes do not match'

        # Test with tags
        tag = {'test': 'tag'}
        api.add(TEST_CONTEXT, 'test_single_file', test_csv_path, tags=tag)

        # Retrieve the bundle
        b = api.get(TEST_CONTEXT, 'test_single_file')

        # Assert the bundles contain the same data
        bundle_hash, file_hash = get_hash(b.data), get_hash(test_csv_path)
        assert bundle_hash == file_hash, 'Hashes do not match'
        assert b.tags == tag, 'Tags do not match'

        # Remove test .csv
        os.remove(test_csv_path)

        # Assert that data still remains in the bundle
        assert api.get(TEST_CONTEXT, 'test_single_file') is not None, 'Bundle should exist'
    finally:
        # Clean up regardless of the outcome of the assertions above.
        api.delete_context(TEST_CONTEXT)
def create_bundle_from_pipeline():
    """
    Run the internal pipeline so that it writes its output to the bundle
    named EXT_BUNDLE_NAME, then return that bundle's uuid.
    """
    pipeline_params = {'test_param': EXT_TASK_PARAM_VAL}
    api.apply(
        TEST_CONTEXT,
        ExternalPipeline,
        params=pipeline_params,
        output_bundle=EXT_BUNDLE_NAME,
    )
    return api.get(TEST_CONTEXT, EXT_BUNDLE_NAME).uuid
def test_int_task():
    """
    Apply IntTask and check that bundle 'int_task' holds the int 1 and that
    the context then contains exactly one bundle.
    """
    existing = api.search(TEST_CONTEXT)
    assert len(existing) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, IntTask)
    produced = api.get(TEST_CONTEXT, 'int_task').data

    assert produced == 1, 'Data did not match output'
    assert type(produced) is int, 'Data is not int'

    remaining = api.search(TEST_CONTEXT)
    assert len(remaining) == 1, 'Only one bundle should be present'
def test_string_task():
    """
    Store the string 'output' in a bundle and check it is read back as a
    text string with the same value, leaving exactly one bundle in the context.
    """
    existing = api.search(TEST_CONTEXT)
    assert len(existing) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='string_task', data='output')
    stored_text = api.get(TEST_CONTEXT, 'string_task').data

    assert stored_text == 'output', 'Data did not match output'
    assert type(stored_text) is six.text_type, 'Data is not string'

    remaining = api.search(TEST_CONTEXT)
    assert len(remaining) == 1, 'Only one bundle should be present'
def test_dict_task():
    """
    Apply DictTask and check that bundle 'dict_task' holds the dict
    {'hello': ['world']} and that the context then contains exactly one bundle.
    """
    setup()
    existing = api.search(TEST_CONTEXT)
    assert len(existing) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, DictTask)
    produced = api.get(TEST_CONTEXT, 'dict_task').data

    assert produced == {'hello': ['world']}, 'Data did not match output'
    assert type(produced) is dict, 'Data is not dict'

    remaining = api.search(TEST_CONTEXT)
    assert len(remaining) == 1, 'Only one bundle should be present'
def test_args_bundle():
    """
    Create a bundle, store params on it, and check that the same params are
    returned when the bundle is fetched by name.
    """
    with api.Bundle(TEST_CONTEXT) as new_bundle:
        new_bundle.add_params(serialized_json_args)
        new_bundle.name = 'output'

    fetched = api.get(TEST_CONTEXT, 'output')
    assert fetched.params == serialized_json_args
def test_A2_A3(run_test):
    """
    2.) Run A, Run A, should re-use
    3.) Run A, Run A*, should re-run

    Bundle counts are compared with ``==``.  Comparing an int with ``is``
    tests object identity, which is an interpreter implementation detail.

    Args:
        run_test: test fixture (defined outside this block)
    """
    result = api.apply(TEST_CONTEXT, A)
    assert result['did_work'] is True
    first_A_uuid = api.get(TEST_CONTEXT, 'A').uuid

    # Same args again: nothing should run and the bundle should be re-used.
    result = api.apply(TEST_CONTEXT, A)
    assert result['did_work'] is False
    second_A_uuid = api.get(TEST_CONTEXT, 'A').uuid
    assert first_A_uuid == second_A_uuid
    assert len(api.search(TEST_CONTEXT, 'A')) == 1

    # Mod args, should re-run
    result = api.apply(TEST_CONTEXT, A, params={'a': 2, 'b': 3})
    assert result['did_work'] is True
    next_A_uuid = api.get(TEST_CONTEXT, 'A').uuid
    assert next_A_uuid != second_A_uuid
    assert len(api.search(TEST_CONTEXT, 'A')) == 2
def test_dict_task():
    """
    Store the dict {'hello': ['world']} in a bundle and check it is read
    back as an equal dict, leaving exactly one bundle in the context.
    """
    setup()
    existing = api.search(TEST_CONTEXT)
    assert len(existing) == 0, 'Context should be empty'

    payload = {'hello': ['world']}
    _ = api.Bundle(TEST_CONTEXT, name='dict_task', data=payload)
    loaded = api.get(TEST_CONTEXT, 'dict_task').data

    assert loaded == {'hello': ['world']}, 'Data did not match output'
    assert type(loaded) is dict, 'Data is not dict'

    remaining = api.search(TEST_CONTEXT)
    assert len(remaining) == 1, 'Only one bundle should be present'
def test_file_task():
    """
    Apply FileTask and check that bundle 'file_task' presents a str path to
    a file whose content is '5', with exactly one bundle in the context.
    """
    existing = api.search(TEST_CONTEXT)
    assert len(existing) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, FileTask)
    output_path = api.get(TEST_CONTEXT, 'file_task').data

    with open(output_path) as handle:
        content = handle.read()

    assert content == '5', 'Data did not match output'
    assert type(output_path) is str, 'Data is not path'

    remaining = api.search(TEST_CONTEXT)
    assert len(remaining) == 1, 'Only one bundle should be present'
def test_df_task():
    """
    Store a one-column DataFrame in a bundle and check it is read back as an
    equal DataFrame, leaving exactly one bundle in the context.
    """
    existing = api.search(TEST_CONTEXT)
    assert len(existing) == 0, 'Context should be empty'

    frame = pd.DataFrame()
    frame['a'] = [1, 2, 3]

    _ = api.Bundle(TEST_CONTEXT, name='df_task', data=frame)
    loaded = api.get(TEST_CONTEXT, 'df_task').data

    assert frame.equals(loaded), 'Data did not match output'
    assert type(loaded) is pd.DataFrame, 'Data is not df'

    remaining = api.search(TEST_CONTEXT)
    assert len(remaining) == 1, 'Only one bundle should be present'
def test_single_task():
    """
    Apply task 'A' twice.

    Bundle 'a' is expected to hold the int 2, and the context is expected to
    hold exactly one bundle after the first apply and still one after the
    second.
    """
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    api.apply(TEST_CONTEXT, 'A')
    data = api.get(TEST_CONTEXT, 'a').data

    assert data == 2, 'Data did not match output'
    # Exact type check on purpose: the bundle must present a plain int.
    assert type(data) == int, 'Data is not int'
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'

    # Applying again must not add another bundle.
    api.apply(TEST_CONTEXT, 'A')
    assert len(api.search(TEST_CONTEXT)) == 1, 'Only one bundle should be present'
def prepare_pipe_kwargs(self, for_run=False):
    """
    Each upstream task produces a bundle.  Prepare that bundle as input
    to the user's pipe_run function.

    When ``for_run`` is false an empty dict is returned and no instance state
    is touched.  When it is true, ``self._input_tags`` and
    ``self._input_bundle_uuids`` are reset and then filled, per argument
    name, with the tags and uuid of each upstream bundle, while the bundle's
    data becomes the keyword-argument value.  Upstream tasks without a path
    cache entry are skipped.

    Args:
        for_run (bool): prepare args for run -- at that point all upstream tasks have completed.

    Returns:
        (dict): A dictionary with the arguments.
    """
    pipe_kwargs = dict()

    if not for_run:
        return pipe_kwargs

    # Place upstream task outputs into the kwargs.  Thus the user does not call
    # self.inputs().  If they did, they would get a list of output targets for the bundle.

    # Reset the stored tags, in case this instance is run multiple times.
    self._input_tags = {}
    self._input_bundle_uuids = {}

    # Look up every upstream path cache entry first, then process the hits.
    cache_entries = [(dep.user_arg_name, PathCache.get_path_cache(dep)) for dep in self.deps()]

    for arg_name, entry in cache_entries:
        if entry is None:
            continue

        bundle = api.get(self.data_context.get_local_name(), None, uuid=entry.uuid)
        assert bundle.is_presentable

        # Download data that is not local (the linked files are not present).
        # This is the default behavior when running in a container.
        if self.incremental_pull:
            bundle.pull(localize=True)

        if entry.instance.user_arg_name in pipe_kwargs:
            _logger.warning(
                'Task human name {} reused when naming task dependencies: Dependency hyperframe shadowed'
                .format(entry.instance.user_arg_name))

        self._input_tags[arg_name] = bundle.tags
        self._input_bundle_uuids[arg_name] = entry.uuid
        pipe_kwargs[arg_name] = bundle.data

    return pipe_kwargs
def test_luigi_args(run_test):
    """
    Create a task, store args, retrieve from bundle api.

    Python objects are passed in as the values for the Luigi parameters.
    Each value found in the bundle's params is parsed back with the ``parse``
    method of the ArgTask attribute of the same name, and the parsed result
    must equal the objects that were passed in.
    """
    api.apply(
        TEST_CONTEXT,
        ArgTask,
        output_bundle='output',
        params=test_luigi_args_data,
    )
    stored = api.get(TEST_CONTEXT, 'output')

    # Parse every stored value with the matching parameter declared on ArgTask.
    parsed_params = {
        key: getattr(ArgTask, key).parse(raw)
        for key, raw in stored.params.items()
    }

    assert parsed_params == test_luigi_args_data