def _read_schema_from_pipeline_root(self, pipeline_name, pipeline_root):
  """Copies the newest SchemaGen schema into the working dir and prints it.

  Exits the process through `sys.exit` when `pipeline_root` does not exist or
  when it has no 'SchemaGen' entry.

  Args:
    pipeline_name: Pipeline name; upper-cased for the printed banner.
    pipeline_root: Directory holding the per-component pipeline outputs.
  """
  # A missing pipeline root means that no run has been created yet, or that
  # the pipeline is still running for the first time.
  if not fileio.exists(pipeline_root):
    sys.exit(
        'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
    )

  # The root exists; SchemaGen must have produced an output directory.
  if 'SchemaGen' not in fileio.listdir(pipeline_root):
    sys.exit(
        'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
    )

  # Folder names under 'schema' are parsed as integers; the largest one is
  # taken as the latest output.
  schemagen_dir = os.path.join(pipeline_root, 'SchemaGen')
  execution_ids = fileio.listdir(os.path.join(schemagen_dir, 'schema'))
  newest_id = int(max(execution_ids, key=int))

  # Copy the schema next to wherever the command was invoked.
  newest_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
      schemagen_dir, 'schema', newest_id)
  source_path = os.path.join(newest_uri, 'schema.pbtxt')
  local_copy = os.path.join(os.getcwd(), 'schema.pbtxt')
  io_utils.copy_file(source_path, local_copy, overwrite=True)

  # Report where the copy lives, then dump its content.
  click.echo('Path to schema: {}'.format(local_copy))
  click.echo('*********SCHEMA FOR {}**********'.format(pipeline_name.upper()))
  with open(local_copy, 'r') as schema_file:
    click.echo(schema_file.read())
def testTrainerFn(self):
  """Trains and evaluates the BQML taxi `trainer_fn` for one step each.

  Checks the types of the entries in the returned training spec, then checks
  that the serving export and the eval saved model are non-empty and that the
  serving graph can be loaded.
  """
  output_root = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  testdata = self._testdata_path
  schema_path = os.path.join(testdata, 'schema_gen/schema.pbtxt')

  data_accessor = DataAccessor(
      tf_dataset_factory=tfxio_utils.get_tf_dataset_factory_from_artifact(
          [standard_artifacts.Examples()], []),
      record_batch_factory=None,
      data_view_decode_fn=None)
  fn_args = trainer_executor.TrainerFnArgs(
      train_files=os.path.join(
          testdata, 'transform/transformed_examples/train/*.gz'),
      transform_output=os.path.join(testdata, 'transform/transform_output/'),
      serving_model_dir=os.path.join(output_root, 'serving_model_dir'),
      eval_files=os.path.join(
          testdata, 'transform/transformed_examples/eval/*.gz'),
      schema_file=schema_path,
      train_steps=1,
      eval_steps=1,
      base_model=os.path.join(testdata, 'trainer/current/serving_model_dir'),
      data_accessor=data_accessor)

  schema = io_utils.parse_pbtxt_file(schema_path, schema_pb2.Schema())
  spec = taxi_utils_bqml.trainer_fn(fn_args, schema)

  estimator = spec['estimator']
  train_spec = spec['train_spec']
  eval_spec = spec['eval_spec']
  receiver_fn = spec['eval_input_receiver_fn']

  self.assertIsInstance(estimator, tf.estimator.Estimator)
  self.assertIsInstance(train_spec, tf.estimator.TrainSpec)
  self.assertIsInstance(eval_spec, tf.estimator.EvalSpec)
  self.assertIsInstance(receiver_fn, types.FunctionType)

  # One training step followed by one evaluation step.
  eval_result, exports = tf.estimator.train_and_evaluate(
      estimator, train_spec, eval_spec)
  self.assertGreater(eval_result['loss'], 0.0)
  self.assertEqual(len(exports), 1)
  serving_export = exports[0]
  self.assertGreaterEqual(len(fileio.listdir(serving_export)), 1)

  # Export the eval saved model and make sure something was written.
  eval_export = tfma.export.export_eval_savedmodel(
      estimator=estimator,
      export_dir_base=path_utils.eval_model_dir(output_root),
      eval_input_receiver_fn=receiver_fn)
  self.assertGreaterEqual(len(fileio.listdir(eval_export)), 1)

  # The exported serving graph must load into a fresh session.
  with tf.compat.v1.Session() as sess:
    metagraph_def = tf.compat.v1.saved_model.loader.load(
        sess, [tf.saved_model.SERVING], serving_export)
    self.assertIsInstance(metagraph_def, tf.compat.v1.MetaGraphDef)
def assertExecutedOnce(self, component: Text) -> None:
  """Asserts each output directory of `component` holds exactly one entry."""
  component_dir = os.path.join(self._pipeline_root, component)
  self.assertTrue(fileio.exists(component_dir))
  for output_name in fileio.listdir(component_dir):
    executions = fileio.listdir(os.path.join(component_dir, output_name))
    self.assertEqual(1, len(executions))
def _generate_blessing_result(self, eval_examples_uri: Text,
                              slice_spec: List[tfma.slicer.SingleSliceSpec],
                              current_model_dir: Text,
                              blessed_model_dir: Text) -> bool:
  """Evaluates the current (and blessed) model and decides on the blessing.

  Args:
    eval_examples_uri: Location of the TFRecord examples used for evaluation.
    slice_spec: Slice specs forwarded to the TFMA evaluation.
    current_model_dir: Directory of the model under validation.
    blessed_model_dir: Directory of the previously blessed model, or None when
      there is none yet.

  Returns:
    False when the current model fails the threshold check or compares worse
    than the blessed model; True otherwise.
  """
  current_result_path = os.path.join(
      self._temp_path, constants.CURRENT_MODEL_EVAL_RESULT_PATH)
  blessed_result_path = os.path.join(
      self._temp_path, constants.BLESSED_MODEL_EVAL_RESULT_PATH)

  with self._make_beam_pipeline() as pipeline:
    eval_data = (
        pipeline | 'ReadData' >> beam.io.ReadFromTFRecord(
            file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

    def _evaluate(label, model_dir, output_path):
      # Adds one evaluate-and-write step reading from the shared examples.
      shared_model = tfma.default_eval_shared_model(
          eval_saved_model_path=path_utils.eval_model_path(model_dir))
      (eval_data | label >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
          eval_shared_model=shared_model,
          slice_spec=slice_spec,
          output_path=output_path))

    _evaluate('EvalCurrentModel', current_model_dir, current_result_path)
    if blessed_model_dir is not None:
      _evaluate('EvalBlessedModel', blessed_model_dir, blessed_result_path)

  absl.logging.info('all files in current_model_eval_result_path: [%s]',
                    str(fileio.listdir(current_result_path)))
  current_result = tfma.load_eval_result(output_path=current_result_path)

  if not self._pass_threshold(current_result):
    absl.logging.info('Current model does not pass threshold.')
    return False
  absl.logging.info('Current model passes threshold.')

  # With nothing to compare against, passing the threshold is sufficient.
  if blessed_model_dir is None:
    absl.logging.info('No blessed model yet.')
    return True

  absl.logging.info('all files in blessed_model_eval_result: [%s]',
                    str(fileio.listdir(blessed_result_path)))
  blessed_result = tfma.load_eval_result(output_path=blessed_result_path)

  if self._compare_eval_result(current_result, blessed_result):
    absl.logging.info('Current model better than blessed model.')
    return True
  absl.logging.info('Current model worse than blessed model.')
  return False
def setUp(self):
  """Stages an Airflow home in a temp dir with the taxi example data."""
  super().setUp()

  # Installed package names; the E2E tests cannot run without Apache Airflow.
  self._pip_list = pip_utils.get_package_names()
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Click needs a non-ASCII encoding; switch LANG when the preferred encoding
  # of the environment resolves to ASCII.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Both AIRFLOW_HOME and HOME are redirected to the temp airflow directory.
  self._airflow_home = os.path.join(self.tmp_dir, 'airflow')
  for env_name in ('AIRFLOW_HOME', 'HOME'):
    self.enter_context(
        test_case_utils.override_env_var(env_name, self._airflow_home))
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Test data and the pipeline definition used by the tests.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  self._pipeline_name = 'chicago_taxi_simple'
  self._pipeline_path = os.path.join(self._testdata_dir,
                                     'test_pipeline_airflow_1.py')

  # The example lives four directory levels above this file.
  base_dir = os.path.abspath(__file__)
  for _ in range(4):
    base_dir = os.path.dirname(base_dir)
  taxi_dir = os.path.join(base_dir, 'examples', 'chicago_taxi_pipeline')

  # Copy the example data and verify both ends of the copy.
  data_dir = os.path.join(taxi_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(taxi_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # CLI runner used to invoke commands in the tests.
  self.runner = click_testing.CliRunner()
def _assertNumberOfTrainerOutputIsOne(self, pipeline_name):
  """Asserts Trainer ran once and wrote one serving and one eval model."""
  # Exactly one entry under Trainer/model means exactly one execution.
  model_base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Trainer', 'model')
  executions = fileio.listdir(model_base_dir)
  self.assertEqual(1, len(executions))

  # Each of the eval and serving directories holds one 'saved_model.pb'.
  model_uri = os.path.join(model_base_dir, executions[0])
  eval_dir = path_utils.eval_model_dir(model_uri)
  serving_dir = path_utils.serving_model_dir(model_uri)
  eval_files = fileio.listdir(eval_dir)
  self.assertEqual(1, eval_files.count('saved_model.pb'))
  serving_files = fileio.listdir(serving_dir)
  self.assertEqual(1, serving_files.count('saved_model.pb'))
def testRecordBeamPipelineRunId(self, mock_metadata, mock_config):
  """Records Beam pipeline outputs for an explicit run_id."""
  execution_dict_patch = mock.patch.object(
      pipeline_recorder_utils, '_get_execution_dict',
      return_value=self.execution_dict)
  paths_patch = mock.patch.object(
      pipeline_recorder_utils, '_get_paths', return_value=self.paths)

  with execution_dict_patch as mock_get_execution_dict:
    with paths_patch as mock_get_paths:
      pipeline_recorder_utils.record_pipeline(
          output_dir=self._base_dir,
          metadata_db_uri=self.metadata_db_uri,
          run_id=self.run_id)

      mock_config.assert_called_with(self.metadata_db_uri)
      mock_metadata.assert_called()
      mock_get_execution_dict.assert_called()
      mock_get_paths.assert_called()

      # The single file must have been copied from src_uri to dest_uri.
      copied = fileio.listdir(self.dest_uri)
      self.assertLen(copied, 1)
      copied_path = os.path.join(self.dest_uri, copied[0])
      self.assertEqual(io_utils.read_string_file(copied_path), self.content)
def testDo(self):
  """Runs the executor on test statistics and expects a non-empty output."""
  testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Input: statistics with three splits.
  statistics = standard_artifacts.ExampleStatistics()
  statistics.uri = os.path.join(testdata_dir, 'statistics_gen')
  statistics.split_names = artifact_utils.encode_split_names(
      ['train', 'eval', 'test'])

  # Output: a schema artifact under the per-test output directory.
  output_root = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(output_root, 'schema_output')

  inputs = {standard_component_specs.STATISTICS_KEY: [statistics]}
  outputs = {standard_component_specs.SCHEMA_KEY: [schema]}
  # List values are serialized before being handed to Do().
  properties = {
      standard_component_specs.EXCLUDE_SPLITS_KEY: json_utils.dumps(['test'])
  }

  executor.Executor().Do(inputs, outputs, properties)
  written = fileio.listdir(schema.uri)
  self.assertNotEqual(0, len(written))
def clear_output_dirs(output_dict: Dict[str, List[types.Artifact]]) -> None:
  """Empties every non-empty output artifact directory.

  A URI that is a directory with at least one entry is removed recursively
  and then re-created; anything else is left untouched.

  Args:
    output_dict: Mapping from output key to the artifacts under that key.
  """
  for artifacts in output_dict.values():
    for artifact in artifacts:
      if not fileio.isdir(artifact.uri) or not fileio.listdir(artifact.uri):
        continue
      fileio.rmtree(artifact.uri)
      fileio.mkdir(artifact.uri)
def _cleanup_kfp_server(self):
  """Deletes KFP resources for each pipeline directory in the Kubeflow home.

  Every directory directly under `self._kubeflow_home` is treated as a
  pipeline name; its experiment, pipeline, pipeline output and pipeline
  metadata are deleted in that order.
  """
  for pipeline_name in fileio.listdir(self._kubeflow_home):
    # listdir() yields bare entry names, so the directory check must use the
    # full path. A bare name would be resolved against the current working
    # directory instead of the Kubeflow home.
    if not fileio.isdir(os.path.join(self._kubeflow_home, pipeline_name)):
      continue
    self._delete_experiment(pipeline_name)
    self._delete_pipeline(pipeline_name)
    self._delete_pipeline_output(pipeline_name)
    self._delete_pipeline_metadata(pipeline_name)
def assertExecutedOnce(self, component: Text) -> None:
  """Asserts `component` has exactly one executor execution directory."""
  component_dir = os.path.join(self._pipeline_root, component)
  self.assertTrue(fileio.exists(component_dir))
  executions = fileio.listdir(
      os.path.join(component_dir, '.system', 'executor_execution'))
  self.assertLen(executions, 1)
def get_only_uri_in_dir(dir_path: Text) -> Text:
  """Returns the path of the single entry found in `dir_path`.

  Args:
    dir_path: Directory expected to contain exactly one entry.

  Returns:
    `dir_path` joined with the entry name, without a trailing separator.

  Raises:
    RuntimeError: if the directory does not contain exactly one entry.
  """
  entries = fileio.listdir(dir_path)
  if len(entries) != 1:
    raise RuntimeError(
        'Only one file per dir is supported: {}.'.format(dir_path))
  # Joining with '' guarantees a trailing separator and dirname() drops it, so
  # the entry name comes out the same whether or not it ended in a separator.
  with_separator = os.path.join(entries[0], '')
  entry_name = os.path.dirname(with_separator)
  return os.path.join(dir_path, entry_name)
def assertExecutedOnce(self, component: Text) -> None:
  """Asserts every output of `component`, '.system' included, has one entry.

  The children of '.system' are checked individually in place of '.system'
  itself, and 'executor_execution' must be one of them.
  """
  component_dir = os.path.join(self._pipeline_root, component)
  self.assertTrue(fileio.exists(component_dir))

  entries = fileio.listdir(component_dir)
  self.assertIn('.system', entries)
  entries.remove('.system')

  system_dir = os.path.join(component_dir, '.system')
  system_entries = []
  for name in fileio.listdir(system_dir):
    system_entries.append(os.path.join('.system', name))
  self.assertNotEmpty(system_entries)
  self.assertIn('.system/executor_execution', system_entries)

  for relative_path in entries + system_entries:
    executions = fileio.listdir(os.path.join(component_dir, relative_path))
    self.assertLen(executions, 1)
def extractDirectorySpec(self, path):
  """Recursively captures a directory tree as nested dicts of file contents.

  Args:
    path: File or directory to describe.

  Returns:
    For a directory, a dict mapping each entry name to the spec of that entry.
    For a file, its content as read in text mode.

  Raises:
    ValueError: if `path` does not exist.
  """
  if fileio.isdir(path):
    return {
        name: self.extractDirectorySpec(os.path.join(path, name))
        for name in fileio.listdir(path)
    }
  if fileio.exists(path):
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left for garbage collection.
    with file_io.FileIO(path, mode='r') as f:
      return f.read()
  raise ValueError(f'{path} does not exist.')
def setUp(self):
  """Redirects HOME/LOCAL_HOME to a scratch dir and stages the taxi example."""
  super(CliLocalEndToEndTest, self).setUp()

  # Click needs a non-ASCII encoding; switch LANG when the preferred encoding
  # of the environment resolves to ASCII.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Home directory for this test. The previous HOME / LOCAL_HOME values are
  # kept on the instance (presumably restored in tearDown, not visible here).
  scratch_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                tempfile.mkdtemp())
  self._home = os.path.join(scratch_root, self._testMethodName)
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._home
  self._old_local_home = os.environ.get('LOCAL_HOME')
  self._local_home = os.path.join(self._home, 'local', '')
  os.environ['LOCAL_HOME'] = self._local_home

  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # The example lives four directory levels above this file.
  base_dir = os.path.abspath(__file__)
  for _ in range(4):
    base_dir = os.path.dirname(base_dir)
  taxi_dir = os.path.join(base_dir, 'examples', 'chicago_taxi_pipeline', '')

  # Copy the example data and verify both ends of the copy.
  data_dir = os.path.join(taxi_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(taxi_dir, 'taxi_utils.py'),
      os.path.join(self._home, 'taxi', 'taxi_utils.py'))

  # CLI runner used to invoke commands in the tests.
  self.runner = click_testing.CliRunner()
def assertInfraValidatorPassed(self) -> None:
  """Asserts every InfraValidator blessing output has an INFRA_BLESSED file."""
  component_dir = os.path.join(self._pipeline_root, 'InfraValidator')
  execution_ids = fileio.listdir(os.path.join(component_dir, 'blessing'))
  self.assertGreaterEqual(len(execution_ids), 1)
  for execution_id in execution_ids:
    blessing_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
        component_dir, 'blessing', execution_id)
    marker = os.path.join(blessing_uri, 'INFRA_BLESSED')
    self.assertTrue(fileio.exists(marker))
def list_pipelines(self) -> None:
  """Prints the name of every entry in the handler home directory."""
  home_dir = self._handler_home_dir
  if fileio.exists(home_dir):
    names = fileio.listdir(home_dir)
    separator = '-' * 30
    # One pipeline name per line, framed by separator lines.
    click.echo(separator)
    click.echo('\n'.join(names))
    click.echo(separator)
  else:
    click.echo('No pipelines to display.')
def setUp(self):
  """Redirects HOME/BEAM_HOME to the temp dir and stages the taxi example."""
  super().setUp()

  # Click needs a non-ASCII encoding; switch LANG when the preferred encoding
  # of the environment resolves to ASCII.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # BEAM_HOME sits inside the temporary home directory.
  self._home = self.tmp_dir
  self._beam_home = os.path.join(self._home, 'beam')
  for env_name, env_value in (('BEAM_HOME', self._beam_home),
                              ('HOME', self._home)):
    self.enter_context(test_case_utils.override_env_var(env_name, env_value))

  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # The example lives four directory levels above this file.
  base_dir = os.path.abspath(__file__)
  for _ in range(4):
    base_dir = os.path.dirname(base_dir)
  taxi_dir = os.path.join(base_dir, 'examples', 'chicago_taxi_pipeline', '')

  # Copy the example data and verify both ends of the copy.
  data_dir = os.path.join(taxi_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(taxi_dir, 'taxi_utils.py'),
      os.path.join(self._home, 'taxi', 'taxi_utils.py'))

  # CLI runner used to invoke commands in the tests.
  self.runner = click_testing.CliRunner()
def build_ephemeral_package() -> Text:
  """Repackage current installation of TFX into a tfx_ephemeral sdist.

  The installed 'tfx' directory is copied into a fresh temp directory, a
  setup.py is generated next to it and `setup.py sdist` is run there.

  Returns:
    Path to ephemeral sdist package.

  Raises:
    RuntimeError: if no directory named 'tfx' is part of this file's path, if
      the sdist command exits with a non-zero status, or if the dist directory
      has zero or multiple files.
  """
  tmp_dir = os.path.join(tempfile.mkdtemp(), 'build', 'tfx')

  # Find the last directory named 'tfx' in this file's path and package it.
  path_split = __file__.split(os.path.sep)
  tfx_indices = [i for i, part in enumerate(path_split) if part == 'tfx']
  if not tfx_indices:
    raise RuntimeError('Cannot locate directory \'tfx\' in the path %s' %
                       __file__)
  tfx_root_dir = os.path.sep.join(path_split[:tfx_indices[-1] + 1])

  absl.logging.info('Copying all content from install dir %s to temp dir %s',
                    tfx_root_dir, tmp_dir)
  shutil.copytree(tfx_root_dir, os.path.join(tmp_dir, 'tfx'))
  # Source directory default permission is 0555 but we need to be able to
  # create new setup.py file.
  os.chmod(tmp_dir, 0o720)

  setup_file = os.path.join(tmp_dir, 'setup.py')
  absl.logging.info('Generating a temp setup file at %s', setup_file)
  install_requires = dependencies.make_required_install_packages()
  io_utils.write_string_file(
      setup_file,
      _ephemeral_setup_file.format(
          version=version.__version__, install_requires=install_requires))

  # Create the package. The child process gets tmp_dir as its working
  # directory through `cwd`, so this process never changes its own working
  # directory and has nothing to restore if the call raises.
  temp_log = os.path.join(tmp_dir, 'setup.log')
  with open(temp_log, 'w') as f:
    absl.logging.info(
        'Creating temporary sdist package, logs available at %s', temp_log)
    cmd = [sys.executable, setup_file, 'sdist']
    exit_code = subprocess.call(cmd, stdout=f, stderr=f, cwd=tmp_dir)
  if exit_code != 0:
    raise RuntimeError('Building sdist package failed with exit code %d, '
                       'see %s' % (exit_code, temp_log))

  # Return the package dir+filename.
  dist_dir = os.path.join(tmp_dir, 'dist')
  files = fileio.listdir(dist_dir)
  if not files:
    raise RuntimeError('Found no package files in %s' % dist_dir)
  if len(files) > 1:
    raise RuntimeError('Found multiple package files in %s' % dist_dir)
  return os.path.join(dist_dir, files[0])
def assertPushed(self):
  """Asserts a model was pushed and its file count matches the input model."""
  serving_dir = self._serving_model_dir
  self.assertGreater(self._GetNumberOfFiles(serving_dir), 0)

  # The first entry of the serving directory is taken as the pushed model.
  first_entry = fileio.listdir(serving_dir)[0]
  pushed_path = os.path.join(serving_dir, first_entry)
  self.assertGreater(self._GetNumberOfFiles(pushed_path), 0)

  model_path = self._executor.GetModelPath(self._input_dict)
  self.assertEqual(
      self._GetNumberOfFiles(pushed_path),
      self._GetNumberOfFiles(model_path))
  self.assertEqual(
      self._GetNumberOfFiles(self._model_push.uri),
      self._GetNumberOfFiles(model_path))
  self.assertEqual(1, self._model_push.get_int_custom_property('pushed'))
def _assertHyperparametersAreWritten(self, pipeline_name):
  """Asserts Tuner ran once and its best-hyperparameters output exists."""
  # Exactly one entry under Tuner/best_hyperparameters means one execution.
  output_base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Tuner', 'best_hyperparameters')
  executions = fileio.listdir(output_base_dir)
  self.assertEqual(1, len(executions))

  # That single output must exist.
  hyperparameters_uri = os.path.join(output_base_dir, executions[0])
  self.assertTrue(fileio.exists(hyperparameters_uri))
def list_pipelines(self) -> None:
  """Prints the name of every entry in the handler home directory."""
  # Pipeline packages have no managed storage, so the local handler home
  # directory is what gets listed.
  home_dir = self._handler_home_dir
  if fileio.exists(home_dir):
    names = fileio.listdir(home_dir)
    separator = '-' * 30
    # One pipeline name per line, framed by separator lines.
    click.echo(separator)
    click.echo('\n'.join(names))
    click.echo(separator)
  else:
    click.echo('No pipelines to display.')
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Copies every file of each input split to the matching output split.

  This example executor does nothing beyond the copy; a real custom component
  would place its functionality here. It reads and writes Examples, while
  other components may use other artifact types.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - input_data: A list of type `standard_artifacts.Examples` which will
        often contain two splits, 'train' and 'eval'.
    output_dict: Output dict from key to a list of artifacts, including:
      - output_data: A list of type `standard_artifacts.Examples` which will
        usually contain the same splits as input_data.
    exec_properties: A dict of execution properties, including:
      - name: Optional unique name. Necessary iff multiple Hello components
        are declared in the same pipeline.

  Returns:
    None

  Raises:
    OSError and its subclasses
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  source = artifact_utils.get_single_instance(input_dict['input_data'])
  target = artifact_utils.get_single_instance(output_dict['output_data'])
  # The output advertises the same splits as the input.
  target.split_names = source.split_names

  # Resolve all input split locations before any file is copied.
  input_dirs = {
      split: artifact_utils.get_split_uri([source], split)
      for split in json.loads(source.split_names)
  }

  for split, input_dir in input_dirs.items():
    output_dir = artifact_utils.get_split_uri([target], split)
    for filename in fileio.listdir(input_dir):
      io_utils.copy_file(
          src=os.path.join(input_dir, filename),
          dst=os.path.join(output_dir, filename),
          overwrite=True)
def testRecordLatestKfpPipeline(self, mock_get_latest_executions):
  """Records KFP pipeline outputs for the latest execution."""
  paths_patch = mock.patch.object(
      pipeline_recorder_utils, '_get_paths', return_value=self.paths)
  with paths_patch as mock_get_paths:
    pipeline_recorder_utils.record_pipeline(
        output_dir=self._base_dir,
        host=self.host,
        port=self.port,
        pipeline_name=self.pipeline_name)
    mock_get_paths.assert_called()
    mock_get_latest_executions.assert_called()

    # Exactly one file is expected at the destination, with the same content.
    copied = fileio.listdir(self.dest_uri)
    self.assertLen(copied, 1)
    copied_path = os.path.join(self.dest_uri, copied[0])
    self.assertEqual(io_utils.read_string_file(copied_path), self.content)
def testMakeClearAndRemoveOutputDirs(self):
  """Exercises make, clear and remove of output dirs in sequence."""
  output_artifacts = self._output_resolver().generate_output_artifacts(1)

  # After make: value artifacts are not directories; every other artifact is
  # a directory, which gets a dummy file so that clearing has work to do.
  outputs_utils.make_output_dirs(output_artifacts)
  for artifacts in output_artifacts.values():
    for artifact in artifacts:
      if isinstance(artifact, ValueArtifact):
        self.assertFalse(fileio.isdir(artifact.uri))
      else:
        self.assertTrue(fileio.isdir(artifact.uri))
        dummy_path = os.path.join(artifact.uri, 'output')
        with fileio.open(dummy_path, 'w') as f:
          f.write('')
      self.assertTrue(fileio.exists(artifact.uri))

  # After clear: directories are still listable but empty.
  outputs_utils.clear_output_dirs(output_artifacts)
  for artifacts in output_artifacts.values():
    for artifact in artifacts:
      if isinstance(artifact, ValueArtifact):
        continue
      self.assertEqual(fileio.listdir(artifact.uri), [])

  # After remove: nothing exists at any artifact URI.
  outputs_utils.remove_output_dirs(output_artifacts)
  for artifacts in output_artifacts.values():
    for artifact in artifacts:
      self.assertFalse(fileio.exists(artifact.uri))
def assertDirectoryNotEmpty(self, path):
  """Asserts that listing `path` yields at least one entry."""
  entries = fileio.listdir(path)
  self.assertGreater(len(entries), 0)
def setUp(self):
  """Prepares an Airflow home, a MySQL container and the taxi example DAG."""
  super(AirflowEndToEndTest, self).setUp()

  # Airflow home in a scratch directory. The previous AIRFLOW_HOME / HOME
  # values are kept on the instance (presumably restored in tearDown, not
  # visible here).
  scratch_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                tempfile.mkdtemp())
  self._airflow_home = os.path.join(scratch_root, self._testMethodName)
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Database container, removed again through a registered cleanup.
  self._mysql_container_name = (
      'airflow_' + test_utils.generate_random_id())
  db_port = airflow_test_utils.create_mysql_container(
      self._mysql_container_name)
  self.addCleanup(airflow_test_utils.delete_mysql_container,
                  self._mysql_container_name)
  # NOTE(review): the user/host part of this URL ("[email protected]") looks
  # mangled; confirm the intended value against the original source.
  sql_alchemy_conn = 'mysql://[email protected]:%d/airflow' % db_port

  # See https://airflow.apache.org/howto/set-config.html for details on
  # configuring Airflow through environment variables.
  os.environ.update({
      'AIRFLOW__CORE__SQL_ALCHEMY_CONN': sql_alchemy_conn,
      'AIRFLOW__CORE__DAGS_FOLDER': os.path.join(self._airflow_home, 'dags'),
      'AIRFLOW__CORE__BASE_LOG_FOLDER': os.path.join(self._airflow_home,
                                                     'logs'),
      # Do not load examples to make this a bit faster.
      'AIRFLOW__CORE__LOAD_EXAMPLES': 'False',
      # The following settings make the scheduler process dags faster.
      'AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC': '1',
      'AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC': '1',
      'AIRFLOW__SCHEDULER__RUN_DURATION': '-1',
      'AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL': '1',
      'AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL': '30',
  })

  # Fields specific to the chicago_taxi_simple example.
  self._dag_id = 'chicago_taxi_simple'
  self._run_id = 'manual_run_id_1'
  # This execution date must be after the start_date in chicago_taxi_simple
  # but before current execution date.
  self._execution_date = '2019-02-01T01:01:01'
  self._all_tasks = [
      'CsvExampleGen',
      'Evaluator',
      'ExampleValidator',
      'Pusher',
      'SchemaGen',
      'StatisticsGen',
      'Trainer',
      'Transform',
  ]

  # Copy the dag file.
  taxi_dir = os.path.dirname(__file__)
  io_utils.copy_file(
      os.path.join(taxi_dir, 'taxi_pipeline_simple.py'),
      os.path.join(self._airflow_home, 'dags', 'taxi_pipeline_simple.py'))

  # Copy the example data and verify both ends of the copy.
  data_dir = os.path.join(taxi_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(taxi_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize the database, then unpause the DAG.
  for airflow_args in (['initdb'], ['unpause', self._dag_id]):
    subprocess.run(['airflow'] + airflow_args, check=True)
def setUp(self):
  """Prepares an Airflow home, the taxi example and a MySQL container."""
  super(CliAirflowEndToEndTest, self).setUp()

  # Stringified `pip freeze --local` output; the E2E tests cannot run without
  # Apache Airflow.
  freeze_output = subprocess.check_output(['pip', 'freeze', '--local'])
  self._pip_list = str(freeze_output)
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Click needs a non-ASCII encoding; switch LANG when the preferred encoding
  # of the environment resolves to ASCII.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Airflow home in a scratch directory. The previous AIRFLOW_HOME / HOME
  # values are kept on the instance (presumably restored in tearDown, not
  # visible here).
  scratch_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                self.get_temp_dir())
  self._airflow_home = os.path.join(scratch_root, self._testMethodName,
                                    'airflow')
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Test data and the pipeline definition used by the tests.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  self._pipeline_name = 'chicago_taxi_simple'
  self._pipeline_path = os.path.join(self._testdata_dir,
                                     'test_pipeline_airflow_1.py')

  # The example lives four directory levels above this file.
  base_dir = os.path.abspath(__file__)
  for _ in range(4):
    base_dir = os.path.dirname(base_dir)
  taxi_dir = os.path.join(base_dir, 'examples', 'chicago_taxi_pipeline')

  # Copy the example data and verify both ends of the copy.
  data_dir = os.path.join(taxi_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(taxi_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Database container, removed again through a registered cleanup.
  self._mysql_container_name = (
      'airflow_' + test_utils.generate_random_id())
  db_port = airflow_test_utils.create_mysql_container(
      self._mysql_container_name)
  self.addCleanup(self._cleanup_mysql_container)
  # NOTE(review): the user/host part of this URL ("[email protected]") looks
  # mangled; confirm the intended value against the original source.
  os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
      'mysql://[email protected]:%d/airflow' % db_port)
  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

  self._airflow_initdb()

  # CLI runner used to invoke commands in the tests.
  self.runner = click_testing.CliRunner()
def _verify_transform_outputs(self,
                              materialize=True,
                              store_cache=True,
                              multiple_example_inputs=False,
                              compute_statistics=False):
  """Checks the Transform output directory against the given flags.

  Args:
    materialize: Whether transformed examples are expected in the output.
    store_cache: Whether a non-empty analyzer cache is expected.
    multiple_example_inputs: Whether to check all example artifacts rather
      than only the first one.
    compute_statistics: Whether pre- and post-transform statistics are
      expected to exist.
  """
  expected_outputs = ['transformed_graph']

  if store_cache:
    expected_outputs.append('CACHE')
    cache_entries = fileio.listdir(self._updated_analyzer_cache_artifact.uri)
    self.assertNotEqual(0, len(cache_entries))

  if multiple_example_inputs:
    example_artifacts = self._example_artifacts
    transformed_example_artifacts = self._transformed_example_artifacts
  else:
    example_artifacts = self._example_artifacts[:1]
    transformed_example_artifacts = self._transformed_example_artifacts[:1]

  if materialize:
    expected_outputs.append('transformed_examples')
    assert len(example_artifacts) == len(transformed_example_artifacts)

    def _split_files(artifact, split_dir):
      # All files directly under one split directory of an artifact.
      return fileio.glob(os.path.join(artifact.uri, split_dir, '*'))

    for raw, transformed in zip(example_artifacts,
                                transformed_example_artifacts):
      raw_train_files = _split_files(raw, 'Split-train')
      transformed_train_files = _split_files(transformed, 'Split-train')
      self.assertGreater(len(transformed_train_files), 0)

      raw_eval_files = _split_files(raw, 'Split-eval')
      transformed_eval_files = _split_files(transformed, 'Split-eval')
      self.assertGreater(len(transformed_eval_files), 0)

      # Count the records of each split on both sides.
      raw_train_count = _get_dataset_size(raw_train_files)
      transformed_train_count = _get_dataset_size(transformed_train_files)
      raw_eval_count = _get_dataset_size(raw_eval_files)
      transformed_eval_count = _get_dataset_size(transformed_eval_files)

      # Each split must hold as many records in the output artifact as in the
      # input artifact (i.e. the 1-to-1 mapping is preserved).
      self.assertEqual(raw_train_count, transformed_train_count)
      self.assertEqual(raw_eval_count, transformed_eval_count)
      self.assertGreater(transformed_train_count, transformed_eval_count)

  transform_output_uri = self._transformed_output.uri
  pre_stats_path = os.path.join(
      transform_output_uri,
      tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)
  post_stats_path = os.path.join(
      transform_output_uri,
      tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)
  if compute_statistics:
    self.assertTrue(fileio.exists(pre_stats_path))
    self.assertTrue(fileio.exists(post_stats_path))
  else:
    self.assertFalse(fileio.exists(pre_stats_path))
    self.assertFalse(fileio.exists(post_stats_path))

  # The output directory must contain exactly the expected entries: an output
  # whose flag (`materialize` / `store_cache`) is False must be absent.
  self.assertCountEqual(expected_outputs,
                        fileio.listdir(self._output_data_dir))

  saved_model_path = os.path.join(
      self._transformed_output.uri,
      tft.TFTransformOutput.TRANSFORM_FN_DIR,
      tf.saved_model.SAVED_MODEL_FILENAME_PB)
  self.assertTrue(fileio.exists(saved_model_path))
def assertDirectoryEmpty(self, path):
  """Asserts that listing `path` yields no entries."""
  entries = fileio.listdir(path)
  self.assertEqual(len(entries), 0)