Example #1
    def _read_schema_from_pipeline_root(self, pipeline_name, pipeline_root):
        # Check if pipeline root created. If not, it means that the user has not
        # created a run yet or the pipeline is still running for the first time.

        if not fileio.exists(pipeline_root):
            sys.exit(
                'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
            )

        # If pipeline_root exists, then check if SchemaGen output exists.
        components = fileio.listdir(pipeline_root)
        if 'SchemaGen' not in components:
            sys.exit(
                'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
            )

        # Get the latest SchemaGen output.
        component_output_dir = os.path.join(pipeline_root, 'SchemaGen')
        schema_dir = os.path.join(component_output_dir, 'schema')
        schemagen_outputs = fileio.listdir(schema_dir)
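        # Output sub-directories are named numerically, so the largest value
        # corresponds to the most recent SchemaGen run.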
        latest_schema_folder = max(schemagen_outputs, key=int)

        # Copy schema to current dir.
        latest_schema_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
            component_output_dir, 'schema', int(latest_schema_folder))
        latest_schema_path = os.path.join(latest_schema_uri, 'schema.pbtxt')
        curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt')
        io_utils.copy_file(latest_schema_path, curr_dir_path, overwrite=True)

        # Print schema and path to schema
        click.echo('Path to schema: {}'.format(curr_dir_path))
        click.echo('*********SCHEMA FOR {}**********'.format(
            pipeline_name.upper()))
        with open(curr_dir_path, 'r') as f:
            click.echo(f.read())
Example #2
    def testTrainerFn(self):
        temp_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        schema_file = os.path.join(self._testdata_path,
                                   'schema_gen/schema.pbtxt')
        trainer_fn_args = trainer_executor.TrainerFnArgs(
            train_files=os.path.join(
                self._testdata_path,
                'transform/transformed_examples/train/*.gz'),
            transform_output=os.path.join(self._testdata_path,
                                          'transform/transform_output/'),
            serving_model_dir=os.path.join(temp_dir, 'serving_model_dir'),
            eval_files=os.path.join(
                self._testdata_path,
                'transform/transformed_examples/eval/*.gz'),
            schema_file=schema_file,
            train_steps=1,
            eval_steps=1,
            base_model=os.path.join(self._testdata_path,
                                    'trainer/current/serving_model_dir'),
            data_accessor=DataAccessor(tf_dataset_factory=tfxio_utils.
                                       get_tf_dataset_factory_from_artifact(
                                           [standard_artifacts.Examples()],
                                           []),
                                       record_batch_factory=None,
                                       data_view_decode_fn=None))
        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
        training_spec = taxi_utils_bqml.trainer_fn(trainer_fn_args, schema)

        estimator = training_spec['estimator']
        train_spec = training_spec['train_spec']
        eval_spec = training_spec['eval_spec']
        eval_input_receiver_fn = training_spec['eval_input_receiver_fn']

        self.assertIsInstance(estimator, tf.estimator.Estimator)
        self.assertIsInstance(train_spec, tf.estimator.TrainSpec)
        self.assertIsInstance(eval_spec, tf.estimator.EvalSpec)
        self.assertIsInstance(eval_input_receiver_fn, types.FunctionType)

        # Train for one step, then eval for one step.
        eval_result, exports = tf.estimator.train_and_evaluate(
            estimator, train_spec, eval_spec)
        self.assertGreater(eval_result['loss'], 0.0)
        self.assertEqual(len(exports), 1)
        self.assertGreaterEqual(len(fileio.listdir(exports[0])), 1)

        # Export the eval saved model.
        eval_savedmodel_path = tfma.export.export_eval_savedmodel(
            estimator=estimator,
            export_dir_base=path_utils.eval_model_dir(temp_dir),
            eval_input_receiver_fn=eval_input_receiver_fn)
        self.assertGreaterEqual(len(fileio.listdir(eval_savedmodel_path)), 1)

        # Test exported serving graph.
        with tf.compat.v1.Session() as sess:
            metagraph_def = tf.compat.v1.saved_model.loader.load(
                sess, [tf.saved_model.SERVING], exports[0])
            self.assertIsInstance(metagraph_def, tf.compat.v1.MetaGraphDef)
Example #3
  def assertExecutedOnce(self, component: Text) -> None:
    """Check the component is executed exactly once."""
    component_path = os.path.join(self._pipeline_root, component)
    self.assertTrue(fileio.exists(component_path))
    outputs = fileio.listdir(component_path)
    for output in outputs:
      execution = fileio.listdir(os.path.join(component_path, output))
      self.assertEqual(1, len(execution))
Example #4
    def _generate_blessing_result(self, eval_examples_uri: Text,
                                  slice_spec: List[
                                      tfma.slicer.SingleSliceSpec],
                                  current_model_dir: Text,
                                  blessed_model_dir: Text) -> bool:
        current_model_eval_result_path = os.path.join(
            self._temp_path, constants.CURRENT_MODEL_EVAL_RESULT_PATH)
        blessed_model_eval_result_path = os.path.join(
            self._temp_path, constants.BLESSED_MODEL_EVAL_RESULT_PATH)

        with self._make_beam_pipeline() as pipeline:
            eval_data = (pipeline | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

            current_model = tfma.default_eval_shared_model(
                eval_saved_model_path=path_utils.eval_model_path(
                    current_model_dir))
            (eval_data
             | 'EvalCurrentModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
                 eval_shared_model=current_model,
                 slice_spec=slice_spec,
                 output_path=current_model_eval_result_path))

            if blessed_model_dir is not None:
                blessed_model = tfma.default_eval_shared_model(
                    eval_saved_model_path=path_utils.eval_model_path(
                        blessed_model_dir))
                (eval_data
                 | 'EvalBlessedModel' >> tfma.ExtractEvaluateAndWriteResults(  # pylint: disable=expression-not-assigned
                     eval_shared_model=blessed_model,
                     slice_spec=slice_spec,
                     output_path=blessed_model_eval_result_path))

        absl.logging.info('all files in current_model_eval_result_path: [%s]',
                          str(fileio.listdir(current_model_eval_result_path)))
        current_model_eval_result = tfma.load_eval_result(
            output_path=current_model_eval_result_path)

        if not self._pass_threshold(current_model_eval_result):
            absl.logging.info('Current model does not pass threshold.')
            return False
        absl.logging.info('Current model passes threshold.')

        if blessed_model_dir is None:
            absl.logging.info('No blessed model yet.')
            return True
        absl.logging.info('all files in blessed_model_eval_result: [%s]',
                          str(fileio.listdir(blessed_model_eval_result_path)))
        blessed_model_eval_result = tfma.load_eval_result(
            output_path=blessed_model_eval_result_path)

        if (self._compare_eval_result(current_model_eval_result,
                                      blessed_model_eval_result)):
            absl.logging.info('Current model better than blessed model.')
            return True
        else:
            absl.logging.info('Current model worse than blessed model.')
            return False
Example #5
  def setUp(self):
    super().setUp()

    # List of packages installed.
    self._pip_list = pip_utils.get_package_names()

    # Check if Apache Airflow is installed before running E2E tests.
    if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
      sys.exit('Apache Airflow not installed.')

    # Change the encoding for Click since Python 3 is configured to use ASCII as
    # encoding for the environment.
    if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
      os.environ['LANG'] = 'en_US.utf-8'

    # Setup airflow_home in a temp directory
    self._airflow_home = os.path.join(self.tmp_dir, 'airflow')
    self.enter_context(
        test_case_utils.override_env_var('AIRFLOW_HOME', self._airflow_home))
    self.enter_context(
        test_case_utils.override_env_var('HOME', self._airflow_home))

    absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                      self._airflow_home)

    # Testdata path.
    self._testdata_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    self._pipeline_name = 'chicago_taxi_simple'
    self._pipeline_path = os.path.join(self._testdata_dir,
                                       'test_pipeline_airflow_1.py')

    # Copy data.
    chicago_taxi_pipeline_dir = os.path.join(
        os.path.dirname(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
        'examples', 'chicago_taxi_pipeline')
    data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
    content = fileio.listdir(data_dir)
    assert content, 'content in {} is empty'.format(data_dir)
    target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
    io_utils.copy_dir(data_dir, target_data_dir)
    assert fileio.isdir(target_data_dir)
    content = fileio.listdir(target_data_dir)
    assert content, 'content in {} is {}'.format(target_data_dir, content)
    io_utils.copy_file(
        os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
        os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

    # Initialize CLI runner.
    self.runner = click_testing.CliRunner()
Example #6
  def _assertNumberOfTrainerOutputIsOne(self, pipeline_name):
    """Make sure the number of trainer executions and output models."""
    # There must be only one execution of Trainer.
    trainer_output_base_dir = os.path.join(
        self._pipeline_root(pipeline_name), 'Trainer', 'model')
    trainer_outputs = fileio.listdir(trainer_output_base_dir)
    self.assertEqual(1, len(trainer_outputs))

    # There must be exactly one saved model each for serving and eval.
    model_uri = os.path.join(trainer_output_base_dir, trainer_outputs[0])
    eval_model_dir = path_utils.eval_model_dir(model_uri)
    serving_model_dir = path_utils.serving_model_dir(model_uri)
    self.assertEqual(1, fileio.listdir(eval_model_dir).count('saved_model.pb'))
    self.assertEqual(1,
                     fileio.listdir(serving_model_dir).count('saved_model.pb'))
Example #7
    def testRecordBeamPipelineRunId(self, mock_metadata, mock_config):
        # Tests recording Beam pipeline outputs given a run_id.
        with mock.patch.object(pipeline_recorder_utils, '_get_execution_dict',
                               return_value=self.execution_dict
                               ) as mock_get_execution_dict,\
            mock.patch.object(pipeline_recorder_utils, '_get_paths',
                              return_value=self.paths
                              ) as mock_get_paths:
            pipeline_recorder_utils.record_pipeline(
                output_dir=self._base_dir,
                metadata_db_uri=self.metadata_db_uri,
                run_id=self.run_id)

            mock_config.assert_called_with(self.metadata_db_uri)
            mock_metadata.assert_called()
            mock_get_execution_dict.assert_called()
            mock_get_paths.assert_called()

            # Verifying that test.txt has been copied from src_uri to dest_uri
            files = fileio.listdir(self.dest_uri)
            self.assertLen(files, 1)
            self.assertEqual(
                io_utils.read_string_file(os.path.join(self.dest_uri,
                                                       files[0])),
                self.content)
Example #8
  def testDo(self):
    source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    statistics_artifact = standard_artifacts.ExampleStatistics()
    statistics_artifact.uri = os.path.join(source_data_dir, 'statistics_gen')
    statistics_artifact.split_names = artifact_utils.encode_split_names(
        ['train', 'eval', 'test'])

    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    schema_output = standard_artifacts.Schema()
    schema_output.uri = os.path.join(output_data_dir, 'schema_output')

    input_dict = {
        standard_component_specs.STATISTICS_KEY: [statistics_artifact],
    }

    exec_properties = {
        # List needs to be serialized before being passed into Do function.
        standard_component_specs.EXCLUDE_SPLITS_KEY:
            json_utils.dumps(['test'])
    }

    output_dict = {
        standard_component_specs.SCHEMA_KEY: [schema_output],
    }

    schema_gen_executor = executor.Executor()
    schema_gen_executor.Do(input_dict, output_dict, exec_properties)
    self.assertNotEqual(0, len(fileio.listdir(schema_output.uri)))
Example #9
def clear_output_dirs(output_dict: Dict[str, List[types.Artifact]]) -> None:
    """Clear dirs of output artifacts' URI."""
    for _, artifact_list in output_dict.items():
        for artifact in artifact_list:
            if fileio.isdir(artifact.uri) and fileio.listdir(artifact.uri):
                fileio.rmtree(artifact.uri)
                fileio.mkdir(artifact.uri)
Example #10
  def _cleanup_kfp_server(self):
    pipelines = fileio.listdir(self._kubeflow_home)
    for pipeline_name in pipelines:
      if fileio.isdir(pipeline_name):
        self._delete_experiment(pipeline_name)
        self._delete_pipeline(pipeline_name)
        self._delete_pipeline_output(pipeline_name)
        self._delete_pipeline_metadata(pipeline_name)
Example #11
  def assertExecutedOnce(self, component: Text) -> None:
    """Check the component is executed exactly once."""
    component_path = os.path.join(self._pipeline_root, component)
    self.assertTrue(fileio.exists(component_path))
    execution_path = os.path.join(
        component_path, '.system', 'executor_execution')
    execution = fileio.listdir(execution_path)
    self.assertLen(execution, 1)
Example #12
def get_only_uri_in_dir(dir_path: Text) -> Text:
    """Gets the only uri from given directory."""

    files = fileio.listdir(dir_path)
    if len(files) != 1:
        raise RuntimeError(
            'Only one file per dir is supported: {}.'.format(dir_path))
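    # Joining with '' and taking dirname strips any trailing path separator
    # from the entry name.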
    filename = os.path.dirname(os.path.join(files[0], ''))
    return os.path.join(dir_path, filename)
Example #13
    def assertExecutedOnce(self, component: Text) -> None:
        """Check the component is executed exactly once."""
        component_path = os.path.join(self._pipeline_root, component)
        self.assertTrue(fileio.exists(component_path))
        outputs = fileio.listdir(component_path)

        self.assertIn('.system', outputs)
        outputs.remove('.system')
        system_paths = [
            os.path.join('.system', path)
            for path in fileio.listdir(os.path.join(component_path, '.system'))
        ]
        self.assertNotEmpty(system_paths)
        self.assertIn('.system/executor_execution', system_paths)
        outputs.extend(system_paths)
        for output in outputs:
            execution = fileio.listdir(os.path.join(component_path, output))
            self.assertLen(execution, 1)
Example #14
  def extractDirectorySpec(self, path):
    """Recursively reads a directory tree into a nested dict of file contents."""
    if fileio.isdir(path):
      result = {}
      for name in fileio.listdir(path):
        result[name] = self.extractDirectorySpec(os.path.join(path, name))
      return result
    elif fileio.exists(path):
      return file_io.FileIO(path, mode='r').read()
    else:
      raise ValueError(f'{path} does not exist.')
Example #15
    def setUp(self):
        super(CliLocalEndToEndTest, self).setUp()

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup local_home in a temp directory
        self._home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._home
        self._old_local_home = os.environ.get('LOCAL_HOME')
        os.environ['LOCAL_HOME'] = os.path.join(self._home, 'local', '')
        self._local_home = os.environ['LOCAL_HOME']

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._home, 'taxi', 'taxi_utils.py'))

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #16
  def assertInfraValidatorPassed(self) -> None:
    infra_validator_path = os.path.join(self._pipeline_root, 'InfraValidator')
    blessing_path = os.path.join(self._pipeline_root, 'InfraValidator',
                                 'blessing')
    executions = fileio.listdir(blessing_path)
    self.assertGreaterEqual(len(executions), 1)
    for exec_id in executions:
      blessing_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
          infra_validator_path, 'blessing', exec_id)
      blessed = os.path.join(blessing_uri, 'INFRA_BLESSED')
      self.assertTrue(fileio.exists(blessed))
Example #17
    def list_pipelines(self) -> None:
        """List all the pipelines in the environment."""
        if not fileio.exists(self._handler_home_dir):
            click.echo('No pipelines to display.')
            return
        pipelines_list = fileio.listdir(self._handler_home_dir)

        # Print every pipeline name in a new line.
        click.echo('-' * 30)
        click.echo('\n'.join(pipelines_list))
        click.echo('-' * 30)
Example #18
    def setUp(self):
        super().setUp()

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup beam_home in a temp directory
        self._home = self.tmp_dir
        self._beam_home = os.path.join(self._home, 'beam')
        self.enter_context(
            test_case_utils.override_env_var('BEAM_HOME', self._beam_home))
        self.enter_context(test_case_utils.override_env_var(
            'HOME', self._home))

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._home, 'taxi', 'taxi_utils.py'))

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #19
def build_ephemeral_package() -> Text:
    """Repackage current installation of TFX into a tfx_ephemeral sdist.

    Returns:
      Path to ephemeral sdist package.
    Raises:
      RuntimeError: if dist directory has zero or multiple files.
    """
    tmp_dir = os.path.join(tempfile.mkdtemp(), 'build', 'tfx')
    # Find the last directory named 'tfx' in this file's path and package it.
    path_split = __file__.split(os.path.sep)
    last_index = -1
    for i in range(len(path_split)):
        if path_split[i] == 'tfx':
            last_index = i
    if last_index < 0:
        raise RuntimeError('Cannot locate directory \'tfx\' in the path %s' %
                           __file__)
    tfx_root_dir = os.path.sep.join(path_split[0:last_index + 1])
    absl.logging.info('Copying all content from install dir %s to temp dir %s',
                      tfx_root_dir, tmp_dir)
    shutil.copytree(tfx_root_dir, os.path.join(tmp_dir, 'tfx'))
    # The source directory's default permission is 0555, but we need to be able
    # to create a new setup.py file.
    os.chmod(tmp_dir, 0o720)
    setup_file = os.path.join(tmp_dir, 'setup.py')
    absl.logging.info('Generating a temp setup file at %s', setup_file)
    install_requires = dependencies.make_required_install_packages()
    io_utils.write_string_file(
        setup_file,
        _ephemeral_setup_file.format(version=version.__version__,
                                     install_requires=install_requires))

    # Create the package
    curdir = os.getcwd()
    os.chdir(tmp_dir)
    temp_log = os.path.join(tmp_dir, 'setup.log')
    with open(temp_log, 'w') as f:
        absl.logging.info(
            'Creating temporary sdist package, logs available at %s', temp_log)
        cmd = [sys.executable, setup_file, 'sdist']
        subprocess.call(cmd, stdout=f, stderr=f)
    os.chdir(curdir)

    # Return the package dir+filename
    dist_dir = os.path.join(tmp_dir, 'dist')
    files = fileio.listdir(dist_dir)
    if not files:
        raise RuntimeError('Found no package files in %s' % dist_dir)
    elif len(files) > 1:
        raise RuntimeError('Found multiple package files in %s' % dist_dir)

    return os.path.join(dist_dir, files[0])
Example #20
    def assertPushed(self):
        self.assertGreater(self._GetNumberOfFiles(self._serving_model_dir), 0)
        pushed_path = os.path.join(self._serving_model_dir,
                                   fileio.listdir(self._serving_model_dir)[0])
        self.assertGreater(self._GetNumberOfFiles(pushed_path), 0)
        model_path = self._executor.GetModelPath(self._input_dict)
        self.assertEqual(self._GetNumberOfFiles(pushed_path),
                         self._GetNumberOfFiles(model_path))
        self.assertEqual(self._GetNumberOfFiles(self._model_push.uri),
                         self._GetNumberOfFiles(model_path))

        self.assertEqual(1, self._model_push.get_int_custom_property('pushed'))
Example #21
  def _assertHyperparametersAreWritten(self, pipeline_name):
    """Make sure the tuner execution and hyperpearameters output."""
    # There must be only one execution of Tuner.
    tuner_output_base_dir = os.path.join(
        self._pipeline_root(pipeline_name), 'Tuner', 'best_hyperparameters')
    tuner_outputs = fileio.listdir(tuner_output_base_dir)
    self.assertEqual(1, len(tuner_outputs))

    # There must be exactly one best-hyperparameters output.
    best_hyperparameters_uri = os.path.join(tuner_output_base_dir,
                                            tuner_outputs[0])
    self.assertTrue(fileio.exists(best_hyperparameters_uri))
Example #22
    def list_pipelines(self) -> None:
        """List all the pipelines in the environment."""
        # There is no managed storage for pipeline packages, so CLI consults
        # local dir to list pipelines.
        if not fileio.exists(self._handler_home_dir):
            click.echo('No pipelines to display.')
            return
        pipelines_list = fileio.listdir(self._handler_home_dir)

        # Print every pipeline name in a new line.
        click.echo('-' * 30)
        click.echo('\n'.join(pipelines_list))
        click.echo('-' * 30)
Example #23
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """Copy the input_data to the output_data.

    For this example that is all that the Executor does.  For a different
    custom component, this is where the real functionality of the component
    would be included.

    This component both reads and writes Examples, but a different component
    might read and write artifacts of other types.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of type `standard_artifacts.Examples` which will
          often contain two splits, 'train' and 'eval'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output_data: A list of type `standard_artifacts.Examples` which will
          usually contain the same splits as input_data.
      exec_properties: A dict of execution properties, including:
        - name: Optional unique name. Necessary iff multiple Hello components
          are declared in the same pipeline.

    Returns:
      None

    Raises:
      OSError and its subclasses
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    input_artifact = artifact_utils.get_single_instance(
        input_dict['input_data'])
    output_artifact = artifact_utils.get_single_instance(
        output_dict['output_data'])
    output_artifact.split_names = input_artifact.split_names

    split_to_instance = {}

    for split in json.loads(input_artifact.split_names):
      uri = artifact_utils.get_split_uri([input_artifact], split)
      split_to_instance[split] = uri

    for split, instance in split_to_instance.items():
      input_dir = instance
      output_dir = artifact_utils.get_split_uri([output_artifact], split)
      for filename in fileio.listdir(input_dir):
        input_uri = os.path.join(input_dir, filename)
        output_uri = os.path.join(output_dir, filename)
        io_utils.copy_file(src=input_uri, dst=output_uri, overwrite=True)
Example #24
  def testRecordLatestKfpPipeline(self, mock_get_latest_executions):
    # Tests recording KFP pipeline outputs for the latest execution.
    with mock.patch.object(
        pipeline_recorder_utils, '_get_paths',
        return_value=self.paths) as mock_get_paths:
      pipeline_recorder_utils.record_pipeline(
          output_dir=self._base_dir,
          host=self.host,
          port=self.port,
          pipeline_name=self.pipeline_name)
      mock_get_paths.assert_called()
      mock_get_latest_executions.assert_called()

      files = fileio.listdir(self.dest_uri)
      self.assertLen(files, 1)
      self.assertEqual(
          io_utils.read_string_file(os.path.join(self.dest_uri, files[0])),
          self.content)
Example #25
    def testMakeClearAndRemoveOutputDirs(self):
        output_artifacts = self._output_resolver().generate_output_artifacts(1)
        outputs_utils.make_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                if isinstance(artifact, ValueArtifact):
                    self.assertFalse(fileio.isdir(artifact.uri))
                else:
                    self.assertTrue(fileio.isdir(artifact.uri))
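                    # Write a placeholder file so the directory is non-empty
                    # before clear_output_dirs is exercised below.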
                    with fileio.open(os.path.join(artifact.uri, 'output'),
                                     'w') as f:
                        f.write('')
                self.assertTrue(fileio.exists(artifact.uri))

        outputs_utils.clear_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                if not isinstance(artifact, ValueArtifact):
                    self.assertEqual(fileio.listdir(artifact.uri), [])

        outputs_utils.remove_output_dirs(output_artifacts)
        for _, artifact_list in output_artifacts.items():
            for artifact in artifact_list:
                self.assertFalse(fileio.exists(artifact.uri))
Example #26
  def assertDirectoryNotEmpty(self, path):
    self.assertGreater(len(fileio.listdir(path)), 0)
Example #27
    def setUp(self):
        super(AirflowEndToEndTest, self).setUp()
        # setup airflow_home in a temp directory, config and init db.
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)

        self._mysql_container_name = 'airflow_' + test_utils.generate_random_id(
        )
        db_port = airflow_test_utils.create_mysql_container(
            self._mysql_container_name)
        self.addCleanup(airflow_test_utils.delete_mysql_container,
                        self._mysql_container_name)
        os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
            'mysql://[email protected]:%d/airflow' % db_port)

        # Set a couple of important environment variables. See
        # https://airflow.apache.org/howto/set-config.html for details.
        os.environ['AIRFLOW__CORE__DAGS_FOLDER'] = os.path.join(
            self._airflow_home, 'dags')
        os.environ['AIRFLOW__CORE__BASE_LOG_FOLDER'] = os.path.join(
            self._airflow_home, 'logs')
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
        # Following environment variables make scheduler process dags faster.
        os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
        os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
        os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'

        # Following fields are specific to the chicago_taxi_simple example.
        self._dag_id = 'chicago_taxi_simple'
        self._run_id = 'manual_run_id_1'
        # This execution date must be after the start_date in chicago_taxi_simple
        # but before current execution date.
        self._execution_date = '2019-02-01T01:01:01'
        self._all_tasks = [
            'CsvExampleGen',
            'Evaluator',
            'ExampleValidator',
            'Pusher',
            'SchemaGen',
            'StatisticsGen',
            'Trainer',
            'Transform',
        ]
        # Copy dag file and data.
        chicago_taxi_pipeline_dir = os.path.dirname(__file__)
        simple_pipeline_file = os.path.join(chicago_taxi_pipeline_dir,
                                            'taxi_pipeline_simple.py')

        io_utils.copy_file(
            simple_pipeline_file,
            os.path.join(self._airflow_home, 'dags',
                         'taxi_pipeline_simple.py'))

        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        # Initialize database.
        subprocess.run(['airflow', 'initdb'], check=True)
        subprocess.run(['airflow', 'unpause', self._dag_id], check=True)
Example #28
    def setUp(self):
        super(CliAirflowEndToEndTest, self).setUp()

        # List of packages installed.
        self._pip_list = str(
            subprocess.check_output(['pip', 'freeze', '--local']))

        # Check if Apache Airflow is installed before running E2E tests.
        if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
            sys.exit('Apache Airflow not installed.')

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup airflow_home in a temp directory
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName, 'airflow')
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        self._pipeline_name = 'chicago_taxi_simple'
        self._pipeline_path = os.path.join(self._testdata_dir,
                                           'test_pipeline_airflow_1.py')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        self._mysql_container_name = 'airflow_' + test_utils.generate_random_id(
        )
        db_port = airflow_test_utils.create_mysql_container(
            self._mysql_container_name)
        self.addCleanup(self._cleanup_mysql_container)
        os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
            'mysql://[email protected]:%d/airflow' % db_port)
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

        self._airflow_initdb()

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #29
    def _verify_transform_outputs(self,
                                  materialize=True,
                                  store_cache=True,
                                  multiple_example_inputs=False,
                                  compute_statistics=False):
        expected_outputs = ['transformed_graph']

        if store_cache:
            expected_outputs.append('CACHE')
            self.assertNotEqual(
                0,
                len(fileio.listdir(self._updated_analyzer_cache_artifact.uri)))

        example_artifacts = self._example_artifacts[:1]
        transformed_example_artifacts = self._transformed_example_artifacts[:1]
        if multiple_example_inputs:
            example_artifacts = self._example_artifacts
            transformed_example_artifacts = self._transformed_example_artifacts

        if materialize:
            expected_outputs.append('transformed_examples')

            assert len(example_artifacts) == len(transformed_example_artifacts)
            for example, transformed_example in zip(
                    example_artifacts, transformed_example_artifacts):
                examples_train_files = fileio.glob(
                    os.path.join(example.uri, 'Split-train', '*'))
                transformed_train_files = fileio.glob(
                    os.path.join(transformed_example.uri, 'Split-train', '*'))
                self.assertGreater(len(transformed_train_files), 0)

                examples_eval_files = fileio.glob(
                    os.path.join(example.uri, 'Split-eval', '*'))
                transformed_eval_files = fileio.glob(
                    os.path.join(transformed_example.uri, 'Split-eval', '*'))
                self.assertGreater(len(transformed_eval_files), 0)

                # Construct datasets and count number of records in each split.
                examples_train_count = _get_dataset_size(examples_train_files)
                transformed_train_count = _get_dataset_size(
                    transformed_train_files)
                examples_eval_count = _get_dataset_size(examples_eval_files)
                transformed_eval_count = _get_dataset_size(
                    transformed_eval_files)

                # Check for each split that it contains the same number of records in
                # the input artifact as in the output artifact (i.e 1-to-1 mapping is
                # preserved).
                self.assertEqual(examples_train_count, transformed_train_count)
                self.assertEqual(examples_eval_count, transformed_eval_count)
                self.assertGreater(transformed_train_count,
                                   transformed_eval_count)

        path_to_pre_transform_statistics = os.path.join(
            self._transformed_output.uri,
            tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH)
        path_to_post_transform_statistics = os.path.join(
            self._transformed_output.uri,
            tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH)
        if compute_statistics:
            self.assertTrue(fileio.exists(path_to_pre_transform_statistics))
            self.assertTrue(fileio.exists(path_to_post_transform_statistics))
        else:
            self.assertFalse(fileio.exists(path_to_pre_transform_statistics))
            self.assertFalse(fileio.exists(path_to_post_transform_statistics))

        # Depending on `materialize` and `store_cache`, check that
        # expected outputs are exactly correct. If either flag is False, its
        # respective output should not be present.
        self.assertCountEqual(expected_outputs,
                              fileio.listdir(self._output_data_dir))

        path_to_saved_model = os.path.join(
            self._transformed_output.uri,
            tft.TFTransformOutput.TRANSFORM_FN_DIR,
            tf.saved_model.SAVED_MODEL_FILENAME_PB)
        self.assertTrue(fileio.exists(path_to_saved_model))
Example #30
  def assertDirectoryEmpty(self, path):
    self.assertEqual(len(fileio.listdir(path)), 0)