Example #1
    def testUpdatePipeline(self):
        # First create a pipeline from test_pipeline_airflow_1.py.
        pipeline_path_1 = os.path.join(self.chicago_taxi_pipeline_dir,
                                       'test_pipeline_airflow_1.py')
        flags_dict_1 = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: pipeline_path_1
        }
        handler = airflow_handler.AirflowHandler(flags_dict_1)
        handler.create_pipeline()

        # Update test_pipeline and run update_pipeline
        pipeline_path_2 = os.path.join(self._tmp_dir,
                                       'test_pipeline_airflow_2.py')
        io_utils.copy_file(pipeline_path_1, pipeline_path_2)
        flags_dict_2 = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: pipeline_path_2
        }
        handler = airflow_handler.AirflowHandler(flags_dict_2)
        handler.update_pipeline()
        handler_pipeline_path = os.path.join(
            handler._handler_home_dir,
            self.pipeline_args[labels.PIPELINE_NAME], '')
        self.assertTrue(
            tf.io.gfile.exists(
                os.path.join(handler_pipeline_path,
                             'test_pipeline_airflow_2.py')))
        self.assertTrue(
            tf.io.gfile.exists(
                os.path.join(handler_pipeline_path, 'pipeline_args.json')))
Example #2
  def _save_pipeline(self, pipeline_args: Dict[str, Any]) -> None:
    """Creates/updates pipeline folder in the handler directory.

    Args:
      pipeline_args: Pipeline details obtained from DSL.
    """
    pipeline_name = pipeline_args[labels.PIPELINE_NAME]
    handler_pipeline_path = self._get_pipeline_info_path(pipeline_name)

    # If updating pipeline, first delete pipeline directory.
    if fileio.exists(handler_pipeline_path):
      io_utils.delete_dir(handler_pipeline_path)

    # Dump pipeline_args to handler pipeline folder as json.
    fileio.makedirs(handler_pipeline_path)
    with open(os.path.join(
        handler_pipeline_path, 'pipeline_args.json'), 'w') as f:
      json.dump(pipeline_args, f)

    # Copy dsl to pipeline folder
    pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
    io_utils.copy_file(
        pipeline_dsl_path,
        os.path.join(handler_pipeline_path,
                     os.path.basename(pipeline_dsl_path)))
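For context, the folder written by _save_pipeline above is what later handler commands read back. A minimal sketch of that read side, assuming the same layout; the handler home path and pipeline name below are placeholders, not values from the snippet:

import json
import os

# Placeholder layout: the handler home and pipeline name are assumptions here;
# in the handlers above they come from flags_dict and labels.
pipeline_dir = os.path.join(os.environ['HOME'], 'airflow', 'chicago_taxi_simple')

# _save_pipeline dumped the DSL-derived args into this JSON file.
with open(os.path.join(pipeline_dir, 'pipeline_args.json')) as f:
  pipeline_args = json.load(f)
print(sorted(pipeline_args))  # e.g. keys such as the pipeline name and root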
Example #3
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """ImportSchemaGen executor entrypoint.

    This generates a Schema artifact from the given schema_file.

    Args:
      input_dict: Should be empty.
      output_dict: Output dict from key to a list of artifacts, including:
        - schema: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - schema_file: Source schema file path.

    Returns:
      None
    """
    source_file_path = exec_properties.get(
        standard_component_specs.SCHEMA_FILE_KEY)
    if not source_file_path:
      raise ValueError('Schema file path is missing in exec_properties.')
    output_uri = os.path.join(
        artifact_utils.get_single_uri(
            output_dict[standard_component_specs.SCHEMA_KEY]),
        schema_gen_executor.DEFAULT_FILE_NAME)

    # Check whether the input file has a proper schema proto.
    _ = io_utils.SchemaReader().read(source_file_path)

    io_utils.copy_file(source_file_path, output_uri)
    logging.info('Copied a schema file from %s to %s.', source_file_path,
                 output_uri)
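The executor above parses the source schema before copying it, so an invalid file never reaches the output location. A standalone sketch of that validate-then-copy pattern, assuming tfx.utils.io_utils is importable (the import path and file paths are assumptions):

import os

from tfx.utils import io_utils  # import path is an assumption

source_file_path = '/tmp/schema/schema.pbtxt'  # placeholder input path
output_uri = '/tmp/schema_out/schema.pbtxt'    # placeholder output path

# Parsing first ensures we never copy a file that is not a valid Schema proto.
_ = io_utils.SchemaReader().read(source_file_path)
io_utils.copy_file(source_file_path, output_uri, overwrite=True)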
Example #4
def copy_template(flags_dict: Dict[Text, Any]) -> None:
    """Copy template flags_dict["model"] to flags_dict["dest_dir"].

    Copies all *.py and README files in the specified template, and replaces
    the content of the files.

    Args:
      flags_dict: Should have pipeline_name, model and dest_dir.
    """
    pipeline_name = _sanitize_pipeline_name(flags_dict[labels.PIPELINE_NAME])
    template_dir = os.path.join(_templates_src_dir(), flags_dict[labels.MODEL])
    destination_dir = flags_dict[labels.DESTINATION_PATH]
    if not os.path.isdir(template_dir):
        raise ValueError('Model {} does not exist.'.format(
            flags_dict[labels.MODEL]))

    replace_dict = {
        _IMPORT_FROM_PACKAGE: _IMPORT_FROM_LOCAL_DIR,
        _PLACEHOLDER_PIPELINE_NAME: pipeline_name,
        _INTERNAL_TODO_PREFIX: '',
    }
    _copy_and_replace_placeholder_dir(template_dir, destination_dir,
                                      replace_dict)
    for additional_file in _ADDITIONAL_FILE_PATHS[flags_dict[labels.MODEL]]:
        src_path = os.path.join(_tfx_src_dir(), additional_file.src)
        dst_path = os.path.join(destination_dir, additional_file.dst)
        io_utils.copy_file(src_path, dst_path)
Example #5
    def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
        """Creates/updates pipeline folder in the handler directory.

        Args:
          pipeline_args: Pipeline details obtained from DSL.
        """
        # Path to pipeline folder in Airflow.
        handler_pipeline_path = os.path.join(
            self._handler_home_dir, pipeline_args[labels.PIPELINE_NAME], '')

        # If updating pipeline, first delete pipeline directory.
        if tf.io.gfile.exists(handler_pipeline_path):
            io_utils.delete_dir(handler_pipeline_path)

        # Dump pipeline_args to handler pipeline folder as json.
        tf.io.gfile.makedirs(handler_pipeline_path)
        with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
                  'w') as f:
            json.dump(pipeline_args, f)

        # Copy dsl to pipeline folder
        pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
        io_utils.copy_file(
            pipeline_dsl_path,
            os.path.join(handler_pipeline_path,
                         os.path.basename(pipeline_dsl_path)))
Example #6
    def setUp(self):
        super(CliAirflowEndToEndTest, self).setUp()

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup airflow_home in a temp directory
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        tf.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                        self._airflow_home)

        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
        # Following environment variables make scheduler process dags faster.
        os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
        os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
        os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'
        # Using more than one thread results in a warning for sqlite backend.
        # See https://github.com/tensorflow/tfx/issues/141
        os.environ['AIRFLOW__SCHEDULER__MAX_THREADS'] = '1'

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = tf.gfile.ListDirectory(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert tf.gfile.IsDirectory(target_data_dir)
        content = tf.gfile.ListDirectory(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        # Initialize database.
        _ = subprocess.check_output(['airflow', 'initdb'])

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #7
    def _read_schema_from_pipeline_root(self, pipeline_name, pipeline_root):
        # Check if pipeline root created. If not, it means that the user has not
        # created a run yet or the pipeline is still running for the first time.

        if not fileio.exists(pipeline_root):
            sys.exit(
                'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
            )

        # If pipeline_root exists, then check if SchemaGen output exists.
        components = fileio.listdir(pipeline_root)
        if 'SchemaGen' not in components:
            sys.exit(
                'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
            )

        # Get the latest SchemaGen output.
        component_output_dir = os.path.join(pipeline_root, 'SchemaGen')
        schema_dir = os.path.join(component_output_dir, 'schema')
        schemagen_outputs = fileio.listdir(schema_dir)
        latest_schema_folder = max(schemagen_outputs, key=int)

        # Copy schema to current dir.
        latest_schema_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
            component_output_dir, 'schema', int(latest_schema_folder))
        latest_schema_path = os.path.join(latest_schema_uri, 'schema.pbtxt')
        curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt')
        io_utils.copy_file(latest_schema_path, curr_dir_path, overwrite=True)

        # Print schema and path to schema
        click.echo('Path to schema: {}'.format(curr_dir_path))
        click.echo('*********SCHEMA FOR {}**********'.format(
            pipeline_name.upper()))
        with open(curr_dir_path, 'r') as f:
            click.echo(f.read())
Example #8
  def _save_pipeline(self, pipeline_args) -> None:
    """Creates/updates pipeline folder in the handler directory."""

    # Path to pipeline folder in airflow.
    handler_pipeline_path = self._get_handler_pipeline_path(
        pipeline_args[labels.PIPELINE_NAME])

    # If updating pipeline, first delete pipeline directory.
    if tf.io.gfile.exists(handler_pipeline_path):
      io_utils.delete_dir(handler_pipeline_path)

    # Dump pipeline_args to handler pipeline folder as json.
    tf.io.gfile.makedirs(handler_pipeline_path)
    with open(os.path.join(
        handler_pipeline_path, 'pipeline_args.json'), 'w') as f:
      json.dump(pipeline_args, f)

    # Copy dsl to pipeline folder
    io_utils.copy_file(
        self.flags_dict[labels.PIPELINE_DSL_PATH],
        os.path.join(
            handler_pipeline_path,
            os.path.basename(self.flags_dict[labels.PIPELINE_DSL_PATH])))
Example #9
 def upload_pipeline(self, pipeline_package_path, pipeline_name):  # pylint: disable=invalid-name, unused-argument
     io_utils.copy_file(pipeline_package_path,
                        os.path.join(
                            self._output_dir,
                            os.path.basename(pipeline_package_path)),
                        overwrite=True)
     return _MockUploadResponse(self.config)
Example #10
 def testCopyFile(self):
   file_path = os.path.join(self._base_dir, 'temp_file')
   io_utils.write_string_file(file_path, 'testing')
   copy_path = os.path.join(self._base_dir, 'copy_file')
   io_utils.copy_file(file_path, copy_path)
   self.assertTrue(file_io.file_exists(copy_path))
   f = file_io.FileIO(file_path, mode='r')
   self.assertEqual('testing', f.read())
   self.assertEqual(7, f.tell())
Example #11
 def testCopyFile(self):
     file_path = os.path.join(self._base_dir, 'temp_file')
     io_utils.write_string_file(file_path, 'testing')
     copy_path = os.path.join(self._base_dir, 'copy_file')
     io_utils.copy_file(file_path, copy_path)
     self.assertTrue(file_io.file_exists(copy_path))
     f = file_io.FileIO(file_path, mode='r')
     self.assertEqual('testing', f.read())
     self.assertEqual(7, f.tell())
Example #12
 def testCopyFile(self):
   self.createFiles({
       'file1.txt': 'testing'
   })
   io_utils.copy_file(self.relpath('file1.txt'), self.relpath('file2.txt'))
   self.assertDirectoryEqual(self._base_dir, {
       'file1.txt': 'testing',
       'file2.txt': 'testing'
   })
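The three testCopyFile variants above exercise the same basic call. A minimal standalone sketch of that call, assuming a TFX installation where tfx.utils.io_utils is importable (the import path is an assumption):

import os
import tempfile

from tfx.utils import io_utils  # import path is an assumption

base_dir = tempfile.mkdtemp()
src = os.path.join(base_dir, 'temp_file')
dst = os.path.join(base_dir, 'copy_file')

io_utils.write_string_file(src, 'testing')    # create a small source file
io_utils.copy_file(src, dst)                  # plain copy
io_utils.copy_file(src, dst, overwrite=True)  # replace an existing destination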
Example #13
  def setUp(self):
    super(CliAirflowEndToEndTest, self).setUp()

    # List of packages installed.
    self._pip_list = str(subprocess.check_output(['pip', 'freeze', '--local']))

    # Check if Apache Airflow is installed before running E2E tests.
    if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
      sys.exit('Apache Airflow not installed.')

    # Change the encoding for Click since Python 3 is configured to use ASCII as
    # encoding for the environment.
    if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
      os.environ['LANG'] = 'en_US.utf-8'

    # Setup airflow_home in a temp directory
    self._airflow_home = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
        self._testMethodName, 'airflow')
    self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
    os.environ['AIRFLOW_HOME'] = self._airflow_home
    self._old_home = os.environ.get('HOME')
    os.environ['HOME'] = self._airflow_home
    absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                      self._airflow_home)

    # Testdata path.
    self._testdata_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    # Do not load examples to make this a bit faster.
    os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

    # Copy data.
    chicago_taxi_pipeline_dir = os.path.join(
        os.path.dirname(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
        'examples', 'chicago_taxi_pipeline', '')
    data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
    content = tf.io.gfile.listdir(data_dir)
    assert content, 'content in {} is empty'.format(data_dir)
    target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
    io_utils.copy_dir(data_dir, target_data_dir)
    assert tf.io.gfile.isdir(target_data_dir)
    content = tf.io.gfile.listdir(target_data_dir)
    assert content, 'content in {} is {}'.format(target_data_dir, content)
    io_utils.copy_file(
        os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
        os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

    self._airflow_initdb()

    # Initialize CLI runner.
    self.runner = click_testing.CliRunner()
Example #14
  def setUp(self):
    super().setUp()

    # List of packages installed.
    self._pip_list = pip_utils.get_package_names()

    # Check if Apache Airflow is installed before running E2E tests.
    if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
      sys.exit('Apache Airflow not installed.')

    # Change the encoding for Click since Python 3 is configured to use ASCII as
    # encoding for the environment.
    if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
      os.environ['LANG'] = 'en_US.utf-8'

    # Setup airflow_home in a temp directory
    self._airflow_home = os.path.join(self.tmp_dir, 'airflow')
    self.enter_context(
        test_case_utils.override_env_var('AIRFLOW_HOME', self._airflow_home))
    self.enter_context(
        test_case_utils.override_env_var('HOME', self._airflow_home))

    absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                      self._airflow_home)

    # Testdata path.
    self._testdata_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')

    self._pipeline_name = 'chicago_taxi_simple'
    self._pipeline_path = os.path.join(self._testdata_dir,
                                       'test_pipeline_airflow_1.py')

    # Copy data.
    chicago_taxi_pipeline_dir = os.path.join(
        os.path.dirname(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
        'examples', 'chicago_taxi_pipeline')
    data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
    content = fileio.listdir(data_dir)
    assert content, 'content in {} is empty'.format(data_dir)
    target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
    io_utils.copy_dir(data_dir, target_data_dir)
    assert fileio.isdir(target_data_dir)
    content = fileio.listdir(target_data_dir)
    assert content, 'content in {} is {}'.format(target_data_dir, content)
    io_utils.copy_file(
        os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
        os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

    # Initialize CLI runner.
    self.runner = click_testing.CliRunner()
Example #15
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Copy the input_data to the output_data.

        For this example that is all that the Executor does.  For a different
        custom component, this is where the real functionality of the component
        would be included.

        This component both reads and writes Examples, but a different component
        might read and write artifacts of other types.

        Args:
          input_dict: Input dict from input key to a list of artifacts, including:
            - input_data: A list of type `standard_artifacts.Examples` which will
              often contain two splits, 'train' and 'eval'.
          output_dict: Output dict from key to a list of artifacts, including:
            - output_data: A list of type `standard_artifacts.Examples` which will
              usually contain the same splits as input_data.
          exec_properties: A dict of execution properties, including:
            - name: Optional unique name. Necessary iff multiple Hello components
              are declared in the same pipeline.

        Returns:
          None

        Raises:
          OSError and its subclasses
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        input_artifact = artifact_utils.get_single_instance(
            input_dict['input_data'])
        output_artifact = artifact_utils.get_single_instance(
            output_dict['output_data'])
        output_artifact.split_names = input_artifact.split_names

        split_to_instance = {}

        for split in json.loads(input_artifact.split_names):
            uri = artifact_utils.get_split_uri([input_artifact], split)
            split_to_instance[split] = uri

        for split, instance in split_to_instance.items():
            input_dir = instance
            output_dir = artifact_utils.get_split_uri([output_artifact], split)
            for filename in tf.io.gfile.listdir(input_dir):
                input_uri = os.path.join(input_dir, filename)
                output_uri = os.path.join(output_dir, filename)
                io_utils.copy_file(src=input_uri,
                                   dst=output_uri,
                                   overwrite=True)
Example #16
    def get_schema(self):
        pipeline_name = self.flags_dict[labels.PIPELINE_NAME]

        # Check if pipeline exists.
        self._check_pipeline_existence(pipeline_name)

        # Path to pipeline args.
        pipeline_args_path = os.path.join(
            self._handler_home_dir, self.flags_dict[labels.PIPELINE_NAME],
            'pipeline_args.json')

        # Get pipeline_root.
        with open(pipeline_args_path, 'r') as f:
            pipeline_args = json.load(f)

        # Check if pipeline root created. If not, it means that the user has not
        # created a run yet or the pipeline is still running for the first time.
        pipeline_root = pipeline_args[labels.PIPELINE_ROOT]
        if not tf.io.gfile.exists(pipeline_root):
            sys.exit(
                'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
            )

        # If pipeline_root exists, then check if SchemaGen output exists.
        components = tf.io.gfile.listdir(pipeline_root)
        if 'SchemaGen' not in components:
            sys.exit(
                'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
            )

        # Get the latest SchemaGen output.
        component_output_dir = os.path.join(pipeline_root, 'SchemaGen')
        schema1_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
            component_output_dir, 'schema', 1)
        schema_dir = os.path.join(os.path.dirname(schema1_uri), '')
        schemagen_outputs = tf.io.gfile.listdir(schema_dir)
        latest_schema_folder = max(schemagen_outputs, key=int)

        # Copy schema to current dir.
        latest_schema_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
            component_output_dir, 'schema', latest_schema_folder)
        latest_schema_path = os.path.join(latest_schema_uri, 'schema.pbtxt')
        curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt')
        io_utils.copy_file(latest_schema_path, curr_dir_path, overwrite=True)

        # Print schema and path to schema
        click.echo('Path to schema: {}'.format(curr_dir_path))
        click.echo('*********SCHEMA FOR {}**********'.format(
            pipeline_name.upper()))
        with open(curr_dir_path, 'r') as f:
            click.echo(f.read())
Example #17
    def setUp(self):
        super(CliAirflowEndToEndTest, self).setUp()

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup airflow_home in a temp directory
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        tf.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                        self._airflow_home)

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = tf.gfile.ListDirectory(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert tf.gfile.IsDirectory(target_data_dir)
        content = tf.gfile.ListDirectory(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        # Initialize database.
        _ = subprocess.check_output(['airflow', 'initdb'])

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()

        # Start scheduler.
        self._scheduler = subprocess.Popen(['airflow', 'scheduler'])
Example #18
    def setUpClass(cls):
        super(ExecutorTest, cls).setUpClass()
        source_example_dir = os.path.join(cls._SOURCE_DATA_DIR,
                                          'csv_example_gen')

        io_utils.copy_dir(source_example_dir, cls._ARTIFACT1_URI)
        io_utils.copy_dir(source_example_dir, cls._ARTIFACT2_URI)

        # Duplicate the train and eval records so that the second artifact has
        # twice as many as the first.
        artifact2_pattern = os.path.join(cls._ARTIFACT2_URI, '*', '*')
        artifact2_files = tf.io.gfile.glob(artifact2_pattern)
        for filepath in artifact2_files:
            directory, filename = os.path.split(filepath)
            io_utils.copy_file(filepath,
                               os.path.join(directory, 'dup_' + filename))
Example #19
    def setUp(self):
        super(CliBeamEndToEndTest, self).setUp()

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup beam_home in a temp directory
        self._home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._home
        self._old_beam_home = os.environ.get('BEAM_HOME')
        os.environ['BEAM_HOME'] = os.path.join(self._home, 'beam', '')
        self._beam_home = os.environ['BEAM_HOME']

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = tf.io.gfile.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert tf.io.gfile.isdir(target_data_dir)
        content = tf.io.gfile.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._home, 'taxi', 'taxi_utils.py'))

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #20
    def setUp(self):
        super().setUp()

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup beam_home in a temp directory
        self._home = self.tmp_dir
        self._beam_home = os.path.join(self._home, 'beam')
        self.enter_context(
            test_case_utils.override_env_var('BEAM_HOME', self._beam_home))
        self.enter_context(test_case_utils.override_env_var(
            'HOME', self._home))

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._home, 'taxi', 'taxi_utils.py'))

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #21
  def setUp(self):
    super().setUp()

    penguin_examples_dir = os.path.join(self._REPO_BASE, 'tfx', 'examples',
                                        'penguin')
    # The location of the penguin test data and schema. The input files are
    # copied to a test-local location for each invocation and cleaned up at the
    # end of the test.
    penguin_test_data_root = os.path.join(penguin_examples_dir, 'data')
    penguin_test_schema_file = os.path.join(penguin_examples_dir, 'schema',
                                            'user_provided', 'schema.pbtxt')

    # The location of the user module for penguin. Will be packaged and copied
    # to under the pipeline root before pipeline execution.
    self._penguin_dependency_file = os.path.join(
        penguin_examples_dir, 'penguin_utils_cloud_tuner.py')

    self._penguin_data_root = os.path.join(self._testdata_root, 'data')
    io_utils.copy_dir(penguin_test_data_root, self._penguin_data_root)
    self._penguin_schema_file = os.path.join(self._testdata_root,
                                             'schema.pbtxt')
    io_utils.copy_file(penguin_test_schema_file, self._penguin_schema_file)
Example #22
def copy_over(input_artifact, output_artifact, splits_to_copy):
    """
  Copy data from specified splits
  Args:
    input_artifact: location where the input splits are
    output_artifact: location where to copy them
    splits_to_copy: list of split names to copy
  Returns:
    None
  """
    split_to_instance = {}

    for split in splits_to_copy:
        uri = artifact_utils.get_split_uri(input_artifact, split)
        split_to_instance[split] = uri

    for split, instance in split_to_instance.items():
        input_dir = instance
        output_dir = artifact_utils.get_split_uri([output_artifact], split)
        for filename in tf.io.gfile.listdir(input_dir):
            input_uri = os.path.join(input_dir, filename)
            output_uri = os.path.join(output_dir, filename)
            io_utils.copy_file(src=input_uri, dst=output_uri, overwrite=True)
Example #23
    def Do(self, input_dict: Dict[str, List[types.Artifact]],
           output_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any]) -> None:
        """Push model to target directory if blessed.

        Args:
          input_dict: Input dict from input key to a list of artifacts, including:
            - model: exported model from trainer.
            - model_blessing: model blessing path from model_validator.  A push
              action delivers the model exports produced by Trainer to the
              destination defined in component config.
          output_dict: Output dict from key to a list of artifacts, including:
            - pushed_model: A list of 'ModelPushPath' artifact of size one. It will
              include the model in this push execution if the model was pushed.
          exec_properties: A dict of execution properties, including:
            - push_destination: JSON string of pusher_pb2.PushDestination instance,
              providing instruction of destination to push model.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)
        model_push = artifact_utils.get_single_instance(
            output_dict[standard_component_specs.PUSHED_MODEL_KEY])
        if not self.CheckBlessing(input_dict):
            self._MarkNotPushed(model_push)
            return
        model_path = self.GetModelPath(input_dict)

        # Push model to the destination, which can be listened by a model server.
        #
        # If the model has already been successfully copied outside before, stop
        # copying. The model validator might bless the same model twice (see the
        # mv driver) with different blessing outputs; we still want Pusher to
        # handle the mv output again to keep metadata tracking, but there is no
        # need to copy it to the outside path again.
        # TODO(jyzhao): support rpc push and verification.
        push_destination = pusher_pb2.PushDestination()
        proto_utils.json_to_proto(
            exec_properties[standard_component_specs.PUSH_DESTINATION_KEY],
            push_destination)

        destination_kind = push_destination.WhichOneof('destination')
        if destination_kind == 'filesystem':
            fs_config = push_destination.filesystem
            if fs_config.versioning == _Versioning.AUTO:
                fs_config.versioning = _Versioning.UNIX_TIMESTAMP
            if fs_config.versioning == _Versioning.UNIX_TIMESTAMP:
                model_version = str(int(time.time()))
            else:
                raise NotImplementedError('Invalid Versioning {}'.format(
                    fs_config.versioning))
            logging.info('Model version: %s', model_version)
            serving_path = os.path.join(fs_config.base_directory,
                                        model_version)

            if fileio.exists(serving_path):
                logging.info(
                    'Destination directory %s already exists, skipping current push.',
                    serving_path)
            else:
                # For TensorFlow SavedModel, saved_model.pb file should be the last file
                # to be copied as TF serving and other codes rely on that file as an
                # indication that the model is available.
                # https://github.com/tensorflow/tensorflow/blob/d5b3c79b4804134d0d17bfce9f312151f6337dba/tensorflow/python/saved_model/save.py#L1445
                io_utils.copy_dir(model_path,
                                  serving_path,
                                  deny_regex_patterns=[r'saved_model\.pb'])
                saved_model_path = os.path.join(model_path, 'saved_model.pb')
                if fileio.exists(saved_model_path):
                    io_utils.copy_file(
                        saved_model_path,
                        os.path.join(serving_path, 'saved_model.pb'),
                    )
                logging.info('Model written to serving path %s.', serving_path)
        else:
            raise NotImplementedError(
                'Invalid push destination {}'.format(destination_kind))

        # Copy the model to pushing uri for archiving.
        io_utils.copy_dir(model_path, model_push.uri)
        self._MarkPushed(model_push,
                         pushed_destination=serving_path,
                         pushed_version=model_version)
        logging.info('Model pushed to %s.', model_push.uri)
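The copy order in the filesystem branch above matters: everything except saved_model.pb is copied first, then saved_model.pb itself, so a model server never observes a partially copied model. A condensed sketch of just that step; the import paths are assumptions, and model_path/serving_path are placeholders:

import os

from tfx.dsl.io import fileio   # import paths are assumptions
from tfx.utils import io_utils


def push_saved_model(model_path: str, serving_path: str) -> None:
  # Copy everything except saved_model.pb first ...
  io_utils.copy_dir(model_path, serving_path,
                    deny_regex_patterns=[r'saved_model\.pb'])
  # ... then copy saved_model.pb last, since serving treats its presence as the
  # signal that the model is complete.
  saved_model_path = os.path.join(model_path, 'saved_model.pb')
  if fileio.exists(saved_model_path):
    io_utils.copy_file(saved_model_path,
                       os.path.join(serving_path, 'saved_model.pb'))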
Example #24
    def setUp(self):
        super(AirflowEndToEndTest, self).setUp()
        # setup airflow_home in a temp directory, config and init db.
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)

        self._mysql_container_name = 'airflow_' + test_utils.generate_random_id(
        )
        db_port = airflow_test_utils.create_mysql_container(
            self._mysql_container_name)
        self.addCleanup(airflow_test_utils.delete_mysql_container,
                        self._mysql_container_name)
        os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
            'mysql://[email protected]:%d/airflow' % db_port)

        # Set a couple of important environment variables. See
        # https://airflow.apache.org/howto/set-config.html for details.
        os.environ['AIRFLOW__CORE__DAGS_FOLDER'] = os.path.join(
            self._airflow_home, 'dags')
        os.environ['AIRFLOW__CORE__BASE_LOG_FOLDER'] = os.path.join(
            self._airflow_home, 'logs')
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
        # Following environment variables make scheduler process dags faster.
        os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
        os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
        os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'

        # Following fields are specific to the chicago_taxi_simple example.
        self._dag_id = 'chicago_taxi_simple'
        self._run_id = 'manual_run_id_1'
        # This execution date must be after the start_date in chicago_taxi_simple
        # but before current execution date.
        self._execution_date = '2019-02-01T01:01:01'
        self._all_tasks = [
            'CsvExampleGen',
            'Evaluator',
            'ExampleValidator',
            'Pusher',
            'SchemaGen',
            'StatisticsGen',
            'Trainer',
            'Transform',
        ]
        # Copy dag file and data.
        chicago_taxi_pipeline_dir = os.path.dirname(__file__)
        simple_pipeline_file = os.path.join(chicago_taxi_pipeline_dir,
                                            'taxi_pipeline_simple.py')

        io_utils.copy_file(
            simple_pipeline_file,
            os.path.join(self._airflow_home, 'dags',
                         'taxi_pipeline_simple.py'))

        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        # Initialize database.
        subprocess.run(['airflow', 'initdb'], check=True)
        subprocess.run(['airflow', 'unpause', self._dag_id], check=True)
Example #25
def start_cmle_training(input_dict,
                        output_dict,
                        exec_properties,
                        training_inputs):
  """Start a trainer job on CMLE."""
  training_inputs = training_inputs.copy()
  logger = logging_utils.get_logger(exec_properties['log_root'], 'exec')
  # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself
  exec_properties['custom_config'].pop('cmle_training_args')

  json_inputs = types.jsonify_tfx_type_dict(input_dict)
  logger.info('json_inputs=\'%s\'.', json_inputs)
  json_outputs = types.jsonify_tfx_type_dict(output_dict)
  logger.info('json_outputs=\'%s\'.', json_outputs)
  json_exec_properties = json.dumps(exec_properties)
  logger.info('json_exec_properties=\'%s\'.', json_exec_properties)

  # Configure CMLE job
  api_client = discovery.build('ml', 'v1')
  job_args = [
      '--executor', 'Trainer', '--inputs', json_inputs, '--outputs',
      json_outputs, '--exec-properties', json_exec_properties
  ]
  training_inputs['args'] = job_args
  training_inputs['pythonModule'] = 'tfx.scripts.run_executor'

  # Pop project_id so CMLE doesn't complain about an unexpected parameter.
  # It's been a stowaway in cmle_args and has finally reached its destination.
  project = training_inputs.pop('project')
  project_id = 'projects/{}'.format(project)

  if 'packageUris' not in training_inputs:
    # Create TFX dist and add it to training_inputs
    local_package = io_utils.build_package()
    cloud_package = os.path.join(training_inputs['jobDir'],
                                 os.path.basename(local_package))
    io_utils.copy_file(local_package, cloud_package, True)
    training_inputs['packageUris'] = [cloud_package]

  job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
  job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

  # Submit job to CMLE
  logger.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format(
      job_name, project))
  request = api_client.projects().jobs().create(
      body=job_spec, parent=project_id)
  request.execute()

  # Wait for CMLE job to finish
  job_id = '{}/jobs/{}'.format(project_id, job_name)
  request = api_client.projects().jobs().get(name=job_id)
  response = request.execute()
  while response['state'] not in ('SUCCEEDED', 'FAILED'):
    time.sleep(_POLLING_INTERVAL_IN_SECONDS)
    response = request.execute()

  if response['state'] == 'FAILED':
    err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
        job_name, response)
    logger.error(err_msg)
    raise RuntimeError(err_msg)

  # CMLE training complete
  logger.info('Job \'{}\' successful.'.format(job_name))
Example #26
    def setUp(self):
        super(CliAirflowEndToEndTest, self).setUp()

        # List of packages installed.
        self._pip_list = str(
            subprocess.check_output(['pip', 'freeze', '--local']))

        # Check if Apache Airflow is installed before running E2E tests.
        if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
            sys.exit('Apache Airflow not installed.')

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup airflow_home in a temp directory
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName, 'airflow')
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Set a couple of important environment variables. See
        # https://airflow.apache.org/howto/set-config.html for details.
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
        # Following environment variables make scheduler process dags faster.
        os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
        os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '0'
        os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'
        os.environ['AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL'] = '0'
        # Using more than one thread results in a warning for sqlite backend.
        # See https://github.com/tensorflow/tfx/issues/141
        os.environ['AIRFLOW__SCHEDULER__MAX_THREADS'] = '1'

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline', '')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = tf.io.gfile.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert tf.io.gfile.isdir(target_data_dir)
        content = tf.io.gfile.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        # Initialize database.
        _ = subprocess.check_output(['airflow', 'initdb'])

        # Start airflow scheduler.
        self._out = open(os.path.join(self._airflow_home, 'out.txt'), 'w+')
        self._err = open(os.path.join(self._airflow_home, 'err.txt'), 'w+')
        self._scheduler = subprocess.Popen(['airflow', 'scheduler'])

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
Example #27
    def setUp(self):
        super(AirflowEndToEndTest, self).setUp()
        # setup airflow_home in a temp directory, config and init db.
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)
        # Set a couple of important environment variables. See
        # https://airflow.apache.org/howto/set-config.html for details.
        os.environ['AIRFLOW__CORE__AIRFLOW_HOME'] = self._airflow_home
        os.environ['AIRFLOW__CORE__DAGS_FOLDER'] = os.path.join(
            self._airflow_home, 'dags')
        os.environ['AIRFLOW__CORE__BASE_LOG_FOLDER'] = os.path.join(
            self._airflow_home, 'logs')
        os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
            'sqlite:///%s/airflow.db' % self._airflow_home)
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
        # Following environment variables make scheduler process dags faster.
        os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
        os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
        os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
        os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'
        # Using more than one thread results in a warning for sqlite backend.
        # See https://github.com/tensorflow/tfx/issues/141
        os.environ['AIRFLOW__SCHEDULER__MAX_THREADS'] = '1'

        # Following fields are specific to the chicago_taxi_simple example.
        self._dag_id = 'chicago_taxi_simple'
        self._run_id = 'manual_run_id_1'
        # This execution date must be after the start_date in chicago_taxi_simple
        # but before current execution date.
        self._execution_date = '2019-02-01T01:01:01+01:01'
        self._all_tasks = [
            'CsvExampleGen',
            'Evaluator',
            'ExampleValidator',
            'ModelValidator',
            'Pusher',
            'SchemaGen',
            'StatisticsGen',
            'Trainer',
            'Transform',
        ]
        # Copy dag file and data.
        chicago_taxi_pipeline_dir = os.path.dirname(__file__)
        simple_pipeline_file = os.path.join(chicago_taxi_pipeline_dir,
                                            'taxi_pipeline_simple.py')

        io_utils.copy_file(
            simple_pipeline_file,
            os.path.join(self._airflow_home, 'dags',
                         'taxi_pipeline_simple.py'))

        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = tf.io.gfile.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert tf.io.gfile.isdir(target_data_dir)
        content = tf.io.gfile.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        # Initialize database.
        _ = subprocess.check_output(['airflow', 'initdb'])
        _ = subprocess.check_output(['airflow', 'unpause', self._dag_id])
Example #28
def start_cmle_training(input_dict: Dict[Text, List[types.TfxArtifact]],
                        output_dict: Dict[Text, List[types.TfxArtifact]],
                        exec_properties: Dict[Text, Any],
                        executor_class_path: Text,
                        training_inputs: Dict[Text, Any]):
    """Start a trainer job on CMLE.

    This is done by forwarding the inputs/outputs/exec_properties to the
    tfx.scripts.run_executor module on a CMLE training job interpreter.

    Args:
      input_dict: Passthrough input dict for tfx.components.Trainer.executor.
      output_dict: Passthrough input dict for tfx.components.Trainer.executor.
      exec_properties: Passthrough input dict for tfx.components.Trainer.executor.
      executor_class_path: class path for TFX core default trainer.
      training_inputs: Training input for CMLE training job. 'pythonModule',
        'pythonVersion' and 'runtimeVersion' will be inferred by the runner. For
        the full set of parameters supported, refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Returns:
      None

    Raises:
      RuntimeError: if the Google Cloud AI Platform training job failed.
    """
    training_inputs = training_inputs.copy()
    # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself
    for gaip_training_key in ['cmle_training_args', 'gaip_training_args']:
        if gaip_training_key in exec_properties.get('custom_config'):
            exec_properties['custom_config'].pop(gaip_training_key)

    json_inputs = types.jsonify_tfx_type_dict(input_dict)
    tf.logging.info('json_inputs=\'%s\'.', json_inputs)
    json_outputs = types.jsonify_tfx_type_dict(output_dict)
    tf.logging.info('json_outputs=\'%s\'.', json_outputs)
    json_exec_properties = json.dumps(exec_properties)
    tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

    # Configure CMLE job
    api_client = discovery.build('ml', 'v1')
    job_args = [
        '--executor_class_path', executor_class_path, '--inputs', json_inputs,
        '--outputs', json_outputs, '--exec-properties', json_exec_properties
    ]
    training_inputs['args'] = job_args
    training_inputs['pythonModule'] = 'tfx.scripts.run_executor'
    training_inputs['pythonVersion'] = _get_caip_python_version()
    # runtimeVersion should be same as <major>.<minor> of currently
    # installed tensorflow version.
    training_inputs['runtimeVersion'] = _get_tf_runtime_version()

    # Pop project_id so CMLE doesn't complain about an unexpected parameter.
    # It's been a stowaway in cmle_args and has finally reached its destination.
    project = training_inputs.pop('project')
    project_id = 'projects/{}'.format(project)

    package_uris = training_inputs.get('packageUris', [])
    if package_uris:
        tf.logging.info('Following packageUris \'%s\' are provided by user.',
                        package_uris)
    else:
        local_package = deps_utils.build_ephemeral_package()
        # TODO(b/125451545): Use a safe temp dir instead of jobDir.
        cloud_package = os.path.join(training_inputs['jobDir'],
                                     os.path.basename(local_package))
        io_utils.copy_file(local_package, cloud_package, True)
        training_inputs['packageUris'] = [cloud_package]
        tf.logging.info('Package %s will be used',
                        training_inputs['packageUris'])

    job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

    # Submit job to CMLE
    tf.logging.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format(
        job_name, project))
    request = api_client.projects().jobs().create(body=job_spec,
                                                  parent=project_id)
    request.execute()

    # Wait for CMLE job to finish
    job_id = '{}/jobs/{}'.format(project_id, job_name)
    request = api_client.projects().jobs().get(name=job_id)
    response = request.execute()
    while response['state'] not in ('SUCCEEDED', 'FAILED'):
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        response = request.execute()

    if response['state'] == 'FAILED':
        err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
            job_name, response)
        tf.logging.error(err_msg)
        raise RuntimeError(err_msg)

    # CMLE training complete
    tf.logging.info('Job \'{}\' successful.'.format(job_name))
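Both CMLE runners above stage a locally built package into the job directory with the same overwrite-enabled copy before submitting the job. A reduced sketch of that staging step; the bucket, file names, and import path are placeholders/assumptions:

import os

from tfx.utils import io_utils  # import path is an assumption

local_package = '/tmp/dist/tfx_ephemeral-0.1.tar.gz'  # placeholder local build
job_dir = 'gs://my-bucket/jobs/tfx_example'           # placeholder jobDir

cloud_package = os.path.join(job_dir, os.path.basename(local_package))
io_utils.copy_file(local_package, cloud_package, True)  # True == overwrite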
Example #29
def start_cmle_training(input_dict, output_dict, exec_properties,
                        training_inputs):
    """Start a trainer job on CMLE."""
    training_inputs = training_inputs.copy()
    # TODO(khaas): This file goes away when cl/236428692 lands
    # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself
    exec_properties['custom_config'].pop('cmle_training_args')

    json_inputs = types.jsonify_tfx_type_dict(input_dict)
    tf.logging.info('json_inputs=\'%s\'.', json_inputs)
    json_outputs = types.jsonify_tfx_type_dict(output_dict)
    tf.logging.info('json_outputs=\'%s\'.', json_outputs)
    json_exec_properties = json.dumps(exec_properties)
    tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

    # Configure CMLE job
    api_client = discovery.build('ml', 'v1')
    job_args = [
        '--executor', 'Trainer', '--inputs', json_inputs, '--outputs',
        json_outputs, '--exec-properties', json_exec_properties
    ]
    training_inputs['args'] = job_args
    training_inputs['pythonModule'] = 'tfx.scripts.run_executor'

    # Pop project_id so CMLE doesn't complain about an unexpected parameter.
    # It's been a stowaway in cmle_args and has finally reached its destination.
    project = training_inputs.pop('project')
    project_id = 'projects/{}'.format(project)

    if 'packageUris' not in training_inputs:
        # Create TFX dist and add it to training_inputs
        local_package = io_utils.build_package()
        cloud_package = os.path.join(training_inputs['jobDir'],
                                     os.path.basename(local_package))
        io_utils.copy_file(local_package, cloud_package, True)
        training_inputs['packageUris'] = [cloud_package]

    job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

    # Submit job to CMLE
    tf.logging.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format(
        job_name, project))
    request = api_client.projects().jobs().create(body=job_spec,
                                                  parent=project_id)
    request.execute()

    # Wait for CMLE job to finish
    job_id = '{}/jobs/{}'.format(project_id, job_name)
    request = api_client.projects().jobs().get(name=job_id)
    response = request.execute()
    while response['state'] not in ('SUCCEEDED', 'FAILED'):
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        response = request.execute()

    if response['state'] == 'FAILED':
        err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
            job_name, response)
        tf.logging.error(err_msg)
        raise RuntimeError(err_msg)

    # CMLE training complete
    tf.logging.info('Job \'{}\' successful.'.format(job_name))
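For context, here is a hedged sketch of how a caller might wire up the arguments this function expects. Only the key names match the code above ('custom_config', 'cmle_training_args', 'project', 'jobDir') plus standard Cloud ML Engine TrainingInput fields ('region', 'scaleTier'); the project id, bucket, and the empty input/output dicts are made-up placeholders, not a working configuration.

# Hypothetical wiring; values are placeholders.
exec_properties = {
    'custom_config': {
        'cmle_training_args': {
            'project': 'my-gcp-project',         # popped by start_cmle_training
            'region': 'us-central1',             # TrainingInput field
            'scaleTier': 'BASIC',                # TrainingInput field
            'jobDir': 'gs://my-bucket/job-dir',  # used for the package upload
        },
    },
}
training_inputs = exec_properties['custom_config']['cmle_training_args']

start_cmle_training(input_dict={}, output_dict={},
                    exec_properties=exec_properties,
                    training_inputs=training_inputs)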
Ejemplo n.º 30
0
    def setUp(self):
        super(CliAirflowEndToEndTest, self).setUp()

        # List of packages installed.
        self._pip_list = pip_utils.get_package_names()

        # Check if Apache Airflow is installed before running E2E tests.
        if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
            sys.exit('Apache Airflow not installed.')

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Setup airflow_home in a temp directory
        self._airflow_home = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName, 'airflow')
        self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
        os.environ['AIRFLOW_HOME'] = self._airflow_home
        self._old_home = os.environ.get('HOME')
        os.environ['HOME'] = self._airflow_home
        absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                          self._airflow_home)

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        self._pipeline_name = 'chicago_taxi_simple'
        self._pipeline_path = os.path.join(self._testdata_dir,
                                           'test_pipeline_airflow_1.py')

        # Copy data.
        chicago_taxi_pipeline_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(
                    os.path.dirname(os.path.dirname(
                        os.path.abspath(__file__))))), 'examples',
            'chicago_taxi_pipeline')
        data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
        content = fileio.listdir(data_dir)
        assert content, 'content in {} is empty'.format(data_dir)
        target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data',
                                       'simple')
        io_utils.copy_dir(data_dir, target_data_dir)
        assert fileio.isdir(target_data_dir)
        content = fileio.listdir(target_data_dir)
        assert content, 'content in {} is {}'.format(target_data_dir, content)
        io_utils.copy_file(
            os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
            os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

        self._mysql_container_name = (
            'airflow_' + test_utils.generate_random_id())
        db_port = airflow_test_utils.create_mysql_container(
            self._mysql_container_name)
        self.addCleanup(self._cleanup_mysql_container)
        os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
            'mysql://[email protected]:%d/airflow' % db_port)
        # Do not load examples to make this a bit faster.
        os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

        self._airflow_initdb()

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()
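setUp above calls two helpers that are not shown in this example: self._airflow_initdb() and self._cleanup_mysql_container. Below is a minimal sketch of what they could look like, assuming an Airflow 1.10-style CLI and a Docker-run MySQL container; the subprocess-based approach is an assumption, not the project's actual implementation, and it assumes import subprocess at module scope.

    # Hedged sketches of the helpers referenced in setUp; real implementations
    # may differ. Requires: import subprocess
    def _airflow_initdb(self):
        # Airflow 1.10-style CLI: creates the metadata tables in the database
        # configured via AIRFLOW__CORE__SQL_ALCHEMY_CONN.
        subprocess.check_call(['airflow', 'initdb'])

    def _cleanup_mysql_container(self):
        # Assumes the MySQL instance was started as a Docker container with
        # the name generated in setUp; force-remove it after the test.
        subprocess.check_call(
            ['docker', 'rm', '-f', self._mysql_container_name])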
Ejemplo n.º 31
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]):
        """Overrides the tfx_pusher_executor.

        Args:
          input_dict: Input dict from input key to a list of artifacts,
            including:
            - model_export: exported model from trainer.
            - model_blessing: model blessing path from evaluator.
          output_dict: Output dict from key to a list of artifacts, including:
            - model_push: A list of 'ModelPushPath' artifact of size one. It
              will include the model in this push execution if the model was
              pushed.
          exec_properties: Mostly a passthrough input dict for
            tfx.components.Pusher.executor.custom_config.

        Raises:
          ValueError: if custom_config is not present, is not a dict, or is
            missing 'cortex_serving_args'.
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        # check model blessing
        model_push = artifact_utils.get_single_instance(
            output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY])
        if not self.CheckBlessing(input_dict):
            self._MarkNotPushed(model_push)
            return

        model_export = artifact_utils.get_single_instance(
            input_dict[tfx_pusher_executor.MODEL_KEY])

        custom_config = json_utils.loads(
            exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
        if custom_config is None or not isinstance(custom_config, Dict):
            raise ValueError(
                'custom_config in execution properties needs to be a '
                'dict.')

        cortex_serving_args = custom_config.get(SERVING_ARGS_KEY)
        if not cortex_serving_args:
            raise ValueError(
                '\'cortex_serving_args\' is missing in \'custom_config\'')

        # Deploy the model.
        io_utils.copy_dir(src=path_utils.serving_model_path(model_export.uri),
                          dst=model_push.uri)
        model_path = model_push.uri

        # Cortex implementation starts here
        # pop the env and initialize client
        cx = cortex.client(cortex_serving_args.pop('env'))

        # load the predictor
        predictor_path = cortex_serving_args.pop('predictor_path')
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            temp_project_dir = tmp_dir_name

            # predictor
            p_dump_path = os.path.join(temp_project_dir, 'predictor.py')
            io_utils.copy_file(predictor_path, p_dump_path)

            # requirements.txt
            reqs = cortex_serving_args.pop('requirements', [])
            if reqs:
                r_dump_path = os.path.join(temp_project_dir,
                                           'requirements.txt')
                io_utils.write_string_file(r_dump_path, '\n'.join(reqs))

            # conda-packages.txt
            c_reqs = cortex_serving_args.pop('conda_packages', [])
            if c_reqs:
                r_dump_path = os.path.join(temp_project_dir,
                                           'conda-packages.txt')
                io_utils.write_string_file(r_dump_path, '\n'.join(c_reqs))

            # edit the api_config
            api_config = cortex_serving_args.pop('api_config')
            if 'config' not in api_config['predictor']:
                api_config['predictor']['config'] = {}
            api_config['predictor']['config']['model_artifact'] = model_path

            # launch the api
            api_config['predictor']['path'] = 'predictor.py'

            # configure the model path
            if 'models' not in api_config['predictor']:
                api_config['predictor']['models'] = {}
            api_config['predictor']['models'].update({'path': model_path})
            cx.create_api(api_config,
                          project_dir=temp_project_dir,
                          **cortex_serving_args)

        self._MarkPushed(model_push, pushed_destination=model_path)
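To see what this executor expects in custom_config, here is a hedged sketch of a configuration. Only the key names come from the code above ('env', 'predictor_path', 'requirements', 'conda_packages', 'api_config', with any remaining keys forwarded to cx.create_api); the values, API name, and predictor type are illustrative and follow the general shape of a Cortex RealtimeAPI spec rather than any verified configuration.

# Illustrative only; key names mirror what the executor pops above.
custom_config = {
    SERVING_ARGS_KEY: {
        'env': 'aws',                               # Cortex environment name
        'predictor_path': '/path/to/predictor.py',  # copied as predictor.py
        'requirements': ['tensorflow==2.4.1'],      # written to requirements.txt
        'conda_packages': [],                       # written to conda-packages.txt
        'api_config': {
            'name': 'my-model-api',
            'kind': 'RealtimeAPI',
            'predictor': {'type': 'tensorflow'},
        },
        # Anything left in this dict is passed through to cx.create_api(**...).
    },
}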
Ejemplo n.º 32
0
    def _prepare_data(self):
        io_utils.copy_file(
            'data/data.csv',
            f'gs://{self._BUCKET_NAME}/{self._DATA_DIRECTORY_NAME}/'
            f'{self._pipeline_name}/data.csv')
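A natural follow-up in the same test class would be to assert that the copy actually landed in the bucket. The sketch below is hypothetical: it reuses the attribute names from _prepare_data and assumes fileio (used elsewhere in these examples) can resolve gs:// paths in the test environment.

    def _assert_data_uploaded(self):
        # Hedged sketch; mirrors the destination built in _prepare_data.
        self.assertTrue(
            fileio.exists(
                f'gs://{self._BUCKET_NAME}/{self._DATA_DIRECTORY_NAME}/'
                f'{self._pipeline_name}/data.csv'))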