def testUpdatePipeline(self):
  # First create pipeline with test_pipeline.py
  pipeline_path_1 = os.path.join(self.chicago_taxi_pipeline_dir,
                                 'test_pipeline_airflow_1.py')
  flags_dict_1 = {
      labels.ENGINE_FLAG: self.engine,
      labels.PIPELINE_DSL_PATH: pipeline_path_1
  }
  handler = airflow_handler.AirflowHandler(flags_dict_1)
  handler.create_pipeline()

  # Update test_pipeline and run update_pipeline
  pipeline_path_2 = os.path.join(self._tmp_dir, 'test_pipeline_airflow_2.py')
  io_utils.copy_file(pipeline_path_1, pipeline_path_2)
  flags_dict_2 = {
      labels.ENGINE_FLAG: self.engine,
      labels.PIPELINE_DSL_PATH: pipeline_path_2
  }
  handler = airflow_handler.AirflowHandler(flags_dict_2)
  handler.update_pipeline()

  handler_pipeline_path = os.path.join(
      handler._handler_home_dir, self.pipeline_args[labels.PIPELINE_NAME], '')
  self.assertTrue(
      tf.io.gfile.exists(
          os.path.join(handler_pipeline_path, 'test_pipeline_airflow_2.py')))
  self.assertTrue(
      tf.io.gfile.exists(
          os.path.join(handler_pipeline_path, 'pipeline_args.json')))
def _save_pipeline(self, pipeline_args: Dict[str, Any]) -> None:
  """Creates/updates pipeline folder in the handler directory.

  Args:
    pipeline_args: Pipeline details obtained from DSL.
  """
  pipeline_name = pipeline_args[labels.PIPELINE_NAME]
  handler_pipeline_path = self._get_pipeline_info_path(pipeline_name)

  # If updating pipeline, first delete pipeline directory.
  if fileio.exists(handler_pipeline_path):
    io_utils.delete_dir(handler_pipeline_path)

  # Dump pipeline_args to handler pipeline folder as json.
  fileio.makedirs(handler_pipeline_path)
  with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
            'w') as f:
    json.dump(pipeline_args, f)

  # Copy dsl to pipeline folder
  pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
  io_utils.copy_file(
      pipeline_dsl_path,
      os.path.join(handler_pipeline_path,
                   os.path.basename(pipeline_dsl_path)))
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """ImportSchemaGen executor entrypoint.

  This generates a Schema artifact from the given schema_file.

  Args:
    input_dict: Should be empty.
    output_dict: Output dict from key to a list of artifacts, including:
      - schema: A list of 'Schema' artifact of size one.
    exec_properties: A dict of execution properties, including:
      - schema_file: Source schema file path.

  Returns:
    None
  """
  source_file_path = exec_properties.get(
      standard_component_specs.SCHEMA_FILE_KEY)
  if not source_file_path:
    raise ValueError('Schema file path is missing in exec_properties.')
  output_uri = os.path.join(
      artifact_utils.get_single_uri(
          output_dict[standard_component_specs.SCHEMA_KEY]),
      schema_gen_executor.DEFAULT_FILE_NAME)

  # Check whether the input file has a proper schema proto.
  _ = io_utils.SchemaReader().read(source_file_path)

  io_utils.copy_file(source_file_path, output_uri)
  logging.info('Copied a schema file from %s to %s.', source_file_path,
               output_uri)
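# A minimal test-style sketch (not from the source) of how an executor like the
# one above might be exercised. The key names 'schema' and 'schema_file' follow
# the standard_component_specs constants used in Do(); the helper name and the
# temp-dir layout are assumptions for illustration only.
import os
import tempfile

from tfx.types import standard_artifacts


def run_import_schema_sketch(executor, schema_file_path):
  """Runs the executor once and returns the path of the copied schema."""
  output_dir = tempfile.mkdtemp()
  schema_artifact = standard_artifacts.Schema()
  schema_artifact.uri = output_dir
  executor.Do(
      input_dict={},
      output_dict={'schema': [schema_artifact]},
      exec_properties={'schema_file': schema_file_path})
  return os.path.join(output_dir, 'schema.pbtxt')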
def copy_template(flags_dict: Dict[Text, Any]) -> None:
  """Copy template flags_dict["model"] to flags_dict["dest_dir"].

  Copies all *.py and README files in the specified template, and replaces
  placeholders in the content of the files.

  Args:
    flags_dict: Should have pipeline_name, model and dest_dir.
  """
  pipeline_name = _sanitize_pipeline_name(flags_dict[labels.PIPELINE_NAME])
  template_dir = os.path.join(_templates_src_dir(), flags_dict[labels.MODEL])
  destination_dir = flags_dict[labels.DESTINATION_PATH]
  if not os.path.isdir(template_dir):
    raise ValueError('Model {} does not exist.'.format(
        flags_dict[labels.MODEL]))

  replace_dict = {
      _IMPORT_FROM_PACKAGE: _IMPORT_FROM_LOCAL_DIR,
      _PLACEHOLDER_PIPELINE_NAME: pipeline_name,
      _INTERNAL_TODO_PREFIX: '',
  }
  _copy_and_replace_placeholder_dir(template_dir, destination_dir,
                                    replace_dict)
  for additional_file in _ADDITIONAL_FILE_PATHS[flags_dict[labels.MODEL]]:
    src_path = os.path.join(_tfx_src_dir(), additional_file.src)
    dst_path = os.path.join(destination_dir, additional_file.dst)
    io_utils.copy_file(src_path, dst_path)
def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
  """Creates/updates pipeline folder in the handler directory.

  Args:
    pipeline_args: Pipeline details obtained from DSL.
  """
  # Path to pipeline folder in Airflow.
  handler_pipeline_path = os.path.join(
      self._handler_home_dir, pipeline_args[labels.PIPELINE_NAME], '')

  # If updating pipeline, first delete pipeline directory.
  if tf.io.gfile.exists(handler_pipeline_path):
    io_utils.delete_dir(handler_pipeline_path)

  # Dump pipeline_args to handler pipeline folder as json.
  tf.io.gfile.makedirs(handler_pipeline_path)
  with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
            'w') as f:
    json.dump(pipeline_args, f)

  # Copy dsl to pipeline folder
  pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
  io_utils.copy_file(
      pipeline_dsl_path,
      os.path.join(handler_pipeline_path,
                   os.path.basename(pipeline_dsl_path)))
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  tf.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                  self._airflow_home)

  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
  # Following environment variables make scheduler process dags faster.
  os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
  os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
  os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'
  # Using more than one thread results in a warning for sqlite backend.
  # See https://github.com/tensorflow/tfx/issues/141
  os.environ['AIRFLOW__SCHEDULER__MAX_THREADS'] = '1'

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.gfile.ListDirectory(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.gfile.IsDirectory(target_data_dir)
  content = tf.gfile.ListDirectory(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize database.
  _ = subprocess.check_output(['airflow', 'initdb'])

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def _read_schema_from_pipeline_root(self, pipeline_name, pipeline_root):
  # Check if pipeline root created. If not, it means that the user has not
  # created a run yet or the pipeline is still running for the first time.
  if not fileio.exists(pipeline_root):
    sys.exit(
        'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
    )

  # If pipeline_root exists, then check if SchemaGen output exists.
  components = fileio.listdir(pipeline_root)
  if 'SchemaGen' not in components:
    sys.exit(
        'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
    )

  # Get the latest SchemaGen output.
  component_output_dir = os.path.join(pipeline_root, 'SchemaGen')
  schema_dir = os.path.join(component_output_dir, 'schema')
  schemagen_outputs = fileio.listdir(schema_dir)
  latest_schema_folder = max(schemagen_outputs, key=int)

  # Copy schema to current dir.
  latest_schema_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
      component_output_dir, 'schema', int(latest_schema_folder))
  latest_schema_path = os.path.join(latest_schema_uri, 'schema.pbtxt')
  curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt')
  io_utils.copy_file(latest_schema_path, curr_dir_path, overwrite=True)

  # Print schema and path to schema
  click.echo('Path to schema: {}'.format(curr_dir_path))
  click.echo('*********SCHEMA FOR {}**********'.format(pipeline_name.upper()))
  with open(curr_dir_path, 'r') as f:
    click.echo(f.read())
def _save_pipeline(self, pipeline_args) -> None:
  """Creates/updates pipeline folder in the handler directory."""
  # Path to pipeline folder in airflow.
  handler_pipeline_path = self._get_handler_pipeline_path(
      pipeline_args[labels.PIPELINE_NAME])

  # If updating pipeline, first delete pipeline directory.
  if tf.io.gfile.exists(handler_pipeline_path):
    io_utils.delete_dir(handler_pipeline_path)

  # Dump pipeline_args to handler pipeline folder as json.
  tf.io.gfile.makedirs(handler_pipeline_path)
  with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
            'w') as f:
    json.dump(pipeline_args, f)

  # Copy dsl to pipeline folder
  io_utils.copy_file(
      self.flags_dict[labels.PIPELINE_DSL_PATH],
      os.path.join(
          handler_pipeline_path,
          os.path.basename(self.flags_dict[labels.PIPELINE_DSL_PATH])))
def upload_pipeline(self, pipeline_package_path, pipeline_name):  # pylint: disable=invalid-name, unused-argument
  io_utils.copy_file(
      pipeline_package_path,
      os.path.join(self._output_dir,
                   os.path.basename(pipeline_package_path)),
      overwrite=True)
  return _MockUploadResponse(self.config)
def testCopyFile(self):
  file_path = os.path.join(self._base_dir, 'temp_file')
  io_utils.write_string_file(file_path, 'testing')
  copy_path = os.path.join(self._base_dir, 'copy_file')
  io_utils.copy_file(file_path, copy_path)
  self.assertTrue(file_io.file_exists(copy_path))
  # Verify the content of the copied file, not the source file.
  f = file_io.FileIO(copy_path, mode='r')
  self.assertEqual('testing', f.read())
  self.assertEqual(7, f.tell())
def testCopyFile(self):
  self.createFiles({'file1.txt': 'testing'})
  io_utils.copy_file(self.relpath('file1.txt'), self.relpath('file2.txt'))
  self.assertDirectoryEqual(self._base_dir, {
      'file1.txt': 'testing',
      'file2.txt': 'testing'
  })
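# For context, a minimal sketch (an assumption, not the actual io_utils
# implementation) of the copy semantics the two tests above rely on, built on
# tf.io.gfile: create the destination directory if needed, then copy the file.
import os

import tensorflow as tf


def copy_file_sketch(src: str, dst: str, overwrite: bool = False) -> None:
  """Copies a single file, creating the destination directory if missing."""
  dst_dir = os.path.dirname(dst)
  if dst_dir and not tf.io.gfile.exists(dst_dir):
    tf.io.gfile.makedirs(dst_dir)
  tf.io.gfile.copy(src, dst, overwrite=overwrite)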
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # List of packages installed.
  self._pip_list = str(subprocess.check_output(['pip', 'freeze', '--local']))

  # Check if Apache Airflow is installed before running E2E tests.
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName, 'airflow')
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.io.gfile.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.io.gfile.isdir(target_data_dir)
  content = tf.io.gfile.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  self._airflow_initdb()

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def setUp(self):
  super().setUp()

  # List of packages installed.
  self._pip_list = pip_utils.get_package_names()

  # Check if Apache Airflow is installed before running E2E tests.
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(self.tmp_dir, 'airflow')
  self.enter_context(
      test_case_utils.override_env_var('AIRFLOW_HOME', self._airflow_home))
  self.enter_context(
      test_case_utils.override_env_var('HOME', self._airflow_home))
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  self._pipeline_name = 'chicago_taxi_simple'
  self._pipeline_path = os.path.join(self._testdata_dir,
                                     'test_pipeline_airflow_1.py')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Copy the input_data to the output_data.

  For this example that is all that the Executor does. For a different
  custom component, this is where the real functionality of the component
  would be included.

  This component both reads and writes Examples, but a different component
  might read and write artifacts of other types.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - input_data: A list of type `standard_artifacts.Examples` which will
        often contain two splits, 'train' and 'eval'.
    output_dict: Output dict from key to a list of artifacts, including:
      - output_data: A list of type `standard_artifacts.Examples` which will
        usually contain the same splits as input_data.
    exec_properties: A dict of execution properties, including:
      - name: Optional unique name. Necessary iff multiple Hello components
        are declared in the same pipeline.

  Returns:
    None

  Raises:
    OSError and its subclasses
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  input_artifact = artifact_utils.get_single_instance(
      input_dict['input_data'])
  output_artifact = artifact_utils.get_single_instance(
      output_dict['output_data'])
  output_artifact.split_names = input_artifact.split_names

  split_to_instance = {}
  for split in json.loads(input_artifact.split_names):
    uri = artifact_utils.get_split_uri([input_artifact], split)
    split_to_instance[split] = uri

  for split, instance in split_to_instance.items():
    input_dir = instance
    output_dir = artifact_utils.get_split_uri([output_artifact], split)
    for filename in tf.io.gfile.listdir(input_dir):
      input_uri = os.path.join(input_dir, filename)
      output_uri = os.path.join(output_dir, filename)
      io_utils.copy_file(src=input_uri, dst=output_uri, overwrite=True)
def get_schema(self):
  pipeline_name = self.flags_dict[labels.PIPELINE_NAME]

  # Check if pipeline exists.
  self._check_pipeline_existence(pipeline_name)

  # Path to pipeline args.
  pipeline_args_path = os.path.join(self._handler_home_dir,
                                    self.flags_dict[labels.PIPELINE_NAME],
                                    'pipeline_args.json')

  # Get pipeline_root.
  with open(pipeline_args_path, 'r') as f:
    pipeline_args = json.load(f)

  # Check if pipeline root created. If not, it means that the user has not
  # created a run yet or the pipeline is still running for the first time.
  pipeline_root = pipeline_args[labels.PIPELINE_ROOT]
  if not tf.io.gfile.exists(pipeline_root):
    sys.exit(
        'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
    )

  # If pipeline_root exists, then check if SchemaGen output exists.
  components = tf.io.gfile.listdir(pipeline_root)
  if 'SchemaGen' not in components:
    sys.exit(
        'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
    )

  # Get the latest SchemaGen output.
  component_output_dir = os.path.join(pipeline_root, 'SchemaGen')
  schema1_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
      component_output_dir, 'schema', 1)
  schema_dir = os.path.join(os.path.dirname(schema1_uri), '')
  schemagen_outputs = tf.io.gfile.listdir(schema_dir)
  latest_schema_folder = max(schemagen_outputs, key=int)

  # Copy schema to current dir.
  latest_schema_uri = base_driver._generate_output_uri(  # pylint: disable=protected-access
      component_output_dir, 'schema', latest_schema_folder)
  latest_schema_path = os.path.join(latest_schema_uri, 'schema.pbtxt')
  curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt')
  io_utils.copy_file(latest_schema_path, curr_dir_path, overwrite=True)

  # Print schema and path to schema
  click.echo('Path to schema: {}'.format(curr_dir_path))
  click.echo('*********SCHEMA FOR {}**********'.format(pipeline_name.upper()))
  with open(curr_dir_path, 'r') as f:
    click.echo(f.read())
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  tf.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                  self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.gfile.ListDirectory(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.gfile.IsDirectory(target_data_dir)
  content = tf.gfile.ListDirectory(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize database.
  _ = subprocess.check_output(['airflow', 'initdb'])

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()

  # Start scheduler.
  self._scheduler = subprocess.Popen(['airflow', 'scheduler'])
def setUpClass(cls):
  super(ExecutorTest, cls).setUpClass()
  source_example_dir = os.path.join(cls._SOURCE_DATA_DIR, 'csv_example_gen')

  io_utils.copy_dir(source_example_dir, cls._ARTIFACT1_URI)
  io_utils.copy_dir(source_example_dir, cls._ARTIFACT2_URI)

  # Duplicate the number of train and eval records such that
  # second artifact has twice as many as first.
  artifact2_pattern = os.path.join(cls._ARTIFACT2_URI, '*', '*')
  artifact2_files = tf.io.gfile.glob(artifact2_pattern)
  for filepath in artifact2_files:
    directory, filename = os.path.split(filepath)
    io_utils.copy_file(filepath, os.path.join(directory, 'dup_' + filename))
def setUp(self):
  super(CliBeamEndToEndTest, self).setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup beam_home in a temp directory
  self._home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._home
  self._old_beam_home = os.environ.get('BEAM_HOME')
  os.environ['BEAM_HOME'] = os.path.join(self._home, 'beam', '')
  self._beam_home = os.environ['BEAM_HOME']

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.io.gfile.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.io.gfile.isdir(target_data_dir)
  content = tf.io.gfile.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._home, 'taxi', 'taxi_utils.py'))

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def setUp(self):
  super().setUp()

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup beam_home in a temp directory
  self._home = self.tmp_dir
  self._beam_home = os.path.join(self._home, 'beam')
  self.enter_context(
      test_case_utils.override_env_var('BEAM_HOME', self._beam_home))
  self.enter_context(test_case_utils.override_env_var('HOME', self._home))

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._home, 'taxi', 'taxi_utils.py'))

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def setUp(self):
  super().setUp()

  penguin_examples_dir = os.path.join(self._REPO_BASE, 'tfx', 'examples',
                                      'penguin')
  # The location of the penguin test data and schema. The input files are
  # copied to a test-local location for each invocation, and cleaned up at
  # the end of test.
  penguin_test_data_root = os.path.join(penguin_examples_dir, 'data')
  penguin_test_schema_file = os.path.join(penguin_examples_dir, 'schema',
                                          'user_provided', 'schema.pbtxt')

  # The location of the user module for penguin. Will be packaged and copied
  # to under the pipeline root before pipeline execution.
  self._penguin_dependency_file = os.path.join(
      penguin_examples_dir, 'penguin_utils_cloud_tuner.py')

  self._penguin_data_root = os.path.join(self._testdata_root, 'data')
  io_utils.copy_dir(penguin_test_data_root, self._penguin_data_root)

  self._penguin_schema_file = os.path.join(self._testdata_root,
                                           'schema.pbtxt')
  io_utils.copy_file(penguin_test_schema_file, self._penguin_schema_file)
def copy_over(input_artifact, output_artifact, splits_to_copy):
  """Copy data from the specified splits.

  Args:
    input_artifact: location where the input splits are.
    output_artifact: location where to copy them.
    splits_to_copy: list of split names to copy.

  Returns:
    None
  """
  split_to_instance = {}

  for split in splits_to_copy:
    uri = artifact_utils.get_split_uri(input_artifact, split)
    split_to_instance[split] = uri

  for split, instance in split_to_instance.items():
    input_dir = instance
    output_dir = artifact_utils.get_split_uri([output_artifact], split)
    for filename in tf.io.gfile.listdir(input_dir):
      input_uri = os.path.join(input_dir, filename)
      output_uri = os.path.join(output_dir, filename)
      io_utils.copy_file(src=input_uri, dst=output_uri, overwrite=True)
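# A short usage sketch for copy_over; the artifact URIs and split names below
# are hypothetical, and the input split directories are assumed to already
# exist and contain files before the call.
from tfx.types import standard_artifacts

input_examples = standard_artifacts.Examples()
input_examples.uri = '/tmp/copy_over_demo/input'
input_examples.split_names = '["train", "eval"]'

output_examples = standard_artifacts.Examples()
output_examples.uri = '/tmp/copy_over_demo/output'
output_examples.split_names = input_examples.split_names

# Note that copy_over expects the *input* artifact wrapped in a list (matching
# the artifact_utils.get_split_uri call above) but a bare output artifact.
copy_over([input_examples], output_examples, ['train', 'eval'])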
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Push model to target directory if blessed.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model: exported model from trainer.
      - model_blessing: model blessing path from model_validator. A push
        action delivers the model exports produced by Trainer to the
        destination defined in component config.
    output_dict: Output dict from key to a list of artifacts, including:
      - pushed_model: A list of 'ModelPushPath' artifact of size one. It will
        include the model in this push execution if the model was pushed.
    exec_properties: A dict of execution properties, including:
      - push_destination: JSON string of pusher_pb2.PushDestination instance,
        providing instruction of destination to push model.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)
  model_push = artifact_utils.get_single_instance(
      output_dict[standard_component_specs.PUSHED_MODEL_KEY])
  if not self.CheckBlessing(input_dict):
    self._MarkNotPushed(model_push)
    return
  model_path = self.GetModelPath(input_dict)

  # Push model to the destination, which can be listened by a model server.
  #
  # If model is already successfully copied to outside before, stop copying.
  # This is because model validator might bless the same model twice (check
  # mv driver) with different blessing output; we still want Pusher to handle
  # the mv output again to keep metadata tracking, but no need to copy to
  # outside path again.
  # TODO(jyzhao): support rpc push and verification.
  push_destination = pusher_pb2.PushDestination()
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.PUSH_DESTINATION_KEY],
      push_destination)

  destination_kind = push_destination.WhichOneof('destination')
  if destination_kind == 'filesystem':
    fs_config = push_destination.filesystem
    if fs_config.versioning == _Versioning.AUTO:
      fs_config.versioning = _Versioning.UNIX_TIMESTAMP
    if fs_config.versioning == _Versioning.UNIX_TIMESTAMP:
      model_version = str(int(time.time()))
    else:
      raise NotImplementedError('Invalid Versioning {}'.format(
          fs_config.versioning))
    logging.info('Model version: %s', model_version)
    serving_path = os.path.join(fs_config.base_directory, model_version)

    if fileio.exists(serving_path):
      logging.info(
          'Destination directory %s already exists, skipping current push.',
          serving_path)
    else:
      # For TensorFlow SavedModel, saved_model.pb file should be the last
      # file to be copied as TF serving and other codes rely on that file as
      # an indication that the model is available.
      # https://github.com/tensorflow/tensorflow/blob/d5b3c79b4804134d0d17bfce9f312151f6337dba/tensorflow/python/saved_model/save.py#L1445
      io_utils.copy_dir(
          model_path, serving_path, deny_regex_patterns=[r'saved_model\.pb'])
      saved_model_path = os.path.join(model_path, 'saved_model.pb')
      if fileio.exists(saved_model_path):
        io_utils.copy_file(
            saved_model_path,
            os.path.join(serving_path, 'saved_model.pb'),
        )
      logging.info('Model written to serving path %s.', serving_path)
  else:
    raise NotImplementedError(
        'Invalid push destination {}'.format(destination_kind))

  # Copy the model to pushing uri for archiving.
  io_utils.copy_dir(model_path, model_push.uri)
  self._MarkPushed(
      model_push,
      pushed_destination=serving_path,
      pushed_version=model_version)
  logging.info('Model pushed to %s.', model_push.uri)
def setUp(self):
  super(AirflowEndToEndTest, self).setUp()

  # setup airflow_home in a temp directory, config and init db.
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  self._mysql_container_name = 'airflow_' + test_utils.generate_random_id()
  db_port = airflow_test_utils.create_mysql_container(
      self._mysql_container_name)
  self.addCleanup(airflow_test_utils.delete_mysql_container,
                  self._mysql_container_name)
  os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
      'mysql://[email protected]:%d/airflow' % db_port)

  # Set a couple of important environment variables. See
  # https://airflow.apache.org/howto/set-config.html for details.
  os.environ['AIRFLOW__CORE__DAGS_FOLDER'] = os.path.join(
      self._airflow_home, 'dags')
  os.environ['AIRFLOW__CORE__BASE_LOG_FOLDER'] = os.path.join(
      self._airflow_home, 'logs')
  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
  # Following environment variables make scheduler process dags faster.
  os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
  os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
  os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'

  # Following fields are specific to the chicago_taxi_simple example.
  self._dag_id = 'chicago_taxi_simple'
  self._run_id = 'manual_run_id_1'
  # This execution date must be after the start_date in chicago_taxi_simple
  # but before current execution date.
  self._execution_date = '2019-02-01T01:01:01'
  self._all_tasks = [
      'CsvExampleGen',
      'Evaluator',
      'ExampleValidator',
      'Pusher',
      'SchemaGen',
      'StatisticsGen',
      'Trainer',
      'Transform',
  ]

  # Copy dag file and data.
  chicago_taxi_pipeline_dir = os.path.dirname(__file__)
  simple_pipeline_file = os.path.join(chicago_taxi_pipeline_dir,
                                      'taxi_pipeline_simple.py')
  io_utils.copy_file(
      simple_pipeline_file,
      os.path.join(self._airflow_home, 'dags', 'taxi_pipeline_simple.py'))
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize database.
  subprocess.run(['airflow', 'initdb'], check=True)
  subprocess.run(['airflow', 'unpause', self._dag_id], check=True)
def start_cmle_training(input_dict, output_dict, exec_properties, training_inputs): """Start a trainer job on CMLE.""" training_inputs = training_inputs.copy() logger = logging_utils.get_logger(exec_properties['log_root'], 'exec') # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself exec_properties['custom_config'].pop('cmle_training_args') json_inputs = types.jsonify_tfx_type_dict(input_dict) logger.info('json_inputs=\'%s\'.', json_inputs) json_outputs = types.jsonify_tfx_type_dict(output_dict) logger.info('json_outputs=\'%s\'.', json_outputs) json_exec_properties = json.dumps(exec_properties) logger.info('json_exec_properties=\'%s\'.', json_exec_properties) # Configure CMLE job api_client = discovery.build('ml', 'v1') job_args = [ '--executor', 'Trainer', '--inputs', json_inputs, '--outputs', json_outputs, '--exec-properties', json_exec_properties ] training_inputs['args'] = job_args training_inputs['pythonModule'] = 'tfx.scripts.run_executor' # Pop project_id so CMLE doesn't complain about an unexpected parameter. # It's been a stowaway in cmle_args and has finally reached its destination. project = training_inputs.pop('project') project_id = 'projects/{}'.format(project) if 'packageUris' not in training_inputs: # Create TFX dist and add it to training_inputs local_package = io_utils.build_package() cloud_package = os.path.join(training_inputs['jobDir'], os.path.basename(local_package)) io_utils.copy_file(local_package, cloud_package, True) training_inputs['packageUris'] = [cloud_package] job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S') job_spec = {'jobId': job_name, 'trainingInput': training_inputs} # Submit job to CMLE logger.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format( job_name, project)) request = api_client.projects().jobs().create( body=job_spec, parent=project_id) request.execute() # Wait for CMLE job to finish job_id = '{}/jobs/{}'.format(project_id, job_name) request = api_client.projects().jobs().get(name=job_id) response = request.execute() while response['state'] not in ('SUCCEEDED', 'FAILED'): time.sleep(_POLLING_INTERVAL_IN_SECONDS) response = request.execute() if response['state'] == 'FAILED': err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format( job_name, response) logger.error(err_msg) raise RuntimeError(err_msg) # CMLE training complete logger.info('Job \'{}\' successful.'.format(job_name))
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # List of packages installed.
  self._pip_list = str(subprocess.check_output(['pip', 'freeze', '--local']))

  # Check if Apache Airflow is installed before running E2E tests.
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName, 'airflow')
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Set a couple of important environment variables. See
  # https://airflow.apache.org/howto/set-config.html for details.
  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
  # Following environment variables make scheduler process dags faster.
  os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
  os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '0'
  os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'
  os.environ['AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL'] = '0'
  # Using more than one thread results in a warning for sqlite backend.
  # See https://github.com/tensorflow/tfx/issues/141
  os.environ['AIRFLOW__SCHEDULER__MAX_THREADS'] = '1'

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline', '')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.io.gfile.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.io.gfile.isdir(target_data_dir)
  content = tf.io.gfile.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize database.
  _ = subprocess.check_output(['airflow', 'initdb'])

  # Start airflow scheduler.
  self._out = open(os.path.join(self._airflow_home, 'out.txt'), 'w+')
  self._err = open(os.path.join(self._airflow_home, 'err.txt'), 'w+')
  self._scheduler = subprocess.Popen(['airflow', 'scheduler'])

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def setUp(self):
  super(AirflowEndToEndTest, self).setUp()

  # setup airflow_home in a temp directory, config and init db.
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
      self._testMethodName)
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Set a couple of important environment variables. See
  # https://airflow.apache.org/howto/set-config.html for details.
  os.environ['AIRFLOW__CORE__AIRFLOW_HOME'] = self._airflow_home
  os.environ['AIRFLOW__CORE__DAGS_FOLDER'] = os.path.join(
      self._airflow_home, 'dags')
  os.environ['AIRFLOW__CORE__BASE_LOG_FOLDER'] = os.path.join(
      self._airflow_home, 'logs')
  os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
      'sqlite:///%s/airflow.db' % self._airflow_home)
  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'
  # Following environment variables make scheduler process dags faster.
  os.environ['AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC'] = '1'
  os.environ['AIRFLOW__SCHEDULER__RUN_DURATION'] = '-1'
  os.environ['AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL'] = '1'
  os.environ['AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL'] = '30'
  # Using more than one thread results in a warning for sqlite backend.
  # See https://github.com/tensorflow/tfx/issues/141
  os.environ['AIRFLOW__SCHEDULER__MAX_THREADS'] = '1'

  # Following fields are specific to the chicago_taxi_simple example.
  self._dag_id = 'chicago_taxi_simple'
  self._run_id = 'manual_run_id_1'
  # This execution date must be after the start_date in chicago_taxi_simple
  # but before current execution date.
  self._execution_date = '2019-02-01T01:01:01+01:01'
  self._all_tasks = [
      'CsvExampleGen',
      'Evaluator',
      'ExampleValidator',
      'ModelValidator',
      'Pusher',
      'SchemaGen',
      'StatisticsGen',
      'Trainer',
      'Transform',
  ]

  # Copy dag file and data.
  chicago_taxi_pipeline_dir = os.path.dirname(__file__)
  simple_pipeline_file = os.path.join(chicago_taxi_pipeline_dir,
                                      'taxi_pipeline_simple.py')
  io_utils.copy_file(
      simple_pipeline_file,
      os.path.join(self._airflow_home, 'dags', 'taxi_pipeline_simple.py'))
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = tf.io.gfile.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert tf.io.gfile.isdir(target_data_dir)
  content = tf.io.gfile.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  # Initialize database.
  _ = subprocess.check_output(['airflow', 'initdb'])
  _ = subprocess.check_output(['airflow', 'unpause', self._dag_id])
def start_cmle_training(input_dict: Dict[Text, List[types.TfxArtifact]],
                        output_dict: Dict[Text, List[types.TfxArtifact]],
                        exec_properties: Dict[Text, Any],
                        executor_class_path: Text,
                        training_inputs: Dict[Text, Any]):
  """Start a trainer job on CMLE.

  This is done by forwarding the inputs/outputs/exec_properties to the
  tfx.scripts.run_executor module on a CMLE training job interpreter.

  Args:
    input_dict: Passthrough input dict for tfx.components.Trainer.executor.
    output_dict: Passthrough input dict for tfx.components.Trainer.executor.
    exec_properties: Passthrough input dict for
      tfx.components.Trainer.executor.
    executor_class_path: class path for TFX core default trainer.
    training_inputs: Training input for CMLE training job. 'pythonModule',
      'pythonVersion' and 'runtimeVersion' will be inferred by the runner. For
      the full set of parameters supported, refer to
      https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

  Returns:
    None

  Raises:
    RuntimeError: if the Google Cloud AI Platform training job failed.
  """
  training_inputs = training_inputs.copy()
  # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself
  for gaip_training_key in ['cmle_training_args', 'gaip_training_args']:
    if gaip_training_key in exec_properties.get('custom_config'):
      exec_properties['custom_config'].pop(gaip_training_key)

  json_inputs = types.jsonify_tfx_type_dict(input_dict)
  tf.logging.info('json_inputs=\'%s\'.', json_inputs)
  json_outputs = types.jsonify_tfx_type_dict(output_dict)
  tf.logging.info('json_outputs=\'%s\'.', json_outputs)
  json_exec_properties = json.dumps(exec_properties)
  tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

  # Configure CMLE job
  api_client = discovery.build('ml', 'v1')
  job_args = [
      '--executor_class_path', executor_class_path, '--inputs', json_inputs,
      '--outputs', json_outputs, '--exec-properties', json_exec_properties
  ]
  training_inputs['args'] = job_args
  training_inputs['pythonModule'] = 'tfx.scripts.run_executor'
  training_inputs['pythonVersion'] = _get_caip_python_version()
  # runtimeVersion should be same as <major>.<minor> of currently
  # installed tensorflow version.
  training_inputs['runtimeVersion'] = _get_tf_runtime_version()

  # Pop project_id so CMLE doesn't complain about an unexpected parameter.
  # It's been a stowaway in cmle_args and has finally reached its destination.
  project = training_inputs.pop('project')
  project_id = 'projects/{}'.format(project)

  package_uris = training_inputs.get('packageUris', [])
  if package_uris:
    tf.logging.info('Following packageUris \'%s\' are provided by user.',
                    package_uris)
  else:
    local_package = deps_utils.build_ephemeral_package()
    # TODO(b/125451545): Use a safe temp dir instead of jobDir.
    cloud_package = os.path.join(training_inputs['jobDir'],
                                 os.path.basename(local_package))
    io_utils.copy_file(local_package, cloud_package, True)
    training_inputs['packageUris'] = [cloud_package]
    tf.logging.info('Package %s will be used',
                    training_inputs['packageUris'])

  job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
  job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

  # Submit job to CMLE
  tf.logging.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format(
      job_name, project))
  request = api_client.projects().jobs().create(
      body=job_spec, parent=project_id)
  request.execute()

  # Wait for CMLE job to finish
  job_id = '{}/jobs/{}'.format(project_id, job_name)
  request = api_client.projects().jobs().get(name=job_id)
  response = request.execute()
  while response['state'] not in ('SUCCEEDED', 'FAILED'):
    time.sleep(_POLLING_INTERVAL_IN_SECONDS)
    response = request.execute()
  if response['state'] == 'FAILED':
    err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format(
        job_name, response)
    tf.logging.error(err_msg)
    raise RuntimeError(err_msg)

  # CMLE training complete
  tf.logging.info('Job \'{}\' successful.'.format(job_name))
def start_cmle_training(input_dict, output_dict, exec_properties, training_inputs): """Start a trainer job on CMLE.""" training_inputs = training_inputs.copy() # TODO(khaas): This file goes away when cl/236428692 lands # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself exec_properties['custom_config'].pop('cmle_training_args') json_inputs = types.jsonify_tfx_type_dict(input_dict) tf.logging.info('json_inputs=\'%s\'.', json_inputs) json_outputs = types.jsonify_tfx_type_dict(output_dict) tf.logging.info('json_outputs=\'%s\'.', json_outputs) json_exec_properties = json.dumps(exec_properties) tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties) # Configure CMLE job api_client = discovery.build('ml', 'v1') job_args = [ '--executor', 'Trainer', '--inputs', json_inputs, '--outputs', json_outputs, '--exec-properties', json_exec_properties ] training_inputs['args'] = job_args training_inputs['pythonModule'] = 'tfx.scripts.run_executor' # Pop project_id so CMLE doesn't complain about an unexpected parameter. # It's been a stowaway in cmle_args and has finally reached its destination. project = training_inputs.pop('project') project_id = 'projects/{}'.format(project) if 'packageUris' not in training_inputs: # Create TFX dist and add it to training_inputs local_package = io_utils.build_package() cloud_package = os.path.join(training_inputs['jobDir'], os.path.basename(local_package)) io_utils.copy_file(local_package, cloud_package, True) training_inputs['packageUris'] = [cloud_package] job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S') job_spec = {'jobId': job_name, 'trainingInput': training_inputs} # Submit job to CMLE tf.logging.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format( job_name, project)) request = api_client.projects().jobs().create(body=job_spec, parent=project_id) request.execute() # Wait for CMLE job to finish job_id = '{}/jobs/{}'.format(project_id, job_name) request = api_client.projects().jobs().get(name=job_id) response = request.execute() while response['state'] not in ('SUCCEEDED', 'FAILED'): time.sleep(_POLLING_INTERVAL_IN_SECONDS) response = request.execute() if response['state'] == 'FAILED': err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format( job_name, response) tf.logging.error(err_msg) raise RuntimeError(err_msg) # CMLE training complete tf.logging.info('Job \'{}\' successful.'.format(job_name))
def setUp(self):
  super(CliAirflowEndToEndTest, self).setUp()

  # List of packages installed.
  self._pip_list = pip_utils.get_package_names()

  # Check if Apache Airflow is installed before running E2E tests.
  if labels.AIRFLOW_PACKAGE_NAME not in self._pip_list:
    sys.exit('Apache Airflow not installed.')

  # Change the encoding for Click since Python 3 is configured to use ASCII as
  # encoding for the environment.
  if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
    os.environ['LANG'] = 'en_US.utf-8'

  # Setup airflow_home in a temp directory
  self._airflow_home = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName, 'airflow')
  self._old_airflow_home = os.environ.get('AIRFLOW_HOME')
  os.environ['AIRFLOW_HOME'] = self._airflow_home
  self._old_home = os.environ.get('HOME')
  os.environ['HOME'] = self._airflow_home
  absl.logging.info('Using %s as AIRFLOW_HOME and HOME in this e2e test',
                    self._airflow_home)

  # Testdata path.
  self._testdata_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  self._pipeline_name = 'chicago_taxi_simple'
  self._pipeline_path = os.path.join(self._testdata_dir,
                                     'test_pipeline_airflow_1.py')

  # Copy data.
  chicago_taxi_pipeline_dir = os.path.join(
      os.path.dirname(
          os.path.dirname(
              os.path.dirname(os.path.dirname(os.path.abspath(__file__))))),
      'examples', 'chicago_taxi_pipeline')
  data_dir = os.path.join(chicago_taxi_pipeline_dir, 'data', 'simple')
  content = fileio.listdir(data_dir)
  assert content, 'content in {} is empty'.format(data_dir)
  target_data_dir = os.path.join(self._airflow_home, 'taxi', 'data', 'simple')
  io_utils.copy_dir(data_dir, target_data_dir)
  assert fileio.isdir(target_data_dir)
  content = fileio.listdir(target_data_dir)
  assert content, 'content in {} is {}'.format(target_data_dir, content)
  io_utils.copy_file(
      os.path.join(chicago_taxi_pipeline_dir, 'taxi_utils.py'),
      os.path.join(self._airflow_home, 'taxi', 'taxi_utils.py'))

  self._mysql_container_name = 'airflow_' + test_utils.generate_random_id()
  db_port = airflow_test_utils.create_mysql_container(
      self._mysql_container_name)
  self.addCleanup(self._cleanup_mysql_container)
  os.environ['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = (
      'mysql://[email protected]:%d/airflow' % db_port)

  # Do not load examples to make this a bit faster.
  os.environ['AIRFLOW__CORE__LOAD_EXAMPLES'] = 'False'

  self._airflow_initdb()

  # Initialize CLI runner.
  self.runner = click_testing.CliRunner()
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
  """Overrides the tfx_pusher_executor.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - model_export: exported model from trainer.
      - model_blessing: model blessing path from evaluator.
    output_dict: Output dict from key to a list of artifacts, including:
      - model_push: A list of 'ModelPushPath' artifact of size one. It will
        include the model in this push execution if the model was pushed.
    exec_properties: Mostly a passthrough input dict for
      tfx.components.Pusher.executor.custom_config

  Raises:
    ValueError: if custom config not present or not a dict.
    RuntimeError: if
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # check model blessing
  model_push = artifact_utils.get_single_instance(
      output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY])
  if not self.CheckBlessing(input_dict):
    self._MarkNotPushed(model_push)
    return

  model_export = artifact_utils.get_single_instance(
      input_dict[tfx_pusher_executor.MODEL_KEY])

  custom_config = json_utils.loads(
      exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
  if custom_config is not None and not isinstance(custom_config, Dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict.')

  cortex_serving_args = custom_config.get(SERVING_ARGS_KEY)
  if not cortex_serving_args:
    raise ValueError(
        '\'cortex_serving_args\' is missing in \'custom_config\'')

  # Deploy the model.
  io_utils.copy_dir(
      src=path_utils.serving_model_path(model_export.uri), dst=model_push.uri)
  model_path = model_push.uri

  # Cortex implementation starts here
  # pop the env and initialize client
  cx = cortex.client(cortex_serving_args.pop('env'))

  # load the predictor
  predictor_path = cortex_serving_args.pop('predictor_path')

  with tempfile.TemporaryDirectory() as tmp_dir_name:
    temp_project_dir = tmp_dir_name

    # predictor
    p_dump_path = os.path.join(temp_project_dir, 'predictor.py')
    io_utils.copy_file(predictor_path, p_dump_path)

    # requirements.txt
    reqs = cortex_serving_args.pop('requirements', [])
    if reqs:
      r_dump_path = os.path.join(temp_project_dir, 'requirements.txt')
      io_utils.write_string_file(r_dump_path, '\n'.join(reqs))

    # conda-packages.txt
    c_reqs = cortex_serving_args.pop('conda_packages', [])
    if c_reqs:
      r_dump_path = os.path.join(temp_project_dir, 'conda-packages.txt')
      io_utils.write_string_file(r_dump_path, '\n'.join(c_reqs))

    # edit the api_config
    api_config = cortex_serving_args.pop('api_config')
    if 'config' not in api_config['predictor']:
      api_config['predictor']['config'] = {}
    api_config['predictor']['config']['model_artifact'] = model_path

    # launch the api
    api_config['predictor']['path'] = 'predictor.py'

    # configure the model path
    if 'models' not in api_config['predictor']:
      api_config['predictor']['models'] = {}
    api_config['predictor']['models'].update({'path': model_path})

    cx.create_api(
        api_config, project_dir=temp_project_dir, **cortex_serving_args)

  self._MarkPushed(model_push, pushed_destination=model_path)
def _prepare_data(self):
  io_utils.copy_file(
      'data/data.csv',
      f'gs://{self._BUCKET_NAME}/{self._DATA_DIRECTORY_NAME}/' +
      f'{self._pipeline_name}/data.csv')