def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Encode the merged text into token chunks and save them as ``dataset.npz``.

    Args:
        input_dict: Input artifacts. The single URIs of ``merged_text_dir``
            and ``encoding_dir`` are read.
        output_dict: Output artifacts. The archive is written to
            ``dataset.npz`` under the single URI of ``dataset_dir``.
        exec_properties: Execution properties. ``encoding`` and
            ``end_token`` are read and forwarded to ``load_dataset``.

    Returns:
        None
    """
    encoding = exec_properties["encoding"]
    merged_text_dir = get_single_uri(input_dict["merged_text_dir"])
    encoding_dir = get_single_uri(input_dict["encoding_dir"])
    end_token = exec_properties["end_token"]

    logging.info("encoding as: {}".format(encoding))
    logging.info("merged text dir: {}".format(merged_text_dir))
    logging.info("encoding dir: {}".format(encoding_dir))
    logging.info("ending tokens: {}".format(end_token))

    logging.info('Reading files')
    enc = encoder.get_encoder(encoding_dir)
    chunks = load_dataset(enc, merged_text_dir,
                          encoding=encoding, end_token=end_token)
    logging.info("chunk size: {}".format(len(chunks)))
    logging.info("top 10 chunkds {}".format(chunks[:10]))

    dataset_path = os.path.join(
        get_single_uri(output_dict["dataset_dir"]), "dataset.npz")
    # The message needs a placeholder: passing the path as a bare extra
    # argument makes the record fail to format, so the path is never logged.
    logging.info('Writing %s', dataset_path)
    # Each chunk is stored as a separate positional array in the archive.
    np.savez_compressed(dataset_path, *chunks)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Log hyperparameters, metrics, artifacts and the model to MLflow.

    Args:
        input_dict: Input artifacts. The single URIs of ``model_dir``,
            ``artifact_dir``, ``hyperparameter_dir`` and ``metric_dir``
            are read.
        output_dict: Output artifacts (not used by this executor).
        exec_properties: Execution properties. ``model_name`` is used as the
            MLflow experiment name and ``mlflow_tracking_url`` as the
            tracking URI.

    Returns:
        None

    Raises:
        IndexError: if ``hyperparameter_dir`` or ``metric_dir`` contains no
            ``*.pickle`` file.
    """
    model_name = exec_properties["model_name"]
    mlflow_tracking_url = exec_properties["mlflow_tracking_url"]
    model_dir = get_single_uri(input_dict["model_dir"])
    artifact_dir = get_single_uri(input_dict["artifact_dir"])
    hyperparameter_dir = get_single_uri(input_dict["hyperparameter_dir"])
    metric_dir = get_single_uri(input_dict["metric_dir"])

    mlflow.set_tracking_uri(mlflow_tracking_url)
    mlflow.set_experiment(model_name)

    with mlflow.start_run():
        # NOTE(review): pickle.load can execute arbitrary code; these files
        # are presumably produced by an upstream pipeline component --
        # confirm they never come from an untrusted source.
        hyperparameter_files = glob.glob(
            os.path.join(hyperparameter_dir, "*.pickle"))
        with open(hyperparameter_files[0], 'rb') as fp:
            hyperparameter = pickle.load(fp)
        for k, v in hyperparameter.items():
            mlflow.log_param(k, v)

        metric_files = glob.glob(os.path.join(metric_dir, "*.pickle"))
        with open(metric_files[0], 'rb') as fp:
            metric = pickle.load(fp)
        for k, v in metric.items():
            mlflow.log_metric(k, v)

        for artifact in glob.glob(os.path.join(artifact_dir, "*")):
            mlflow.log_artifact(artifact)

        # The model is logged straight from its directory. The previous code
        # first opened the first "*.pickle" in model_dir and never read it,
        # which only made the run fail when no such file existed.
        mlflow.tensorflow.log_model(tf_saved_model_dir=model_dir,
                                    tf_meta_graph_tags=["serve"],
                                    tf_signature_def_key="predict",
                                    artifact_path="GPT2")
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Validate a schema against computed statistics and write the anomalies.

    Statistics are loaded for the first split name decoded from the
    statistics artifacts; the schema is read from the single schema artifact.

    Args:
        input_dict: Input artifacts, keyed by ``executor.STATISTICS_KEY`` and
            ``executor.SCHEMA_KEY``.
        output_dict: Output artifacts; the single URI under
            ``executor.ANOMALIES_KEY`` is passed on as the schema-diff path.
        exec_properties: Execution properties (only logged at startup).

    Returns:
        None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    logging.info('Validating schema against the computed statistics.')

    # Flatten the split names of every statistics artifact, in order.
    split_names: List[Text] = [
        split_name
        for stats_artifact in input_dict[executor.STATISTICS_KEY]
        for split_name in artifact_utils.decode_split_names(
            stats_artifact.split_names)
    ]

    # Only the first split is validated.
    first_split_uri = artifact_utils.get_split_uri(
        input_dict[executor.STATISTICS_KEY], split_names[0])
    statistics = tfdv.load_statistics(
        io_utils.get_only_uri_in_dir(first_split_uri))

    schema_reader = io_utils.SchemaReader()
    schema_dir = artifact_utils.get_single_uri(
        input_dict[executor.SCHEMA_KEY])
    schema = schema_reader.read(io_utils.get_only_uri_in_dir(schema_dir))

    label_inputs = {labels.STATS: statistics, labels.SCHEMA: schema}

    output_uri = artifact_utils.get_single_uri(
        output_dict[executor.ANOMALIES_KEY])
    label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}

    self._Validate(label_inputs, label_outputs)
    logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Run a KerasTuner search and write the best hyperparameters as JSON.

    Args:
        input_dict: Input artifacts; reads the ``train`` and ``eval`` splits
            of ``examples`` and the single ``schema`` artifact.
        output_dict: Output artifacts; the result file is written under the
            single URI of ``study_best_hparams_path``.
        exec_properties: Execution properties, used to resolve the tuner
            function.

    Returns:
        None
    """
    # KerasTuner generates tuning state (e.g., oracle, trials) to working dir.
    working_dir = self._get_tmp_dir()

    examples = input_dict['examples']
    train_path = artifact_utils.get_split_uri(examples, 'train')
    eval_path = artifact_utils.get_split_uri(examples, 'eval')

    schema_dir = artifact_utils.get_single_uri(input_dict['schema'])
    schema_file = io_utils.get_only_uri_in_dir(schema_dir)
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    tuner_fn = self._GetTunerFn(exec_properties)
    train_pattern = io_utils.all_files_pattern(train_path)
    eval_pattern = io_utils.all_files_pattern(eval_path)
    tuner_spec = tuner_fn(working_dir, train_pattern, eval_pattern, schema)
    tuner = tuner_spec.tuner

    tuner.search_space_summary()
    # TODO(jyzhao): assert v2 behavior as KerasTuner doesn't work in v1.
    # TODO(jyzhao): make epochs configurable.
    tuner.search(
        tuner_spec.train_dataset,
        epochs=5,
        validation_data=tuner_spec.eval_dataset)
    tuner.results_summary()

    # Take the hyperparameter config of the single best trial.
    best_trial = tuner.oracle.get_best_trials(1)[0]
    best_hparams = best_trial.hyperparameters.get_config()

    output_dir = artifact_utils.get_single_uri(
        output_dict['study_best_hparams_path'])
    best_hparams_path = os.path.join(output_dir, _DEFAULT_FILE_NAME)
    io_utils.write_string_file(best_hparams_path, json.dumps(best_hparams))
    absl.logging.info('Best HParams is written to %s.' % best_hparams_path)
def _GetFnArgs(self, input_dict: Dict[str, List[types.Artifact]],
               output_dict: Dict[str, List[types.Artifact]],
               exec_properties: Dict[str, Any]) -> fn_args_utils.FnArgs:
    """Build the ``FnArgs`` for training from artifacts and properties.

    Starts from ``fn_args_utils.get_common_fn_args`` and adds the output
    model directories, the optional hyperparameters and the legacy aliases
    ``transform_output`` and ``schema_file``.

    Args:
        input_dict: Input artifacts; the hyperparameters artifact is optional.
        output_dict: Output artifacts; reads the model and model-run URIs.
        exec_properties: Execution properties forwarded to
            ``get_common_fn_args``.

    Returns:
        The populated ``FnArgs``.

    Raises:
        ValueError: if the resolved ``custom_config`` is truthy but not a
            dict.
    """
    hyperparameters_config = None
    if input_dict.get(standard_component_specs.HYPERPARAMETERS_KEY):
        hyperparameters_dir = artifact_utils.get_single_uri(
            input_dict[standard_component_specs.HYPERPARAMETERS_KEY])
        hyperparameters_file = io_utils.get_only_uri_in_dir(
            hyperparameters_dir)
        hyperparameters_config = json.loads(
            file_io.read_file_to_string(hyperparameters_file))

    output_path = artifact_utils.get_single_uri(
        output_dict[standard_component_specs.MODEL_KEY])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    model_run_dir = artifact_utils.get_single_uri(
        output_dict[standard_component_specs.MODEL_RUN_KEY])

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
    config = fn_args.custom_config
    if config and not isinstance(config, dict):
        raise ValueError(
            'custom_config in execution properties needs to be a '
            'dict. Got %s instead.' % type(fn_args.custom_config))

    # Legacy attribute names mirror the common ones.
    fn_args.transform_output = fn_args.transform_graph_path
    fn_args.serving_model_dir = serving_model_dir
    fn_args.eval_model_dir = eval_model_dir
    fn_args.model_run_dir = model_run_dir
    fn_args.schema_file = fn_args.schema_path
    fn_args.hyperparameters = hyperparameters_config
    return fn_args
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
    """Partition statistics and write each partition's schema and statistics.

    For every group returned by ``group_stats_and_examples`` the statistics
    are partitioned with ``lists_to_partitions``. Each partition gets a
    ``schema.pbtxt`` and one ``stats_tfrecord`` file per dataset, stored
    under ``<partitions uri>/<partition name>/``.

    Args:
        input_dict: Input artifacts; the schema is read from ``SCHEMA_KEY``.
        output_dict: Output artifacts; results go under ``PARTITIONS_KEY``.
        exec_properties: Execution properties (only logged at startup).
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    schema_dir = artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(schema_dir))

    for examples, datasets_by_key in group_stats_and_examples(input_dict):
        statistics_list = DatasetFeatureStatisticsList(
            datasets=list(datasets_by_key.values()))
        assignment = partition_fn(statistics_list, schema, examples)
        partitions = lists_to_partitions(
            statistics_list, schema, examples, assignment)

        for partition in partitions:
            partitions_root = artifact_utils.get_single_uri(
                output_dict[PARTITIONS_KEY])
            output_uri = os.path.join(partitions_root, partition.name)
            io_utils.write_pbtxt_file(
                os.path.join(output_uri, 'schema.pbtxt'), partition.schema)

            # Dataset i is written under the split name of example_splits[i].
            for index, dataset in enumerate(partition.statistics.datasets):
                split_name = partition.example_splits[index].split
                stats_path = os.path.join(
                    output_uri, split_name, 'stats_tfrecord')
                io_utils.write_tfrecord_file(stats_path, dataset)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Evaluate the eval model on the 'eval' split with a Beam batch job.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples used to evaluate the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of
          evaluator_pb2.FeatureSlicingSpec instance, providing the way to
          slice the data.

    Returns:
      None

    Raises:
      ValueError: if a required key is absent from input_dict or
        output_dict.
    """
    # Fail fast on missing artifacts before doing any work.
    if 'model_exports' not in input_dict:
        raise ValueError('\'model_exports\' is missing in input dict.')
    if 'examples' not in input_dict:
        raise ValueError('\'examples\' is missing in input dict.')
    if 'output' not in output_dict:
        raise ValueError('\'output\' is missing in output dict.')

    self._log_startup(input_dict, output_dict, exec_properties)

    # Extract input artifacts
    model_exports_uri = artifact_utils.get_single_uri(
        input_dict['model_exports'])

    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)

    output_uri = artifact_utils.get_single_uri(output_dict['output'])

    eval_model_path = path_utils.eval_model_path(model_exports_uri)
    tf.logging.info('Using {} for model eval.'.format(eval_model_path))
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=eval_model_path)

    tf.logging.info('Evaluating model.')
    with self._make_beam_pipeline() as pipeline:
        eval_split_uri = artifact_utils.get_split_uri(
            input_dict['examples'], 'eval')
        read_eval_examples = beam.io.ReadFromTFRecord(
            file_pattern=io_utils.all_files_pattern(eval_split_uri))
        # pylint: disable=expression-not-assigned
        (pipeline
         | 'ReadData' >> read_eval_examples
         | 'ExtractEvaluateAndWriteResults' >>
         tfma.ExtractEvaluateAndWriteResults(
             eval_shared_model=eval_shared_model,
             slice_spec=slice_spec,
             output_path=output_uri))
    tf.logging.info(
        'Evaluation complete. Results written to {}.'.format(output_uri))
def get_common_fn_args(input_dict: Dict[Text, List[types.Artifact]],
                       exec_properties: Dict[Text, Any],
                       working_dir: Text = None) -> FnArgs:
    """Assemble the arguments shared by training and tuning.

    Args:
        input_dict: Input artifacts. The ``train`` and ``eval`` splits of the
            examples are required; the transform graph and schema are
            optional and resolve to ``None`` when absent.
        exec_properties: Execution properties holding the JSON-serialized
            train args and eval args, and optionally the custom config.
        working_dir: Working directory stored on the result as-is.

    Returns:
        A ``FnArgs`` holding file patterns, step counts, optional schema and
        transform graph paths, and the deserialized custom config.
    """
    examples = input_dict[constants.EXAMPLES_KEY]
    train_files = [
        io_utils.all_files_pattern(
            artifact_utils.get_split_uri(examples, 'train'))
    ]
    eval_files = [
        io_utils.all_files_pattern(
            artifact_utils.get_split_uri(examples, 'eval'))
    ]

    transform_graph_path = None
    if input_dict.get(constants.TRANSFORM_GRAPH_KEY):
        transform_graph_path = artifact_utils.get_single_uri(
            input_dict[constants.TRANSFORM_GRAPH_KEY])

    schema_path = None
    if input_dict.get(constants.SCHEMA_KEY):
        schema_dir = artifact_utils.get_single_uri(
            input_dict[constants.SCHEMA_KEY])
        schema_path = io_utils.get_only_uri_in_dir(schema_dir)

    train_args = trainer_pb2.TrainArgs()
    eval_args = trainer_pb2.EvalArgs()
    json_format.Parse(exec_properties[constants.TRAIN_ARGS_KEY], train_args)
    json_format.Parse(exec_properties[constants.EVAL_ARGS_KEY], eval_args)

    # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
    # num_steps=None. Conversion of the proto to python will set the default
    # value of an int as 0 so modify the value here. Tensorflow will raise an
    # error if num_steps <= 0.
    train_steps = train_args.num_steps or None
    eval_steps = eval_args.num_steps or None

    # TODO(b/156929910): Refactor Trainer to be consistent with empty or None
    # custom_config handling.
    serialized_custom_config = exec_properties.get(
        constants.CUSTOM_CONFIG_KEY, 'null')
    custom_config = json_utils.loads(serialized_custom_config)

    return FnArgs(
        working_dir=working_dir,
        train_files=train_files,
        eval_files=eval_files,
        train_steps=train_steps,
        eval_steps=eval_steps,
        schema_path=schema_path,
        transform_graph_path=transform_graph_path,
        custom_config=custom_config,
    )
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Train GPT-2 and pickle the resulting train config and metrics.

    Args:
        input_dict: Input artifacts; reads the single URIs of
            ``dataset_dir``, ``checkpoint_dir`` and ``encoding_dir``.
        output_dict: Output artifacts; reads the single URIs of
            ``trained_checkpoint_dir``, ``sample_dir``, ``tensorboard_dir``,
            ``hyperparameter_dir`` and ``metric_dir``.
        exec_properties: Execution properties; reads ``model_name``,
            ``encoding``, ``train_config`` and ``end_token``.

    Returns:
        None
    """
    model_name = exec_properties["model_name"]
    encoding = exec_properties["encoding"]
    train_config = exec_properties["train_config"]
    end_token = exec_properties["end_token"]

    dataset_dir = get_single_uri(input_dict["dataset_dir"])
    checkpoint_dir = get_single_uri(input_dict["checkpoint_dir"])
    encoding_dir = get_single_uri(input_dict["encoding_dir"])

    trained_checkpoint_dir = get_single_uri(
        output_dict["trained_checkpoint_dir"])
    sample_dir = get_single_uri(output_dict["sample_dir"])
    tensorboard_dir = get_single_uri(output_dict["tensorboard_dir"])
    hyperparameter_dir = get_single_uri(output_dict["hyperparameter_dir"])
    metric_dir = get_single_uri(output_dict["metric_dir"])

    # train_gpt2 returns the train config it ran with, plus the metrics.
    train_config, metrics = train_gpt2(
        dataset_dir=dataset_dir,
        checkpoint_dir=checkpoint_dir,
        encoding_dir=encoding_dir,
        model_name=model_name,
        train_config=train_config,
        encoding=encoding,
        trained_checkpoint_dir=trained_checkpoint_dir,
        sample_dir=sample_dir,
        tensorboard_dir=tensorboard_dir,
        end_token=end_token)

    hyperparameter_path = os.path.join(
        hyperparameter_dir, 'hyperparameter.pickle')
    with open(hyperparameter_path, 'wb') as hyperparameter_file:
        pickle.dump(train_config, hyperparameter_file,
                    protocol=pickle.HIGHEST_PROTOCOL)

    metric_path = os.path.join(metric_dir, 'metric.pickle')
    with open(metric_path, 'wb') as metric_file:
        pickle.dump(metrics, metric_file, protocol=pickle.HIGHEST_PROTOCOL)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Export a trained checkpoint for serving via ``export_for_serving``.

    Args:
        input_dict: Input artifacts; reads the single URIs of
            ``checkpoint_dir`` and ``model_path``.
        output_dict: Output artifacts; reads the single URI of
            ``export_dir``.
        exec_properties: Execution properties; reads ``train_config``.

    Returns:
        None
    """
    train_config = exec_properties["train_config"]
    checkpoint_dir = get_single_uri(input_dict["checkpoint_dir"])
    model_path = get_single_uri(input_dict["model_path"])
    export_dir = get_single_uri(output_dict["export_dir"])

    export_kwargs = dict(
        model_path=model_path,
        checkpoint_dir=checkpoint_dir,
        export_dir=export_dir,
        train_config=train_config,
    )
    export_for_serving(**export_kwargs)
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
    """Build the ``FnArgs`` for training from artifacts and properties.

    Args:
        input_dict: Input artifacts; the base model and hyperparameters
            artifacts are optional.
        output_dict: Output artifacts; reads the model and model-run URIs.
        exec_properties: Execution properties; the custom config is read and
            the whole dict is forwarded to ``get_common_fn_args``.

    Returns:
        The common ``FnArgs`` extended with model directories, base model,
        hyperparameters and custom config.

    Raises:
        ValueError: if the deserialized custom config is not a dict.
    """
    # Load and deserialize custom config from execution properties.
    # Note that in the component interface the default serialization of custom
    # config is 'null' instead of '{}'. Therefore we need to default the
    # json_utils.loads to 'null' then populate it with an empty dict when
    # needed.
    serialized_custom_config = exec_properties.get(
        constants.CUSTOM_CONFIG_KEY, 'null')
    custom_config = json_utils.loads(serialized_custom_config) or {}
    if not isinstance(custom_config, dict):
        raise ValueError('custom_config in execution properties needs to be a '
                         'dict. Got %s instead.' % type(custom_config))

    # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
    base_model = None
    if input_dict.get(constants.BASE_MODEL_KEY):
        base_model_uri = artifact_utils.get_single_uri(
            input_dict[constants.BASE_MODEL_KEY])
        base_model = path_utils.serving_model_path(base_model_uri)

    hyperparameters_config = None
    if input_dict.get(constants.HYPERPARAMETERS_KEY):
        hyperparameters_dir = artifact_utils.get_single_uri(
            input_dict[constants.HYPERPARAMETERS_KEY])
        hyperparameters_file = io_utils.get_only_uri_in_dir(
            hyperparameters_dir)
        hyperparameters_config = json.loads(
            file_io.read_file_to_string(hyperparameters_file))

    output_path = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_KEY])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    model_run_dir = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_RUN_KEY])

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
    # Legacy attribute names mirror the common ones.
    fn_args.transform_output = fn_args.transform_graph_path
    fn_args.serving_model_dir = serving_model_dir
    fn_args.eval_model_dir = eval_model_dir
    fn_args.model_run_dir = model_run_dir
    fn_args.schema_file = fn_args.schema_path
    fn_args.base_model = base_model
    fn_args.hyperparameters = hyperparameters_config
    fn_args.custom_config = custom_config
    return fn_args
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Encode the text at ``text_path`` and save it as ``dataset.npz``.

    Args:
        input_dict: Input artifacts; the single URI of ``model_path`` is
            passed to ``encoder.get_encoder``.
        output_dict: Output artifacts; the archive is written to
            ``dataset.npz`` under the single URI of ``dataset_path``.
        exec_properties: Execution properties; reads ``encoding``,
            ``combine`` and ``text_path``, all forwarded to ``load_dataset``.

    Returns:
        None
    """
    encoding = exec_properties["encoding"]
    combine = exec_properties["combine"]
    text_path = exec_properties["text_path"]
    model_path = get_single_uri(input_dict["model_path"])
    dataset_path = os.path.join(
        get_single_uri(output_dict["dataset_path"]), "dataset.npz")

    enc = encoder.get_encoder(model_path)
    logging.info('Reading files')
    chunks = load_dataset(enc, text_path, combine, encoding=encoding)

    # The message needs a placeholder: passing the path as a bare extra
    # argument makes the record fail to format, so the path is never logged.
    logging.info('Writing %s', dataset_path)
    # Each chunk is stored as a separate positional array in the archive.
    np.savez_compressed(dataset_path, *chunks)
def _JsonToExample(
        pipeline: beam.Pipeline,
        input_dict: Dict[Text, List[Artifact]],
        exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
        split_pattern: Text,
) -> beam.pvalue.PCollection:
    """Read JSON-lines files and convert each parsed line to a TF example.

    Column types are inferred globally over all parsed lines and supplied to
    the conversion step as a singleton side input.

    Args:
        pipeline: Beam pipeline.
        input_dict: Input artifacts; the base directory is the single URI
            under ``INPUT_KEY``.
        exec_properties: Execution properties (unused).
        split_pattern: Glob pattern relative to the input base directory.

    Returns:
        The PCollection produced by the ``ToTFExample`` step.

    Raises:
        RuntimeError: if the pattern matches no files.
    """
    input_base_uri = artifact_utils.get_single_uri(input_dict[INPUT_KEY])
    json_pattern = os.path.join(input_base_uri, split_pattern)
    logging.info(
        'Processing input json data {} to TFExample.'.format(json_pattern))

    # Fail early, before building the pipeline, when nothing matches.
    if not tf.io.gfile.glob(json_pattern):
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(json_pattern))

    raw_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=json_pattern))
    parsed_json_lines = (
        raw_lines | 'ParseJSONLine' >> beam.ParDo(ParseJsonLine()))

    inferred_types = (
        parsed_json_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(ValueTypeInferrer()))
    value_infos = beam.pvalue.AsSingleton(inferred_types)

    return (parsed_json_lines
            | 'ToTFExample' >> beam.ParDo(_ParsedJsonToTfExample(),
                                          value_infos))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
    """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor. custom_config.ai_platform_serving_args
        is consumed by this class. For the full set of parameters supported
        by Google Cloud AI Platform, refer to
        https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Returns:
      None

    Raises:
      ValueError: if ai_platform_serving_args is not in
        exec_properties.custom_config.
      RuntimeError: per the original documentation, if the Google Cloud AI
        Platform job failed. It is not raised directly in this method;
        presumably it comes from the runner -- TODO confirm.
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    if not self.CheckBlessing(input_dict, output_dict):
        return

    model_export = artifact_utils.get_single_instance(
        input_dict['model_export'])
    model_export_uri = model_export.uri
    model_blessing_uri = artifact_utils.get_single_uri(
        input_dict['model_blessing'])
    model_push = artifact_utils.get_single_instance(
        output_dict['model_push'])

    # TODO(jyzhao): should this be in driver or executor.
    # NOTE(review): this looks redundant with self.CheckBlessing above; it is
    # kept because CheckBlessing's implementation is not guaranteed here.
    if not tf.gfile.Exists(os.path.join(model_blessing_uri, 'BLESSED')):
        model_push.set_int_custom_property('pushed', 0)
        tf.logging.info('Model on %s was not blessed', model_blessing_uri)
        return

    exec_properties_copy = exec_properties.copy()
    # Treat an explicit None custom_config like a missing one so the
    # documented ValueError is raised instead of TypeError or KeyError.
    custom_config = exec_properties_copy.pop('custom_config', {}) or {}
    if 'ai_platform_serving_args' not in custom_config:
        raise ValueError(
            '\'ai_platform_serving_args\' is missing in \'custom_config\'')
    ai_platform_serving_args = custom_config['ai_platform_serving_args']

    # Deploy the model.
    model_path = path_utils.serving_model_path(model_export_uri)
    # Note: we do not have a logical model version right now. This
    # model_version is a timestamp mapped to trainer's exporter.
    model_version = os.path.basename(model_path)
    # An explicit None value skips deployment but still runs the base pusher.
    if ai_platform_serving_args is not None:
        runner.deploy_model_for_cmle_serving(model_path, model_version,
                                             ai_platform_serving_args)

    # Make sure artifacts are populated in a standard way by calling
    # tfx.pusher.executor.Executor.Do().
    exec_properties_copy['push_destination'] = exec_properties.get(
        'push_destination') or self._make_local_temp_destination()
    super(Executor, self).Do(input_dict, output_dict, exec_properties_copy)
def GetStatsOutputPathEntries(
        disable_statistics: bool,
        output_dict: Dict[str, List[types.Artifact]]) -> Dict[str, str]:
    """Returns output entries for stats output path.

    Args:
        disable_statistics: When true, no entries are produced.
        output_dict: Output artifacts, looked up by the pre- and
            post-transform stats, schema and anomalies component keys.

    Returns:
        A dict from output-path label to the single URI of the matching
        artifact. It is empty when statistics are disabled or none of the
        component keys is present.

    Raises:
        ValueError: if only some of the stats outputs are present.
    """
    label_component_key_pairs = (
        (labels.PRE_TRANSFORM_OUTPUT_STATS_PATH_LABEL,
         standard_component_specs.PRE_TRANSFORM_STATS_KEY),
        (labels.PRE_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL,
         standard_component_specs.PRE_TRANSFORM_SCHEMA_KEY),
        (labels.POST_TRANSFORM_OUTPUT_ANOMALIES_PATH_LABEL,
         standard_component_specs.POST_TRANSFORM_ANOMALIES_KEY),
        (labels.POST_TRANSFORM_OUTPUT_STATS_PATH_LABEL,
         standard_component_specs.POST_TRANSFORM_STATS_KEY),
        (labels.POST_TRANSFORM_OUTPUT_SCHEMA_PATH_LABEL,
         standard_component_specs.POST_TRANSFORM_SCHEMA_KEY),
    )

    if disable_statistics:
        return {}

    entries = {
        label: artifact_utils.get_single_uri(output_dict[component_key])
        for label, component_key in label_component_key_pairs
        if component_key in output_dict
    }

    # The stats outputs are all-or-nothing.
    if entries and len(entries) != len(label_component_key_pairs):
        raise ValueError(
            'Either all stats_output_paths should be specified or none.')
    return entries
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
    """ImportSchemaGen executor entrypoint.

    Copies the given schema file into the output Schema artifact after
    checking that it can be read as a schema.

    Args:
      input_dict: Should be empty.
      output_dict: Output dict from key to a list of artifacts, including:
        - schema: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - schema_file: Source schema file path.

    Returns:
      None

    Raises:
      ValueError: if the schema file path is missing or empty.
    """
    source_file_path = exec_properties.get(
        standard_component_specs.SCHEMA_FILE_KEY)
    if not source_file_path:
        raise ValueError('Schema file path is missing in exec_properties.')

    schema_artifact_uri = artifact_utils.get_single_uri(
        output_dict[standard_component_specs.SCHEMA_KEY])
    output_uri = os.path.join(schema_artifact_uri,
                              schema_gen_executor.DEFAULT_FILE_NAME)

    # Check whether the input file has a proper schema proto.
    schema_reader = io_utils.SchemaReader()
    schema_reader.read(source_file_path)

    io_utils.copy_file(source_file_path, output_uri)
    logging.info('Copied a schema file from %s to %s.', source_file_path,
                 output_uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Merge the ``text`` field of MongoDB documents into one text file.

    Every document of every listed collection contributes its ``text``
    followed by ``end_token``. The result is written to ``merged_text``
    under the output directory.

    Args:
        input_dict: Input artifacts (not used by this executor).
        output_dict: Output artifacts; reads the single URI of
            ``merged_text_dir``.
        exec_properties: Execution properties; reads ``ip``, ``port``,
            ``username``, ``password``, ``dbname``, ``colnames`` and
            ``end_token``.

    Returns:
        None

    Raises:
        KeyError: if a returned document has no ``text`` field.
    """
    client = MongoClient(host=exec_properties["ip"],
                         port=int(exec_properties["port"]),
                         username=exec_properties["username"],
                         password=exec_properties["password"])
    try:
        dbname = exec_properties["dbname"]
        db = client[dbname]
        colnames = exec_properties["colnames"]
        end_token = exec_properties["end_token"]
        merged_text_dir = get_single_uri(output_dict["merged_text_dir"])

        # Collect the pieces and join once; repeated "+=" on a growing
        # string is quadratic in the total text size.
        text_pieces = []
        for colname in colnames:
            logging.info("Get data from {}/{}".format(dbname, colname))
            # Project only the "text" field to keep the transfer small.
            documents = db[colname].find({}, {"text": 1, "_id": 0})
            text_pieces.extend(
                document["text"] + end_token for document in documents)
    finally:
        # Release the connection even when a query fails.
        client.close()

    raw_text = "".join(text_pieces)

    # store raw text for encoding
    merged_text_path = os.path.join(merged_text_dir, "merged_text")
    with open(merged_text_path, "w") as text_file:
        text_file.write(raw_text)
    logging.info("Saving merged text to {}".format(merged_text_dir))
def _ImportExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        input_dict: Dict[Text, List[types.Artifact]],
        exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read TFRecord files to PCollection of TF examples.

    Note that each input split will be transformed by this function
    separately.

    Args:
      pipeline: beam pipeline.
      input_dict: Input dict from input key to a list of Artifacts.
        - input_base: input dir that contains tf example data.
      exec_properties: A dict of execution properties (unused).
      split_pattern: Split.pattern in Input config, glob relative file
        pattern that maps to input files with root directory given by
        input_base.

    Returns:
      PCollection of TF examples.
    """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    input_split_pattern = os.path.join(input_base_uri, split_pattern)
    absl.logging.info(
        'Reading input TFExample data {}.'.format(input_split_pattern))

    # TODO(jyzhao): profile input examples.
    # TODO(jyzhao): support multiple input format.
    serialized_records = (
        pipeline
        | 'ReadFromTFRecord' >>
        beam.io.ReadFromTFRecord(file_pattern=input_split_pattern))
    # TODO(jyzhao): consider move serialization out of base example gen.
    return (serialized_records
            | 'ToTFExample' >> beam.Map(tf.train.Example.FromString))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Run the user-provided tuner and write the best hyperparameters.

    Args:
        input_dict: Input artifacts, forwarded to ``get_common_fn_args``.
        output_dict: Output artifacts; the best hyperparameters are written
            as JSON under the single URI of ``_BEST_HYPERPARAMETERS_KEY``.
        exec_properties: Execution properties; used to resolve ``tuner_fn``.
            A truthy ``_TUNE_ARGS_KEY`` entry is rejected.

    Returns:
        None

    Raises:
        ValueError: if tune args are supplied.
    """
    if exec_properties.get(_TUNE_ARGS_KEY):
        raise ValueError(
            "TuneArgs is not supported for default Tuner's Executor.")

    tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
    working_dir = self._get_tmp_dir()
    fn_args = fn_args_utils.get_common_fn_args(
        input_dict, exec_properties, working_dir)

    tuner_fn_result = tuner_fn(fn_args)
    tuner = tuner_fn_result.tuner
    fit_kwargs = tuner_fn_result.fit_kwargs

    # TODO(b/156966497): set logger for printing.
    tuner.search_space_summary()
    absl.logging.info('Start tuning...')
    tuner.search(**fit_kwargs)
    tuner.results_summary()

    # The first entry returned by the tuner is taken as the best one.
    best_hyperparameters = tuner.get_best_hyperparameters()[0]
    best_hparams_config = best_hyperparameters.get_config()
    absl.logging.info('Best hyperParameters: %s' % best_hparams_config)

    output_dir = artifact_utils.get_single_uri(
        output_dict[_BEST_HYPERPARAMETERS_KEY])
    best_hparams_path = os.path.join(output_dir, _DEFAULT_FILE_NAME)
    io_utils.write_string_file(best_hparams_path,
                               json.dumps(best_hparams_config))
    absl.logging.info('Best Hyperparameters are written to %s.' %
                      best_hparams_path)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Download the pretrained GPT-2 model files into the output directory.

    Args:
        input_dict: Input artifacts (not used by this executor).
        output_dict: Output artifacts; files are stored under the single URI
            of ``model_dir``.
        exec_properties: Execution properties; ``model_name`` selects the
            ``models/<model_name>`` subdirectory on the storage server.

    Returns:
        None

    Raises:
        requests.HTTPError: if the server answers a download request with an
            error status.
    """
    model_dir = get_single_uri(output_dict["model_dir"])
    model_name = exec_properties["model_name"]

    logging.info("Downloading pretrained model of {}".format(model_name))
    logging.info("Storing pretrained mdoel to {}".format(model_dir))

    subdir = os.path.join('models', model_name)
    subdir = subdir.replace('\\', '/')  # needed for Windows

    # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
    chunk_size = 1000

    for filename in [
            'checkpoint', 'encoder.json', 'hparams.json',
            'model.ckpt.data-00000-of-00001', 'model.ckpt.index',
            'model.ckpt.meta', 'vocab.bpe'
    ]:
        logging.info("Getting {}".format(filename))
        url = ("https://storage.googleapis.com/gpt-2/" + subdir + "/" +
               filename)

        # get file from storage server; the context manager releases the
        # streamed connection once the file has been written.
        with requests.get(url, stream=True) as r:
            # Do not save an HTTP error page as if it were a model file.
            r.raise_for_status()

            # The header may be absent (e.g. chunked transfer); the progress
            # bar then runs without a known total.
            content_length = r.headers.get("content-length")
            file_size = (int(content_length)
                         if content_length is not None else None)

            # save to output path
            with open(os.path.join(model_dir, filename), 'wb') as f, \
                    tqdm(ncols=100, desc="Fetching " + filename,
                         total=file_size, unit_scale=True) as pbar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    # Advance by the bytes actually received; the last chunk
                    # is usually shorter than chunk_size.
                    pbar.update(len(chunk))
def _AvroToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        input_dict: Dict[Text, List[types.Artifact]],
        exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read Avro files and transform to TF examples.

    Note that each input split will be transformed by this function
    separately.

    Args:
      pipeline: beam pipeline.
      input_dict: Input dict from input key to a list of Artifacts.
        - input_base: input dir that contains Avro data.
      exec_properties: A dict of execution properties (unused).
      split_pattern: Split.pattern in Input config, glob relative file
        pattern that maps to input files with root directory given by
        input_base.

    Returns:
      PCollection of TF examples.
    """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    avro_pattern = os.path.join(input_base_uri, split_pattern)
    tf.logging.info(
        'Processing input avro data {} to TFExample.'.format(avro_pattern))

    avro_records = (
        pipeline | 'ReadFromAvro' >> beam.io.ReadFromAvro(avro_pattern))
    return avro_records | 'ToTFExample' >> beam.Map(dict_to_example)
def CheckBlessing(self, input_dict: Dict[Text, List[types.Artifact]],
                  output_dict: Dict[Text, List[types.Artifact]]) -> bool:
    """Check that model is blessed by upstream ModelValidator, or update output.

    When the marker file is missing, the ``pushed`` custom property of the
    output artifact is set to 0.

    Args:
      input_dict: Input dict from input key to a list of artifacts:
        - model_blessing: model blessing path from model_validator. Pusher
          looks for a file named 'BLESSED' to consider the model blessed and
          safe to push.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one.

    Returns:
      True if the model is blessed by validator.
    """
    model_blessing_uri = artifact_utils.get_single_uri(
        input_dict['model_blessing'])
    model_push = artifact_utils.get_single_instance(
        output_dict['model_push'])

    # TODO(jyzhao): should this be in driver or executor.
    blessed_marker = os.path.join(model_blessing_uri, 'BLESSED')
    if tf.io.gfile.exists(blessed_marker):
        return True

    model_push.set_int_custom_property('pushed', 0)
    absl.logging.info('Model on %s was not blessed', model_blessing_uri)
    return False
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Store `custom_config` as an `artifacts.PipelineConfiguration` artifact.

    The value is written verbatim to ``custom_config.json`` inside the
    artifact's URI; ``"{}"`` is written when the property is absent.

    Args:
      input_dict: Empty
      output_dict: Output dict from key to a list of artifacts, including:
        - pipeline_configuration: A list of type
          `artifacts.PipelineConfiguration`
      exec_properties: A dict of execution properties, including:
        - custom_config: the configuration to save.

    Returns:
      None

    Raises:
      OSError and its subclasses
      ValueError
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    configuration_artifact = artifact_utils.get_single_instance(
        output_dict[PIPELINE_CONFIGURATION_KEY])
    serialized_config = exec_properties.get(CUSTOM_CONFIG_KEY, "{}")

    artifact_dir = artifact_utils.get_single_uri([configuration_artifact])
    io_utils.write_string_file(
        os.path.join(artifact_dir, 'custom_config.json'), serialized_config)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    Writes the schema returned by ``self._provide_schema`` to the output
    artifact, so the schema is materialized the same way whether it was
    inferred or supplied.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats'.
        - 'schema': A singleton list of 'Schema' type. If provided, pass it
          through as the output as fixed schema. If not provided, infer
          schema from stats.
        If both or neither 'stats/statistics' nor 'schema' is provided, an
        error is raised.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the
          feature.

    Returns:
      None
    """
    output_dir = artifact_utils.get_single_uri(output_dict['output'])
    output_uri = os.path.join(output_dir, _DEFAULT_FILE_NAME)

    # Materializing schema as an output artifact from SchemaGen, in order to
    # log metadata of it in the same way regardless of inferred or fixed.
    schema = self._provide_schema(input_dict, exec_properties)
    io_utils.write_pbtxt_file(output_uri, schema)
    absl.logging.info('Schema written to {}.'.format(output_uri))
def _provide_schema(self, input_dict, exec_properties) -> schema_pb2.Schema:
    """Generates schema from either schema or statistics.

    Args:
      input_dict: Input dict; exactly one of 'schema' and 'stats' (or its
        synonym 'statistics') must be non-empty.
      exec_properties: Execution properties; 'infer_feature_shape' is read
        only when the schema is inferred from statistics.

    Returns:
      The schema read from the provided schema artifact, or the schema
      inferred from the statistics of the 'train' split.

    Raises:
      ValueError: if both or neither of schema and stats are provided, or if
        the provided schema file is not found.
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    stats = input_dict.get('stats') or input_dict.get('statistics')
    schema = input_dict.get('schema')

    has_stats = bool(stats)
    has_schema = bool(schema)
    if has_stats == has_schema:
        raise ValueError(
            'Exactly only one of schema or stats must be provided')

    if not schema:
        # No fixed schema: infer one from the 'train' split statistics.
        train_stats_uri = io_utils.get_only_uri_in_dir(
            artifact_utils.get_split_uri(stats, 'train'))
        infer_feature_shape = exec_properties['infer_feature_shape']
        train_statistics = tfdv.load_statistics(train_stats_uri)
        return tfdv.infer_schema(train_statistics, infer_feature_shape)

    schema_uri = artifact_utils.get_single_uri(schema)
    absl.logging.info('Schema is provided. Reading from %s.' % schema_uri)
    schema_reader = io_utils.SchemaReader()
    schema_path = os.path.join(schema_uri, _DEFAULT_FILE_NAME)
    try:
        return schema_reader.read(schema_path)
    except tf.errors.NotFoundError:
        raise ValueError(
            'Schema is provided, but failed to read from %s.' % schema_uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    Infers the schema with tensorflow_data_validation from the precomputed
    statistics of the 'train' split and writes it to the output artifact.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - 'stats': A list of 'ExampleStatistics' type which must contain
          split 'train'. Stats on other splits are ignored.
        - 'statistics': Synonym for 'stats' according to the original
          documentation; this implementation reads only 'stats'.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'Schema' artifact of size one.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the
          feature.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    train_split_uri = artifact_utils.get_split_uri(
        input_dict['stats'], 'train')
    train_stats_uri = io_utils.get_only_uri_in_dir(train_split_uri)

    output_dir = artifact_utils.get_single_uri(output_dict['output'])
    output_uri = os.path.join(output_dir, _DEFAULT_FILE_NAME)

    infer_feature_shape = exec_properties['infer_feature_shape']
    absl.logging.info('Infering schema from statistics.')
    train_statistics = tfdv.load_statistics(train_stats_uri)
    schema = tfdv.infer_schema(train_statistics, infer_feature_shape)

    io_utils.write_pbtxt_file(output_uri, schema)
    absl.logging.info('Schema written to %s.' % output_uri)
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
    """Runs an optional warmup search, then the final tuning search.

    When the 'metalearning_algorithm' execution property is set, a warmup
    search runs first and the number of warmup trials is subtracted from the
    final tuner's `max_trials` (never going below 1). The trial progress of
    both stages is merged, the best hyperparameters are written through
    `tfx_tuner.write_best_hyperparameters`, and the trial progress is dumped
    as JSON to 'tuner_plot_data.txt' under the 'trial_summary_plot' output.

    Args:
      input_dict: Input dict from input key to a list of artifacts.
      output_dict: Output dict from key to a list of artifacts; the
        'trial_summary_plot' entry is read here.
      exec_properties: A dict of execution properties; 'metalearning_algorithm'
        is optional, and the tuner function is resolved via 'tuner_fn'.

    Raises:
      ValueError: if tune args are present in exec_properties.
    """
    if tfx_tuner.get_tune_args(exec_properties):
        raise ValueError(
            "TuneArgs is not supported by this Tuner's Executor.")

    algorithm = exec_properties.get('metalearning_algorithm')

    warmup_trials = 0
    warmup_trial_data = None
    if not algorithm:
        logging.info('MetaLearning Algorithm not provided.')
    else:
        warmup_tuner, warmup_trials = self.warmup(
            input_dict, exec_properties, algorithm)
        warmup_trial_data = extract_tuner_trial_progress(warmup_tuner)

    # Create new fn_args for final tuning stage.
    fn_args = fn_args_utils.get_common_fn_args(
        input_dict, exec_properties, working_dir=self._get_tmp_dir())
    tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
    tuner_fn_result = tuner_fn(fn_args)

    # Trials already spent on warmup count against the budget, but at least
    # one final trial is always kept.
    oracle = tuner_fn_result.tuner.oracle
    oracle.max_trials = max(oracle.max_trials - warmup_trials, 1)

    tuner = self.search(tuner_fn_result)
    tuner_trial_data = extract_tuner_trial_progress(tuner)

    best_tuner = tuner
    if warmup_trial_data:
        merged_data, best_tuner_ix = merge_trial_data(
            warmup_trial_data, tuner_trial_data)
        merged_data['warmup_trial_data'] = (
            warmup_trial_data[BEST_CUMULATIVE_SCORE])
        merged_data['tuner_trial_data'] = (
            tuner_trial_data[BEST_CUMULATIVE_SCORE])

        objective = tuner.oracle.objective
        if isinstance(objective, kerastuner.Objective):
            merged_data['objective'] = objective.name
        else:
            merged_data['objective'] = 'objective not understood'

        tuner_trial_data = merged_data
        # Index 0 selects the warmup tuner; anything else the final tuner.
        if best_tuner_ix == 0:
            best_tuner = warmup_tuner

    tfx_tuner.write_best_hyperparameters(best_tuner, output_dict)

    plot_dir = artifact_utils.get_single_uri(
        output_dict['trial_summary_plot'])
    tuner_plot_path = os.path.join(plot_dir, 'tuner_plot_data.txt')
    io_utils.write_string_file(tuner_plot_path, json.dumps(tuner_trial_data))
    logging.info('Tuner plot data written at: %s', tuner_plot_path)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Fetches article information for a URL and writes it as 'feed.csv'.

    Args:
      input_dict: Input dict from input key to a list of artifacts (unused).
      output_dict: Output dict from key to a list of artifacts; 'feed.csv' is
        written under the single uri of the 'rss_feed' entry.
      exec_properties: A dict of execution properties; 'url' is passed to the
        Crawler.
    """
    articles = Crawler(
        exec_properties["url"]).get_article_information_as_dataframe()

    feed_dir = get_single_uri(output_dict["rss_feed"])
    articles.to_csv(os.path.join(feed_dir, "feed.csv"), index=False)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """TensorFlow SchemaGen executor entrypoint.

    Infers a single schema with tensorflow_data_validation from the
    precomputed statistics of every split that is not excluded: the first
    processed split seeds the schema and each later split updates it.

    Args:
      input_dict: Input dict from input key to a list of artifacts; the
        statistics artifact is read from `STATISTICS_KEY`.
      output_dict: Output dict from key to a list of artifacts; the schema is
        written under the single uri of the `SCHEMA_KEY` entry.
      exec_properties: A dict of execution properties, includes:
        - infer_feature_shape: Whether or not to infer the shape of the
          feature.
        - exclude_splits: Names of splits that will not be taken into
          consideration when auto-generating a schema.

    Returns:
      None

    Raises:
      ValueError: if the deserialized exclude_splits is not a list.
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

    # exclude_splits arrives serialized; a missing or null value means that
    # no split is excluded.
    serialized_excludes = exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')
    exclude_splits = json_utils.loads(serialized_excludes) or []
    if not isinstance(exclude_splits, list):
        raise ValueError('exclude_splits in execution properties needs to be a '
                         'list. Got %s instead.' % type(exclude_splits))

    stats_artifact = artifact_utils.get_single_instance(
        input_dict[STATISTICS_KEY])
    split_names = artifact_utils.decode_split_names(
        stats_artifact.split_names)

    # Only one schema is generated for all splits.
    schema = None
    for split in split_names:
        if split not in exclude_splits:
            logging.info(
                'Processing schema from statistics for split %s.', split)
            stats_uri = io_utils.get_only_uri_in_dir(
                os.path.join(stats_artifact.uri, split))
            split_stats = tfdv.load_statistics(stats_uri)
            if schema:
                schema = tfdv.update_schema(
                    schema, split_stats, infer_feature_shape)
            else:
                schema = tfdv.infer_schema(split_stats, infer_feature_shape)

    schema_dir = artifact_utils.get_single_uri(output_dict[SCHEMA_KEY])
    output_uri = os.path.join(schema_dir, _DEFAULT_FILE_NAME)
    io_utils.write_pbtxt_file(output_uri, schema)
    logging.info('Schema written to %s.', output_uri)
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

    Note that each input split will be transformed by this function
    separately.

    Args:
      pipeline: beam pipeline.
      input_dict: Input dict from input key to a list of Artifacts.
        - input_base: input dir that contains csv data. csv files must have
          header line.
      exec_properties: A dict of execution properties.
      split_pattern: Split.pattern in Input config, glob relative file
        pattern that maps to input files with root directory given by
        input_base.

    Returns:
      PCollection of TF examples.

    Raises:
      RuntimeError: if split is empty or csv headers are not equal.
    """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    absl.logging.info(
        'Processing input csv data %s to TFExample.', csv_pattern)

    csv_files = tf.io.gfile.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    # Every file in the split must share the header of the first file. The
    # loop variable has its own name so the `csv_files` list is not clobbered.
    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(
            file_pattern=csv_pattern, skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(
            csv_decoder.ParseCSVLine(delimiter=',')))

    # Column types are inferred over all parsed lines and handed to the
    # conversion step as a singleton side input.
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names, skip_blank_lines=True)))

    return (parsed_csv_lines
            | 'ToTFExample' >> beam.ParDo(
                _ParsedCsvToTfExample(), column_infos))