def run(argv=None): """Runs the sparse measurements prediction pipeline. Args: argv: Pipeline options as a list of arguments. """ pipeline_options = PipelineOptions(flags=argv) predict_options = pipeline_options.view_as(PredictOptions) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as( WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED' with beam.Pipeline(options=pipeline_options) as p: examples = (p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord( file_pattern=predict_options.input, compression_type=CompressionTypes.GZIP)) predictions = examples | 'Predict' >> beam.ParDo( PredictDoFn(model_export_dir=predict_options.model)) _ = predictions | 'WriteTableRows' >> beam.io.Write( beam.io.BigQuerySink( predict_options.output, schema=get_table_schema(), create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
def run(argv=None): """Runs the revise preprocessed data pipeline. Args: argv: Pipeline options as a list of arguments. """ pipeline_options = PipelineOptions(flags=argv) revise_options = pipeline_options.view_as(ReviseOptions) cloud_options = pipeline_options.view_as(GoogleCloudOptions) output_dir = os.path.join( revise_options.output, datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as( WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED' cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging') cloud_options.temp_location = os.path.join(output_dir, 'tmp') cloud_options.job_name = 'relabel-examples-%s' % ( datetime.datetime.now().strftime('%y%m%d-%H%M%S')) metadata_query = str( Template(open(revise_options.metadata, 'r').read()).render(METADATA_QUERY_REPLACEMENTS)) logging.info('metadata query : %s', metadata_query) with beam.Pipeline(options=pipeline_options) as p: # Gather our sample metadata into a python dictionary. samples_metadata = ( p | 'ReadSampleMetadata' >> beam.io.Read( beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True)) | 'TableToDictionary' >> beam.CombineGlobally( util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN))) # Read the tf.Example protos into a PCollection. examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord( file_pattern=revise_options.input, compression_type=CompressionTypes.GZIP) # Filter the TensorFlow Example Protocol Buffers. filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap( lambda example, samples_metadata: filter_and_revise_example( example, samples_metadata), beam.pvalue.AsSingleton(samples_metadata))) # Write the subset of tf.Example protos to Cloud Storage. _ = (filtered_examples | 'SerializeExamples' >> beam.Map(lambda example: example.SerializeToString()) | 'WriteExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(output_dir, 'examples'), compression_type=CompressionTypes.GZIP, file_name_suffix='.tfrecord.gz'))
def run_pipeline(beam_options):
    """Reads TFRecords of tf.Example protos and rewrites them as Parquet."""
    # Imported inside the function so the module is available when the lambdas
    # below are unpickled on the Beam workers.
    import tensorflow as tf

    with beam.Pipeline(options=beam_options) as p:
        (
            p
            | "Read files in" >> tfrecordio.ReadFromTFRecord(beam_options.input)
            | "Parse TF Examples from file" >> beam.Map(
                lambda row: tf.io.parse_example(row, feature_description))
            | "Replace TF tensors with native types" >> beam.Map(reformat_row)
            | "Write to Parquet" >> parquetio.WriteToParquet(
                beam_options.output,
                pa.schema(parquet_schema),
                num_shards=128)
        )
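# The pipeline above assumes three module-level names: feature_description,
# reformat_row, and parquet_schema. The definitions below are a hypothetical,
# minimal version for a single float feature and an integer label; the real
# feature names and types depend on how the TFRecords were written.
import pyarrow as pa
import tensorflow as tf

feature_description = {
    "feature": tf.io.FixedLenFeature([], tf.float32),
    "label": tf.io.FixedLenFeature([], tf.int64),
}

parquet_schema = [
    ("feature", pa.float32()),
    ("label", pa.int64()),
]


def reformat_row(parsed):
    # Convert the parsed scalar tensors to plain Python values so the Parquet
    # writer receives native types rather than TF tensors.
    return {name: tensor.numpy().item() for name, tensor in parsed.items()}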
def transform_data(working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed
        data and metadata to.
  """
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)
      train_data = (
          pipeline
          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'))
          | 'DecodeTrain' >> beam.Map(coder.decode))

      test_data = (
          pipeline
          | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
              os.path.join(working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'))
          | 'DecodeTest' >> beam.Map(coder.decode))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, RAW_DATA_METADATA)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      transformed_test_data, _ = (
          ((test_data, RAW_DATA_METADATA), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (
          transformed_train_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >>
          transform_fn_io.WriteTransformFn(working_dir))
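# Note: tft.string_to_int and tf.string_split above come from older
# tensorflow / tensorflow_transform releases. A rough sketch of the same
# preprocessing_fn written against the renamed APIs
# (tft.compute_and_apply_vocabulary and tf.compat.v1.string_split, which still
# yields the SparseTensor that tft.tfidf expects) might look as follows; treat
# it as an untested illustration, not a drop-in replacement.
def preprocessing_fn_newer_api(inputs):
  """Same transform as above, using the renamed tf.transform vocabulary API."""
  review = inputs[REVIEW_KEY]
  review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
  review_indices = tft.compute_and_apply_vocabulary(
      review_tokens, top_k=VOCAB_SIZE)
  # Add one for the oov bucket created by compute_and_apply_vocabulary.
  review_bow_indices, review_weight = tft.tfidf(review_indices, VOCAB_SIZE + 1)
  return {
      REVIEW_KEY: review_bow_indices,
      REVIEW_WEIGHT_KEY: review_weight,
      LABEL_KEY: inputs[LABEL_KEY]
  }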
def tftransform(
        pipeline_args,          # type: List[str]
        temp_location,          # type: str
        schema_file,            # type: str
        output_dir,             # type: str
        preprocessing_fn,       # type: Any
        training_data=None,     # type: Union[None, str]
        evaluation_data=None,   # type: Union[None, str]
        transform_fn_dir=None,  # type: Union[None, str]
        compression_type=None   # type: str
):
    # type: (...) -> PipelineState
    """
    Generic tf.transform pipeline that takes tf.{example, record} training and
    evaluation datasets and outputs transformed data together with the
    transform function as a Saved Model.

    :param pipeline_args: un-parsed Dataflow arguments
    :param temp_location: temporary location for the Dataflow job working dir
    :param schema_file: path to the raw feature schema text file
    :param output_dir: output dir for transformed data and function
    :param preprocessing_fn: tf.transform preprocessing function
    :param training_data: path to the training data
    :param evaluation_data: path to the evaluation data
    :param transform_fn_dir: dir of a previously saved transform function to
        apply
    :param compression_type: compression type for writing of tf.records
    :return: final state of the Beam pipeline
    """
    assert_not_empty_string(temp_location)
    assert_not_empty_string(schema_file)
    assert_not_empty_string(output_dir)
    assert_not_none(preprocessing_fn)

    if compression_type is None:
        compression_type = CompressionTypes.AUTO

    raw_feature_spec = schema_txt_file_to_feature_spec(schema_file)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)
    raw_data_coder = ExampleProtoCoder(raw_data_metadata.schema)

    transformed_train_output_dir = os.path.join(output_dir, "training")
    transformed_eval_output_dir = os.path.join(output_dir, "evaluation")

    if not any(i.startswith("--job_name") for i in pipeline_args):
        pipeline_args.append("--job_name=tf-transform-{}-{}".format(
            getpass.getuser(), int(time.time())))

    pipeline = beam.Pipeline(argv=pipeline_args)
    with beam_impl.Context(temp_dir=temp_location):
        if training_data is not None:
            # if training data is provided, transform_fn_dir will be ignored
            if transform_fn_dir is not None:
                warnings.warn(
                    "transform_fn_dir is ignored because training_data is "
                    "provided")

            transform_fn_output = os.path.join(output_dir, "transform_fn",
                                               "saved_model.pb")
            if FileSystems.exists(transform_fn_output):
                raise ValueError("Transform fn already exists at %s!"
                                 % transform_fn_output)

            # compute the transform_fn and apply it to the training data
            raw_train_data = (
                pipeline
                | "ReadTrainData" >> tfrecordio.ReadFromTFRecord(
                    training_data, coder=raw_data_coder))

            ((transformed_train_data, transformed_train_metadata),
             transform_fn) = (
                 (raw_train_data, raw_data_metadata)
                 | "AnalyzeAndTransformTrainData" >>
                 beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = (  # noqa: F841
                transform_fn
                | "WriteTransformFn" >>
                transform_fn_io.WriteTransformFn(output_dir))

            transformed_train_coder = ExampleProtoCoder(
                transformed_train_metadata.schema)
            _ = (  # noqa: F841
                transformed_train_data
                | "WriteTransformedTrainData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_train_output_dir, "part"),
                    coder=transformed_train_coder,
                    compression_type=compression_type,
                    file_name_suffix=".tfrecords"))
        else:
            if transform_fn_dir is None:
                raise ValueError(
                    "Either training_data or transform_fn_dir needs to be "
                    "provided")
            # load the previously saved transform_fn
            transform_fn = pipeline | transform_fn_io.ReadTransformFn(
                transform_fn_dir)

        if evaluation_data is not None:
            # if evaluation_data exists, apply the transform_fn to it
            raw_eval_data = (
                pipeline
                | "ReadEvalData" >> tfrecordio.ReadFromTFRecord(
                    evaluation_data, coder=raw_data_coder))

            (transformed_eval_data, transformed_eval_metadata) = (
                ((raw_eval_data, raw_data_metadata), transform_fn)
                | "TransformEvalData" >> beam_impl.TransformDataset())

            transformed_eval_coder = ExampleProtoCoder(
                transformed_eval_metadata.schema)
            _ = (  # noqa: F841
                transformed_eval_data
                | "WriteTransformedEvalData" >> tfrecordio.WriteToTFRecord(
                    os.path.join(transformed_eval_output_dir, "part"),
                    coder=transformed_eval_coder,
                    compression_type=compression_type,
                    file_name_suffix=".tfrecords"))

    result = pipeline.run().wait_until_finish()
    return result
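# A hypothetical invocation of the tftransform() pipeline above. Every path
# and flag below is a placeholder; only the keyword arguments mirror the real
# signature, and the identity preprocessing function is an assumption for
# illustration.
def identity_preprocessing_fn(inputs):
    # Pass features through unchanged; a real pipeline would transform them.
    return dict(inputs)


state = tftransform(
    pipeline_args=["--runner=DirectRunner"],
    temp_location="/tmp/tft-temp",
    schema_file="/path/to/schema.txt",
    output_dir="/path/to/output",
    preprocessing_fn=identity_preprocessing_fn,
    training_data="/path/to/train-*.tfrecords",
    evaluation_data="/path/to/eval-*.tfrecords")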
def transform_data(shuffled_train_filepattern, shuffled_test_filepattern,
                   transformed_train_filebase, transformed_test_filebase,
                   transformed_metadata_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    shuffled_train_filepattern: Base filename for shuffled training data shards
    shuffled_test_filepattern: Base filename for shuffled test data shards
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      train_data = (
          pipeline
          | 'ReadTrain' >> tfrecordio.ReadFromTFRecord(
              shuffled_train_filepattern,
              coder=example_proto_coder.ExampleProtoCoder(
                  RAW_DATA_METADATA.schema)))

      test_data = (
          pipeline
          | 'ReadTest' >> tfrecordio.ReadFromTFRecord(
              shuffled_test_filepattern,
              coder=example_proto_coder.ExampleProtoCoder(
                  RAW_DATA_METADATA.schema)))

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_COLUMN]

        review_tokens = tf.string_split(review, DELIMITERS)
        review_indices = tft.string_to_int(review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by string_to_int.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_COLUMN: review_bow_indices,
            REVIEW_WEIGHT: review_weight,
            LABEL_COLUMN: inputs[LABEL_COLUMN]
        }

      (transformed_train_data, transformed_metadata), transform_fn = (
          (train_data, RAW_DATA_METADATA)
          | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
              preprocessing_fn))

      transformed_test_data, _ = (
          ((test_data, RAW_DATA_METADATA), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      _ = (transformed_train_data
           | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
               transformed_train_filebase,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_test_data
           | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
               transformed_test_filebase,
               coder=example_proto_coder.ExampleProtoCoder(
                   transformed_metadata.schema)))

      _ = (transformed_metadata
           | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
               transformed_metadata_dir, pipeline=pipeline))
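# A hypothetical call to the transform_data() variant above; all file patterns
# and output locations are placeholders for wherever the shuffled TFRecords
# actually live.
transform_data(
    shuffled_train_filepattern='/path/to/shuffled_train-*',
    shuffled_test_filepattern='/path/to/shuffled_test-*',
    transformed_train_filebase='/path/to/transformed_train',
    transformed_test_filebase='/path/to/transformed_test',
    transformed_metadata_dir='/path/to/transformed_metadata')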