def main():
    args = parse_args()
    pipeline_options = PipelineOptions(**vars(args))
    pipeline = beam.Pipeline(options=pipeline_options)

    train_files = glob.glob("./mnist_images/train" + os.sep + "*.jpg")
    eval_files = glob.glob("./mnist_images/eval" + os.sep + "*.jpg")

    _ = (
        pipeline
        | 'ListTrainFiles' >> beam.Create(train_files)
        | 'TrainReadFiles' >> beam.Map(read_from_path)
        | 'WriteToTrainTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=path.join("mnist_tfrecords", "train", "train"),
            compression_type=beam.io.filesystems.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))

    _ = (
        pipeline
        | 'ListEvalFiles' >> beam.Create(eval_files)
        | 'EvalReadFiles' >> beam.Map(read_from_path)
        | 'WriteToEvalTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=path.join("mnist_tfrecords", "eval", "eval"),
            compression_type=beam.io.filesystems.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))

    pipeline.run().wait_until_finish()
@beam.ptransform_fn
def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                           | 'Transform' >> tft.TransformDataset())
    coder = coders.ExampleProtoCoder(metadata.schema)
    _ = (dataset
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir, path), file_name_suffix='.tfrecord.gz'))
def process(self, element):
    element_spec = self._feature_spec.copy()
    for identity in self._optional_field_names:
        if identity not in element:
            del element_spec[identity]
    element_schema = Schema(element_spec)
    coder = coders.ExampleProtoCoder(element_schema)
    encoded_element = coder.encode(element)
    yield encoded_element
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline."""
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        logging.info('Start running in the cloud')
        options = {
            'runner': 'DataflowRunner',
            'job_name': ('mlengine-boilerplate-{}'.format(
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
            'staging_location': os.path.join(args.output_dir, 'staging'),
            'temp_location': os.path.join(args.output_dir, 'tmp'),
            'project': args.project_id,
            'zone': 'europe-west1-d',
            'autoscaling_algorithm': 'THROUGHPUT_BASED',
            'save_main_session': True,
            'setup_file': './setup.py',
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        print(pipeline_options)
    else:
        pipeline_options = None

    train_coder = coders.ExampleProtoCoder(schema)

    p = beam.Pipeline(options=pipeline_options)
    examples = (p
                | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '/*',
                                                     skip_header_lines=1)
                | 'buildExamples' >> beam.FlatMap(buildExample))

    examples_split = examples | beam.Partition(partition_fn, 3)
    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    for part, examples in example_dict.items():
        _ = examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(args.output_dir, part + '_examples'),
            compression_type=filesystem.CompressionTypes.GZIP,
            coder=train_coder,
            file_name_suffix='.gz')

    p.run()
def main(argv=None): """Run preprocessing as a Dataflow pipeline. Args: argv (list): list of arguments """ logging.info('running main') args = parse_arguments(sys.argv if argv is None else argv) if args.cloud: pipeline_options = get_cloud_pipeline_options(args.project_id, args.output_dir) else: pipeline_options = None pipeline = beam.Pipeline(options=pipeline_options) all_labels = (pipeline | 'ReadDictionary' >> beam.io.ReadFromText( 'gs://cloud-ml-data/img/flower_photos/dict.txt', strip_trailing_newlines=True)) examples = (pipeline | 'ReadData' >> beam.io.ReadFromText( 'gs://cloud-ml-data/img/flower_photos/train_set.csv', strip_trailing_newlines=True) | 'Split' >> beam.FlatMap(select_files) | 'OneHotEncoding' >> beam.FlatMap( one_hot_encoding, beam.pvalue.AsIter(all_labels)) | 'ReadImage' >> beam.FlatMap(process_image) | 'BuildExamples' >> beam.FlatMap(build_example)) examples_split = examples | beam.Partition(partition_fn, 3) example_dict = { 'train': examples_split[0], 'validation': examples_split[1], 'test': examples_split[2] } train_coder = coders.ExampleProtoCoder(schema) for part, examples in example_dict.items(): examples | part + '_writeExamples' >> \ beam.io.tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join( args.output_dir, part + '_examples'), compression_type=beam.io.filesystem.CompressionTypes.GZIP, coder=train_coder, file_name_suffix='.tfrecord.gz') logging.info('running pipeline') pipeline.run().wait_until_finish()
def preprocess(p, args):
    """Run preprocessing as pipeline."""
    train_eval_schema = _make_input_schema()
    train_eval_metadata = dataset_metadata.DatasetMetadata(
        schema=train_eval_schema)

    _ = (train_eval_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(args.output_dir, constants.RAW_METADATA_DIR),
             pipeline=p))

    train_eval_data = (p
                       | 'ReadDataFromBQ' >> beam.io.Read(beam.io.BigQuerySource(
                           query=_get_query('bigquery-public-data', 'samples',
                                            'gsod'),
                           use_standard_sql=True)))

    train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
        DataValidator())

    (transformed_train_eval_data, transformed_train_eval_metadata), transform_fn = (
        (train_eval_data, train_eval_metadata)
        | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
            get_preprocessing_fn()))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

    transformed_train_eval_coder = coders.ExampleProtoCoder(
        transformed_train_eval_metadata.schema)

    transformed_train_data, transformed_eval_data = (
        transformed_train_eval_data
        | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

    (transformed_train_data
     | 'SerializeTrainExamples' >> beam.Map(transformed_train_eval_coder.encode)
     | 'WriteTraining' >> beam.io.WriteToTFRecord(
         os.path.join(args.output_dir,
                      constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
         file_name_suffix=constants.DATA_FILE_SUFFIX))

    (transformed_eval_data
     | 'SerializeEvalExamples' >> beam.Map(transformed_train_eval_coder.encode)
     | 'WriteEval' >> beam.io.WriteToTFRecord(
         os.path.join(args.output_dir,
                      constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
         file_name_suffix=constants.DATA_FILE_SUFFIX))
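# A minimal sketch (hypothetical helper, not from the snippet above) of what
# get_partition_fn(0.7) could look like. beam.Partition calls the function as
# fn(element, num_partitions) and routes the element to the returned index,
# so returning 0 with probability 0.7 gives an approximate 70/30 split.
import random

def get_partition_fn(train_fraction):
    def partition_fn(example, num_partitions):
        # Index 0 -> train, index 1 -> eval; num_partitions is 2 here.
        return 0 if random.random() < train_fraction else 1
    return partition_fn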
def write_tfrecord(p, prefix, output_dir, metadata):
    """Shuffles and writes the given pCollection as a TFRecord.

    Args:
        p: a pCollection.
        prefix: prefix for the location the tf-record will be written to.
        output_dir: the directory or bucket to write the data to.
        metadata: metadata of input data from tft_beam.TransformDataset(...)
    """
    coder = coders.ExampleProtoCoder(metadata.schema)
    prefix = str(prefix).lower()
    (p
     | 'ShuffleData' >> shuffle()  # pylint: disable=no-value-for-parameter
     | 'WriteTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(output_dir, 'data', prefix, prefix),
         coder=coder,
         file_name_suffix='.tfrecord'))
@beam.ptransform_fn
def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                           | 'Transform' >> tft.TransformDataset())
    # Alternative coders that were tried here and left for reference:
    # coder = criteo.make_csv_coder(input_schema, delimiter)
    # column_names = ['clicked']
    # for name in INTEGER_COLUMN_NAMES:
    #     column_names.append(name)
    # for name in CATEGORICAL_COLUMN_NAMES:
    #     column_names.append(name)
    # coder = coders.CsvCoder(column_names, metadata.schema, delimiter=",")
    coder = coders.ExampleProtoCoder(metadata.schema)
    _ = (dataset
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir, path), file_name_suffix='.tfrecord.gz'))
def __init__(self, feature_spec, optional_field_names,
             rule_optional_fn=lambda x: x < 0):
    """Initialises a TF-Record decoder.

    Args:
        feature_spec: Dictionary from feature names to one of
            `FixedLenFeature`, `SparseFeature` or `VarLenFeature`. It contains
            all the features to parse (including optional ones).
        optional_field_names: list of optional fields.
        rule_optional_fn: function that takes the value of an optional field
            and returns True if the value is indicative of a default value
            (e.g. resulting from the default value of parsing
            FixedLenFeature). The current code requires that all
            optional_field_names share the same rule_optional_fn.
    """
    self._schema = Schema(feature_spec)
    self._coder = coders.ExampleProtoCoder(self._schema)
    self._optional_field_names = optional_field_names
    self._rule_optional_fn = rule_optional_fn
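# A hedged usage sketch for the decoder above. The class name
# OptionalFieldExampleCoder and the feature names are hypothetical; the point
# is that 'weight' is parsed as a FixedLenFeature with default -1.0, so the
# default rule_optional_fn (x < 0) treats a negative value as "field absent".
import tensorflow as tf

feature_spec = {
    'id': tf.io.FixedLenFeature([], tf.int64),
    'weight': tf.io.FixedLenFeature([], tf.float32, default_value=-1.0),
}
decoder = OptionalFieldExampleCoder(feature_spec,
                                    optional_field_names=['weight'])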
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.

    Args:
        argv (list): list of arguments
    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options(args.project_id,
                                                      args.output_dir)
    else:
        pipeline_options = None

    pipeline = beam.Pipeline(options=pipeline_options)

    examples = (
        pipeline
        # | 'ReadData' >> beam.Create(open('data/test.csv')
        #                             .readlines()[1:])
        | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '*',
                                             skip_header_lines=1)
        | 'BuildExamples' >> beam.FlatMap(build_example))

    examples_split = examples | beam.Partition(partition_fn, 3)
    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    for part, examples in example_dict.items():
        examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(TFRECORD_DIR, part + '_examples'),
            compression_type=filesystem.CompressionTypes.GZIP,
            coder=coders.ExampleProtoCoder(schema),
            file_name_suffix='.tfrecord.gz')

    pipeline.run().wait_until_finish()
def WriteOutput(p, prefix, output_dir, feature_spec, plain_text=False):
    """Writes the given pCollection as a TF-Record.

    Args:
        p: a pCollection.
        prefix: prefix for the location the tf-record will be written to.
        output_dir: the directory or bucket to write the data to.
        feature_spec: the feature spec of the tf-record to be written.
        plain_text: if true, write the output as plain text instead.
    """
    path = os.path.join(output_dir, prefix)
    shuffled = p | "ShuffleData" >> Shuffle()  # pylint: disable=no-value-for-parameter

    if plain_text:
        shuffled | "WriteToText" >> beam.io.WriteToText(
            path, file_name_suffix=".txt")
        return

    schema = dataset_schema.from_feature_spec(feature_spec)
    coder = coders.ExampleProtoCoder(schema)
    shuffled | "WriteTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
        path, coder=coder, file_name_suffix=".tfrecord")
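# A hedged usage sketch for WriteOutput above, assuming the module's Shuffle()
# and dataset_schema imports are available; the bucket path and feature names
# are made up for illustration.
import apache_beam as beam
import tensorflow as tf

with beam.Pipeline() as p:
    examples = p | beam.Create([{'label': 1, 'text': b'hello'}])
    WriteOutput(examples, 'train', 'gs://my-bucket/output',
                {'label': tf.io.FixedLenFeature([], tf.int64),
                 'text': tf.io.FixedLenFeature([], tf.string)})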
def WriteTFRecord(p, prefix, output_dir, metadata):
    """Shuffles and writes the given pCollection as a TF-Record.

    Args:
        p: a pCollection.
        prefix: prefix for the location the tf-record will be written to.
        output_dir: the directory or bucket to write the data to.
        metadata: metadata of the input data, used to build the coder.
    """
    coder = coders.ExampleProtoCoder(metadata.schema)
    prefix = str(prefix).lower()
    out_dir = os.path.join(output_dir, 'data', prefix, prefix)
    # Examples are large, so ensure the TFRecord shards stay relatively small.
    num_shards = 60 if prefix == 'train' else 20
    logging.warning("writing TFRecords to " + out_dir)
    _ = (
        p
        | "ShuffleData" >> shuffle()  # pylint: disable=no-value-for-parameter
        | "WriteTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
            out_dir,
            coder=coder,
            num_shards=num_shards,
            file_name_suffix=".tfrecord"))
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
    """Run pre-processing step as a pipeline.

    Args:
        pipeline: beam pipeline
        training_data: file paths to input csv files.
        eval_data: file paths to input csv files.
        predict_data: file paths to input csv files.
        output_dir: file path to where to write all the output files.
        frequency_threshold: frequency threshold to use for categorical values.
        delimiter: the column delimiter for the CSV format.
    """
    # 1) The schema can be either defined in-memory or read from a
    # configuration file; in this case we are creating the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a
    # dictionary of key -> tensor_proto with the appropriate type derived
    # from the input_schema.
    coder = criteo.make_csv_coder(input_schema, delimiter)

    # 3) Read from text using the coder.
    train_data = (pipeline
                  | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                  | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (pipeline
                     | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                     | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    transform_fn = ((train_data, input_metadata)
                    | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, which are given by
    # path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    @beam.ptransform_fn
    def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
        pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                               | 'Transform' >> tft.TransformDataset())
        # Alternative coders that were tried here and left for reference:
        # coder = criteo.make_csv_coder(input_schema, delimiter)
        # column_names = ['clicked']
        # for name in INTEGER_COLUMN_NAMES:
        #     column_names.append(name)
        # for name in CATEGORICAL_COLUMN_NAMES:
        #     column_names.append(name)
        # coder = coders.CsvCoder(column_names, metadata.schema, delimiter=",")
        coder = coders.ExampleProtoCoder(metadata.schema)
        _ = (dataset
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(output_dir, path),
                 file_name_suffix='.tfrecord.gz'))

    _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)

    _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX)

    # TODO(b/35300113) Remember to eventually also save the statistics.

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit
            # serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Run pre-processing step as a pipeline.

    Args:
        pipeline: beam pipeline
        training_data: file paths to input csv files.
        eval_data: file paths to input csv files.
        predict_data: file paths to input csv files.
        output_dir: file path to where to write all the output files.
        frequency_threshold: frequency threshold to use for categorical values.
    """
    # 1) The schema can be either defined in-memory or read from a
    # configuration file; in this case we are creating the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a
    # dictionary of key -> tensor_proto with the appropriate type derived
    # from the input_schema.
    coder = criteo.make_tsv_coder(input_schema)

    # 3) Read from text using the coder.
    train_data = (
        pipeline
        | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
        | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (
        pipeline
        | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
        | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    # TODO(b/33688220) should the transform functions take shuffle as an
    # optional argument?
    # TODO(b/33688275) Should the transform functions have more user friendly
    # names?
    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
            preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, which are given by
    # path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

    # TODO(b/34231369) Remember to eventually also save the statistics.

    (evaluate_dataset, evaluate_metadata) = (
        ((evaluate_data, input_metadata), transform_fn)
        | 'TransformEval' >> tft.TransformDataset())

    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    _ = (train_dataset
         | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
         | 'WriteTraining' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    _ = (evaluate_dataset
         | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
         | 'WriteEval' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        tsv_coder = criteo.make_tsv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(tsv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit
            # serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
def preprocess(pipeline, args):
    input_metadata = metadata_io.read_metadata(
        os.path.join(args.analyze_output_dir, RAW_METADATA_DIR))

    schema = json.loads(file_io.read_file_to_string(
        os.path.join(args.analyze_output_dir, SCHEMA_FILE)).decode())
    features = json.loads(file_io.read_file_to_string(
        os.path.join(args.analyze_output_dir, FEATURES_FILE)).decode())

    column_names = [col['name'] for col in schema]

    exclude_outputs = None
    if not args.target:
        for name, transform in six.iteritems(features):
            if transform['transform'] == TARGET_TRANSFORM:
                target_name = name
                column_names.remove(target_name)
                exclude_outputs = [target_name]
                del input_metadata.schema.column_schemas[target_name]
                break

    if args.csv_file_pattern:
        coder = coders.CsvCoder(column_names, input_metadata.schema,
                                delimiter=',')
        raw_data = (
            pipeline
            | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
            | 'ParseCsvData' >> beam.Map(coder.decode))
    else:
        columns = ', '.join(column_names)
        query = 'SELECT {columns} FROM `{table}`'.format(
            columns=columns, table=args.bigquery_table)
        raw_data = (
            pipeline
            | 'ReadBigQueryData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

    # Note that prepare_image_transforms does not make embeddings; it just
    # reads the image files and converts them to base64 strings.
    # tft.TransformDataset() will apply the saved model that makes the image
    # embeddings.
    image_columns = image_transform_columns(features)
    raw_data = (
        raw_data
        | 'PreprocessTransferredLearningTransformations' >> beam.Map(
            prepare_image_transforms, image_columns))

    if args.shuffle:
        raw_data = raw_data | 'ShuffleData' >> shuffle()

    transform_fn = (
        pipeline
        | 'ReadTransformFn' >> tft_beam_io.ReadTransformFn(
            args.analyze_output_dir))

    (transformed_data, transform_metadata) = (
        ((raw_data, input_metadata), transform_fn)
        | 'ApplyTensorflowPreprocessingGraph' >> tft.TransformDataset(
            exclude_outputs))

    tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
    _ = (transformed_data
         | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir, args.output_filename_prefix),
             file_name_suffix='.tfrecord.gz'))
def _make_proto_coder(schema):
    raw_feature_spec = _get_raw_feature_spec(schema)
    raw_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
    return tft_coders.ExampleProtoCoder(raw_schema)
def _make_proto_coder(schema):
    raw_feature_spec = _get_raw_feature_spec(schema)
    # Older tf.transform API: dataset_schema.from_feature_spec instead of
    # schema_utils.schema_from_feature_spec.
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    return tft_coders.ExampleProtoCoder(raw_schema)
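# A minimal, self-contained sketch of what the coders built above do:
# ExampleProtoCoder.encode turns a dict of feature values into a serialized
# tf.train.Example. The feature names and values here are made up for
# illustration.
import tensorflow as tf
from tensorflow_transform import coders as tft_coders
from tensorflow_transform.tf_metadata import schema_utils

feature_spec = {
    'age': tf.io.FixedLenFeature([], tf.int64),
    'name': tf.io.FixedLenFeature([], tf.string),
}
schema = schema_utils.schema_from_feature_spec(feature_spec)
coder = tft_coders.ExampleProtoCoder(schema)

serialized = coder.encode({'age': 29, 'name': b'ada'})
example = tf.train.Example.FromString(serialized)  # inspect the round trip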
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Run pre-processing step as a pipeline.

    Args:
        pipeline: beam pipeline
        training_data: the name of the table to train on.
        eval_data: the name of the table to evaluate on.
        predict_data: the name of the table to predict on.
        output_dir: file path to where to write all the output files.
        frequency_threshold: frequency threshold to use for categorical values.
    """
    work_dir = os.path.join(output_dir, path_constants.TEMP_DIR)

    # 1) The schema can be either defined in-memory or read from a
    # configuration file; in this case we are creating the schema in-memory.
    input_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    # TODO(b/33688220) should the transform functions take shuffle as an
    # optional argument?
    # TODO(b/33688275) Should the transform functions have more user friendly
    # names?
    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
            preprocessing_fn, work_dir))

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, which are given by
    # path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

    (evaluate_dataset, evaluate_metadata) = (
        ((evaluate_data, input_metadata), transform_fn)
        | 'TransformEval' >> tft.TransformDataset())

    # pylint: disable=expression-not-assigned
    # TODO(b/34231369) Remember to eventually also save the statistics and
    # the metadata.
    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    (train_dataset
     | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
     | 'WriteTraining' >> beam.io.WriteToTFRecord(
         os.path.join(output_dir,
                      path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
         file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    (evaluate_dataset
     | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
     | 'WriteEval' >> beam.io.WriteToTFRecord(
         os.path.join(output_dir,
                      path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
         file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = reddit.make_input_schema(mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)

        # TODO(b/35653662): Simplify once tf.transform 0.1.5 is released.
        def encode_predict_data(d):
            try:
                return predict_coder.encode(d)
            except Exception:  # pylint: disable=broad-except
                # Compatibility path for tf.transform < 0.1.5.
                return predict_coder.encode({
                    k: v.encode('utf-8') if isinstance(v, unicode) else v
                    for k, v in d.items()
                })

        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
            # TODO(b/35194257) Obviate the need for this explicit
            # serialization.
            | 'EncodePredictData' >> beam.Map(encode_predict_data))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
_ = (raw_metadata
     | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
         os.path.join(args.output_dir, 'raw_metadata'), pipeline))

preprocessing_fn = movielens.make_preprocessing_fn()
train_features_transformed, transform_fn = (
    (train_data, raw_metadata)
    | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
        preprocessing_fn))

eval_features_transformed = (
    ((eval_data, raw_metadata), transform_fn)
    | 'TransformEval' >> tft.TransformDataset())

train_dataset_transformed, train_metadata = train_features_transformed
training_coder = tft_coders.ExampleProtoCoder(train_metadata.schema)
_ = (train_dataset_transformed
     | 'EncodeTraining' >> beam.Map(training_coder.encode)
     | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
     | 'WriteTraining' >> beam.io.WriteToTFRecord(
         os.path.join(args.output_dir, 'features_train'),
         file_name_suffix='.tfrecord.gz'))

_ = (train_metadata
     | 'WriteTransformedMetadata' >> tft_beam_io.WriteMetadata(
         os.path.join(args.output_dir, 'transformed_metadata'), pipeline))

eval_dataset_transformed, eval_metadata = eval_features_transformed
eval_coder = tft_coders.ExampleProtoCoder(eval_metadata.schema)

prediction_schema = movielens.make_prediction_schema()
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir, path),
             file_name_suffix='.tfrecord.gz'))

_ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
    'features_train')
_ = eval_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
    'features_eval')

# TODO(b/35300113) Remember to eventually also save the statistics.

# Save files for online and batch prediction.
prediction_schema = movielens.make_prediction_schema()
prediction_coder = tft_coders.ExampleProtoCoder(prediction_schema)
prediction_data = (
    eval_data
    | 'EncodePrediction' >> beam.Map(prediction_coder.encode))
_ = (prediction_data
     | 'EncodePredictionAsB64Json' >> beam.Map(_encode_as_b64_json)
     | 'WritePredictDataAsText' >> beam.io.WriteToText(
         os.path.join(args.output_dir, 'features_predict'),
         file_name_suffix='.txt'))
_ = (prediction_data
     | 'WritePredictDataAsTfRecord' >> beam.io.WriteToTFRecord(
         os.path.join(args.output_dir, 'features_predict'),
         file_name_suffix='.tfrecord.gz'))

def _encode_as_b64_json(serialized_example):
        for key in NUMERIC_FEATURE_KEYS
    })
    feature_spec.update({
        key: tf.io.FixedLenFeature([], tf.int64)
        for key in NUMERIC_FEATURE_KEYS_INT
    })
    raw_data_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    return raw_data_metadata

RAW_DATA_METADATA = _create_raw_metadata()
csv_coder_ = csv_coder.CsvCoder(ORDERED_COLUMNS, RAW_DATA_METADATA.schema)
proto_coder = tft_coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)

def cus_input(one_line):
    one_example = csv_coder_.decode(one_line)
    serialized_example = proto_coder.encode(one_example)
    json_example = {"inputs": {
        "b64": base64.b64encode(serialized_example).decode()}}
    return json_example

def predict_json(request):
    """You need to hardcode project, model and version."""
    project = 'eeeooosss'
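# A hedged usage sketch for cus_input above: given one raw CSV line (the
# content below is made up and must match ORDERED_COLUMNS), it returns the
# JSON payload format expected by an online-prediction model exported with a
# serialized-tf.Example serving input.
payload = cus_input('39,State-gov,77516')
# -> {"inputs": {"b64": "<base64-encoded serialized tf.Example>"}}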
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
    """Run pre-processing step as a pipeline.

    Args:
        pipeline: beam pipeline
        training_data: the name of the table to train on.
        eval_data: the name of the table to evaluate on.
        predict_data: the name of the table to predict on.
        output_dir: file path to where to write all the output files.
        frequency_threshold: frequency threshold to use for categorical values.
    """
    # 1) The schema can be either defined in-memory or read from a
    # configuration file; in this case we are creating the schema in-memory.
    input_schema = reddit.make_input_schema()

    # 2) Read from BigQuery or from CSV.
    train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
    evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
            preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed
    # subdirectories of output_dir, which are given by
    # path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    (evaluate_dataset, evaluate_metadata) = (
        ((evaluate_data, input_metadata), transform_fn)
        | 'TransformEval' >> tft.TransformDataset())

    # pylint: disable=expression-not-assigned
    # TODO(b/34231369) Remember to eventually also save the statistics and
    # the metadata.
    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    (train_dataset
     | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
     | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
     | 'WriteTraining' >> beam.io.WriteToTFRecord(
         os.path.join(output_dir,
                      path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
         file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    (evaluate_dataset
     | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
     | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
     | 'WriteEval' >> beam.io.WriteToTFRecord(
         os.path.join(output_dir,
                      path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
         file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = reddit.make_input_schema(mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
            # TODO(b/35194257) Obviate the need for this explicit
            # serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))