def build_pipeline(p, flags):
  """Sets up the Apache Beam pipeline for execution."""
  raw_data = (
      p
      | 'QueryTable' >> beam.io.Read(
          beam.io.BigQuerySource(
              query=query.get_query(flags.bq_table),
              project=flags.project_id,
              use_standard_sql=True))
      # Omit the 'Generate data' step if working with real data.
      | 'Generate data' >> beam.Map(_generate_fake_data)
      | 'Extract lifetime' >> beam.Map(append_lifetime_duration)
      | 'Extract label' >> beam.Map(append_label)
      | 'Generate label array' >> beam.Map(combine_censorship_duration))

  raw_train, raw_eval, raw_test = (
      raw_data
      | 'RandomlySplitData' >> randomly_split(
          train_size=.7, validation_size=.15, test_size=.15))

  raw_metadata = features.get_raw_dataset_metadata()
  preprocess_fn = features.preprocess_fn

  transform_fn = ((raw_train, raw_metadata)
                  | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(preprocess_fn))
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(flags.output_dir))

  for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                ('Test', raw_test)]:
    transform_label = 'Transform{}'.format(dataset_type)
    t, metadata = (((dataset, raw_metadata), transform_fn)
                   | transform_label >> tft_beam.TransformDataset())

    if dataset_type == 'Train':
      _ = (metadata
           | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
               os.path.join(flags.output_dir, 'transformed_metadata'),
               pipeline=p))

    write_label = 'Write{}TFRecord'.format(dataset_type)
    _ = (t | write_label >> write_tfrecord(dataset_type, flags.output_dir,
                                           metadata))
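# randomly_split is a project helper that is not shown above. A minimal
# sketch, assuming it partitions each record into train/validation/test
# buckets by a per-element random draw; the implementation is illustrative,
# not the project's actual one.
import random

import apache_beam as beam


@beam.ptransform_fn
def randomly_split(pcoll, train_size, validation_size, test_size):
  """Randomly splits a PCollection into train/validation/test PCollections."""
  if abs(train_size + validation_size + test_size - 1.0) > 1e-6:
    raise ValueError('Split sizes must sum to 1.')

  def partition_fn(unused_element, unused_num_partitions):
    # 0 -> train, 1 -> validation, 2 -> test.
    r = random.random()
    if r < train_size:
      return 0
    if r < train_size + validation_size:
      return 1
    return 2

  return pcoll | 'Partition' >> beam.Partition(partition_fn, 3)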
def preprocess(p, args):
  """Runs preprocessing as a pipeline."""
  train_eval_schema = _make_input_schema()
  train_eval_metadata = dataset_metadata.DatasetMetadata(
      schema=train_eval_schema)

  _ = (train_eval_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, constants.RAW_METADATA_DIR),
           pipeline=p))

  train_eval_data = (p
                     | 'ReadDataFromBQ' >> beam.io.Read(
                         beam.io.BigQuerySource(
                             query=_get_query('bigquery-public-data',
                                              'samples', 'gsod'),
                             use_standard_sql=True)))
  train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
      DataValidator())

  (transformed_train_eval_data,
   transformed_train_eval_metadata), transform_fn = (
       (train_eval_data, train_eval_metadata)
       | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
           get_preprocessing_fn()))

  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

  transformed_train_eval_coder = coders.ExampleProtoCoder(
      transformed_train_eval_metadata.schema)

  transformed_train_data, transformed_eval_data = (
      transformed_train_eval_data
      | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

  (transformed_train_data
   | 'SerializeTrainExamples' >> beam.Map(transformed_train_eval_coder.encode)
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(args.output_dir,
                    constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix=constants.DATA_FILE_SUFFIX))

  (transformed_eval_data
   | 'SerializeEvalExamples' >> beam.Map(transformed_train_eval_coder.encode)
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(args.output_dir,
                    constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix=constants.DATA_FILE_SUFFIX))
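# get_partition_fn is assumed to return a callable compatible with
# beam.Partition. A minimal sketch, assuming a per-element random train/eval
# split at the given fraction; illustrative, not the actual project helper.
import random


def get_partition_fn(train_fraction):
  """Returns a partition_fn routing ~train_fraction of elements to shard 0."""

  def partition_fn(unused_element, unused_num_partitions):
    # 0 -> train shard, 1 -> eval shard.
    return 0 if random.random() < train_fraction else 1

  return partition_fn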
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
  """Runs the pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline.
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
    delimiter: the column delimiter for the CSV format.
  """
  # 1) The schema can be either defined in-memory or read from a configuration
  # file; in this case we create the schema in-memory.
  input_schema = criteo.make_input_schema()

  # 2) Configure the coder to map the source file column names to a dictionary
  # of key -> tensor_proto with the appropriate type derived from the
  # input_schema.
  coder = criteo.make_csv_coder(input_schema, delimiter)

  # 3) Read from text using the coder.
  train_data = (
      pipeline
      | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
      | 'ParseTrainingCsv' >> beam.Map(coder.decode))

  evaluate_data = (
      pipeline
      | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
      | 'ParseEvalCsv' >> beam.Map(coder.decode))

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  # TODO(b/34231369) Remember to eventually also save the statistics.
  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  _ = (
      train_dataset
      | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
      | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
      | 'WriteTraining' >> beam.io.WriteToTFRecord(
          os.path.join(output_dir,
                       path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
          file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  _ = (
      evaluate_dataset
      | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
      | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
      | 'WriteEval' >> beam.io.WriteToTFRecord(
          os.path.join(output_dir,
                       path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
          file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = criteo.make_input_schema(mode=predict_mode)
    csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
        | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
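# _Shuffle is assumed to be a small PTransform that randomizes record order
# before writing, so downstream training is not biased by input ordering.
# A minimal sketch of that pattern: pair each element with a random key,
# group by the key, then discard it.
import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by grouping on random keys."""
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))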
      movie_rating_history=args.movie_rating_history)

  movies_sideinput = beam.pvalue.AsDict(movies_data)
  eval_data |= 'BuildEvalFeatures' >> beam.ParDo(
      BuildExampleFn(args.random_seed),
      movies_data=movies_sideinput,
      rating_threshold=args.eval_score_threshold,
      is_ranking_problem=(args.eval_type == RANKING),
      is_train=False,
      num_ranking_candidate_movie_ids=args.num_ranking_candidate_movie_ids)

  # TFTransform-based preprocessing.
  raw_metadata = dataset_metadata.DatasetMetadata(
      schema=movielens.make_examples_schema())
  _ = (raw_metadata
       | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'raw_metadata'), pipeline))

  preprocessing_fn = movielens.make_preprocessing_fn()
  train_features_transformed, transform_fn = (
      (train_data, raw_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  eval_features_transformed = (
      ((eval_data, raw_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  train_dataset_transformed, train_metadata = train_features_transformed
  training_coder = tft_coders.ExampleProtoCoder(train_metadata.schema)

  _ = (train_dataset_transformed
       | 'EncodeTraining' >> beam.Map(training_coder.encode)
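# BuildExampleFn is not shown in this excerpt. A minimal sketch of the
# side-input pattern it relies on: beam.pvalue.AsDict materializes
# movies_data as a plain dict readable inside process(). The field names and
# join logic below are hypothetical.
import apache_beam as beam


class BuildExampleFnSketch(beam.DoFn):
  """Illustrative DoFn joining ratings against a movies dict side input."""

  def __init__(self, random_seed):
    self._random_seed = random_seed

  def process(self, rating, movies_data, **unused_kwargs):
    # movies_data arrives as the dict built from the AsDict side input.
    movie = movies_data.get(rating['movie_id'])
    if movie is not None:
      yield {'movie_id': rating['movie_id'], 'rating': rating['rating']}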
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Runs the pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline.
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  # 1) The schema can be either defined in-memory or read from a configuration
  # file; in this case we create the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.
  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> _ReadData(predict_data, mode=predict_mode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(
                 output_dir,
                 path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
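# _encode_as_b64_json is assumed to wrap each serialized tf.Example in the
# {"b64": ...} JSON envelope that batch prediction services accept. A minimal
# sketch of that encoding:
import base64
import json


def _encode_as_b64_json(serialized_example):
  """Encodes a serialized tf.Example as a base64 JSON record."""
  return json.dumps(
      {'b64': base64.b64encode(serialized_example).decode('utf-8')})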
def preprocess(in_test_mode):
  import os
  import os.path
  import tempfile
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io

  job_name = ('preprocess-taxi-features' + '-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
  if in_test_mode:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    EVERY_N = 100000
  else:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{0}/taxifare/preproc_tft/'.format(BUCKET)
    import subprocess
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
    EVERY_N = 10000

  options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'max_num_workers': 24,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'requirements_file': 'requirements.txt',
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  if in_test_mode:
    RUNNER = 'DirectRunner'
  else:
    RUNNER = 'DataflowRunner'

  # Set up the raw data schema: string, float and int columns.
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'dayofweek,key'.split(',')
  }
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for colname in
      'fare_amount,pickuplon,pickuplat,dropofflon,dropofflat'.split(',')
  })
  raw_data_schema.update({
      colname: dataset_schema.ColumnSchema(
          tf.int64, [], dataset_schema.FixedColumnRepresentation())
      for colname in 'hourofday,passengers'.split(',')
  })
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(raw_data_schema))

  # Run Beam.
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # Save the raw data metadata.
      _ = (raw_data_metadata
           | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
               os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
               pipeline=p))

      # Analyze and transform the training data.
      raw_data = (p
                  | 'train_read' >> beam.io.Read(beam.io.BigQuerySource(
                      query=create_query(1, EVERY_N), use_standard_sql=True))
                  | 'train_filter' >> beam.Filter(is_valid))
      raw_dataset = (raw_data, raw_data_metadata)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
      transformed_data, transformed_metadata = transformed_dataset
      _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'train'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Transform the eval data with the transform_fn learned on training data.
      raw_test_data = (p
                       | 'eval_read' >> beam.io.Read(beam.io.BigQuerySource(
                           query=create_query(2, EVERY_N),
                           use_standard_sql=True))
                       | 'eval_filter' >> beam.Filter(is_valid))
      raw_test_dataset = (raw_test_data, raw_data_metadata)
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_test_data, _ = transformed_test_dataset
      _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
          os.path.join(OUTPUT_DIR, 'eval'),
          file_name_suffix='.gz',
          coder=example_proto_coder.ExampleProtoCoder(
              transformed_metadata.schema))

      # Save the transform function.
      _ = (transform_fn
           | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn(
               os.path.join(OUTPUT_DIR, 'metadata')))
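# is_valid is assumed to be a row-level sanity filter applied to the BigQuery
# output above. A minimal sketch with hypothetical bounds for NYC taxi data;
# the real predicate may check different fields and ranges.
def is_valid(inputs):
  """Returns True for rows whose fields exist and fall in plausible ranges."""
  try:
    return (inputs['fare_amount'] > 0 and
            inputs['passengers'] > 0 and
            -75.0 < inputs['pickuplon'] < -73.0 and
            40.0 < inputs['pickuplat'] < 42.0)
  except (KeyError, TypeError):
    return False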
def run(flags, pipeline_args):
  """Runs the Apache Beam pipeline that generates TFRecords for survival analysis."""
  options = PipelineOptions(flags=[], **pipeline_args)
  options.view_as(WorkerOptions).machine_type = flags.machine_type
  temp_dir = os.path.join(flags.output_dir, 'tmp')
  runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

  files = tf.gfile.Glob(flags.input_dir + '*')
  if not flags.cloud:
    # When running locally for testing, process only a few files.
    files = files[0:20]
  logging.warning('Number of files: %d', len(files))

  labels = get_labels_array(
      'gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv')

  with beam.Pipeline(runner, options=options) as p:
    with tft_beam.Context(temp_dir=temp_dir):
      input_metadata = dataset_metadata.DatasetMetadata(
          dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

      filenames = p | 'Create filenames' >> beam.Create(files)
      nii = filenames | 'Read NII' >> beam.Map(read_nii)
      nii_with_labels = (
          nii | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

      raw_train, raw_eval, raw_test = (
          nii_with_labels
          | 'RandomlySplitData' >> randomly_split(
              train_size=.7, validation_size=.15, test_size=.15))

      raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(lambda x: x[1])
      raw_eval = raw_eval | 'FlattenEval' >> beam.FlatMap(lambda x: x[1])
      raw_test = raw_test | 'FlattenTest' >> beam.FlatMap(lambda x: x[1])

      _ = raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

      # A single analyze pass over the training data produces the transform_fn.
      transform_fn = (
          (raw_train, input_metadata)
          | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
      _ = (transform_fn
           | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
               flags.output_dir))

      for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                    ('Predict', raw_test)]:
        transform_label = 'Transform{}'.format(dataset_type)
        t, metadata = (((dataset, input_metadata), transform_fn)
                       | transform_label >> tft_beam.TransformDataset())

        if dataset_type == 'Train':
          _ = (metadata
               | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                   os.path.join(flags.output_dir, 'transformed_metadata'),
                   pipeline=p))

        write_label = 'Write{}TFRecord'.format(dataset_type)
        _ = t | write_label >> WriteTFRecord(dataset_type, flags.output_dir,
                                             metadata)
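# WriteTFRecord is assumed to serialize transformed examples with an
# ExampleProtoCoder and write sharded TFRecord files. A minimal sketch,
# assuming metadata carries the transformed schema and that files are named
# after the dataset type; illustrative, not the project's actual transform.
import os

import apache_beam as beam
from tensorflow_transform import coders as tft_coders


@beam.ptransform_fn
def WriteTFRecord(pcoll, dataset_type, output_dir, metadata):  # pylint: disable=invalid-name
  """Encodes examples with the transformed schema and writes TFRecords."""
  coder = tft_coders.ExampleProtoCoder(metadata.schema)
  return (pcoll
          | 'Encode' >> beam.Map(coder.encode)
          | 'Write' >> beam.io.WriteToTFRecord(
              os.path.join(output_dir, dataset_type.lower()),
              file_name_suffix='.tfrecord.gz'))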