def preprocess(pipeline, training_data, eval_data, predict_data, output_dir):
  """Reads input files, runs ml.Preprocess, and writes preprocessed output.

  Args:
    pipeline: beam pipeline.
    training_data, eval_data, predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.

  Returns:
    Metadata and preprocessed features as pcollections.
  """
  feature_set = iris.IrisFeatures()

  coder_with_target = io.CsvCoder.from_feature_set(feature_set,
                                                   feature_set.csv_columns)
  coder_without_target = io.CsvCoder.from_feature_set(feature_set,
                                                      feature_set.csv_columns,
                                                      has_target_columns=False)

  train = (
      pipeline
      | 'ReadTrainingData' >> beam.io.textio.ReadFromText(
          training_data, coder=coder_with_target))
  evaluate = (
      pipeline
      | 'ReadEvalData' >> beam.io.textio.ReadFromText(
          eval_data, coder=coder_with_target))
  predict = (
      pipeline
      | 'ReadPredictData' >> beam.io.textio.ReadFromText(
          predict_data, coder=coder_without_target))

  # TODO(b/32726166) Update input_format and format_metadata to read these
  # values directly from the coder.
  (metadata, train_features, eval_features, predict_features) = (
      (train, evaluate, predict)
      | 'Preprocess' >> ml.Preprocess(
          feature_set,
          input_format='csv',
          format_metadata={'headers': feature_set.csv_columns}))

  # Writes metadata.json (specified through METADATA_FILE_NAME) as a text
  # file, and features_train, features_eval, and features_predict as TFRecord
  # files.
  (metadata
   | 'SaveMetadata' >> io.SaveMetadata(
       os.path.join(output_dir, METADATA_FILE_NAME)))

  # We turn off sharding of the feature files because the dataset is very
  # small.
  (train_features
   | 'SaveTrain' >> io.SaveFeatures(
       os.path.join(output_dir, 'features_train')))
  (eval_features
   | 'SaveEval' >> io.SaveFeatures(os.path.join(output_dir, 'features_eval')))
  (predict_features
   | 'SavePredict' >> io.SaveFeatures(
       os.path.join(output_dir, 'features_predict')))

  return metadata, train_features, eval_features, predict_features
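# A minimal usage sketch (not from the original source): it assumes the same
# module-level imports the function above relies on (apache_beam as beam, os,
# and the io/ml/iris modules) plus a defined METADATA_FILE_NAME. The gs://
# paths and the runner choice are hypothetical placeholders.
def run_preprocess_example():
  pipeline = beam.Pipeline('DirectRunner')  # assumed local runner
  preprocess(pipeline,
             training_data='gs://my-bucket/iris/train.csv',   # hypothetical
             eval_data='gs://my-bucket/iris/eval.csv',        # hypothetical
             predict_data='gs://my-bucket/iris/predict.csv',  # hypothetical
             output_dir='gs://my-bucket/iris/preprocessed')   # hypothetical
  pipeline.run()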
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               metadata_file_name):
  feature_set = iris.IrisFeatures()

  read_training_data = beam.io.ReadFromText(
      training_data,
      strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
  read_eval_data = beam.io.ReadFromText(
      eval_data,
      strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
  read_predict_data = beam.io.ReadFromText(
      predict_data,
      strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns,
                                         has_target_columns=False))

  train = pipeline | 'ReadTrainingData' >> read_training_data
  evaluate = pipeline | 'ReadEvalData' >> read_eval_data
  predict = pipeline | 'ReadPredictData' >> read_predict_data

  # TODO(b/32726166) Update input_format and format_metadata to read these
  # values directly from the coder.
  (metadata, train_features, eval_features, predict_features) = (
      (train, evaluate, predict)
      | 'Preprocess' >> ml.Preprocess(
          feature_set,
          input_format='csv',
          format_metadata={'headers': feature_set.csv_columns}))

  # pylint: disable=expression-not-assigned
  (metadata
   | 'SaveMetadata' >> io.SaveMetadata(
       os.path.join(output_dir, metadata_file_name)))

  # We turn off sharding of these feature files because the dataset is very
  # small.
  (train_features
   | 'SaveTrain' >> io.SaveFeatures(
       os.path.join(output_dir, 'features_train')))
  (eval_features
   | 'SaveEval' >> io.SaveFeatures(os.path.join(output_dir, 'features_eval')))
  (predict_features
   | 'SavePredict' >> io.SaveFeatures(
       os.path.join(output_dir, 'features_predict')))
  # pylint: enable=expression-not-assigned

  return metadata, train_features, eval_features, predict_features
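# Illustrative sketch only, not the google.cloud.ml CsvCoder implementation:
# it shows what the coders built above do conceptually. The train/eval coders
# expect the target (label) column in each row, while the predict-data coder
# (has_target_columns=False) does not. The column names below are assumptions
# for exposition, not taken from iris.IrisFeatures.
import csv

def decode_csv_row(line, columns):
  """Decodes one CSV line into a {column_name: value} dict."""
  values = next(csv.reader([line]))
  return dict(zip(columns, values))

# Train/eval rows carry the target column:
#   decode_csv_row('5.1,3.5,1.4,0.2,setosa',
#                  ['sepal_length', 'sepal_width', 'petal_length',
#                   'petal_width', 'species'])
# Predict rows omit it:
#   decode_csv_row('5.1,3.5,1.4,0.2',
#                  ['sepal_length', 'sepal_width', 'petal_length',
#                   'petal_width'])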
def preprocess(pipeline):
  feature_set = iris.IrisFeatures()

  training_data = beam.io.TextFileSource(
      args.training_data,
      strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
  eval_data = beam.io.TextFileSource(
      args.eval_data,
      strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))
  predict_data = beam.io.TextFileSource(
      args.predict_data,
      strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns,
                                         has_target_columns=False))

  train = pipeline | beam.Read('ReadTrainingData', training_data)
  evaluate = pipeline | beam.Read('ReadEvalData', eval_data)
  predict = pipeline | beam.Read('ReadPredictData', predict_data)

  (metadata, train_features, eval_features, predict_features) = (
      (train, evaluate, predict)
      | 'Preprocess' >> ml.Preprocess(
          feature_set,
          input_format='csv',
          format_metadata={'headers': feature_set.csv_columns}))

  # Writes metadata.yaml, features_train, features_eval, and features_predict
  # files.
  # pylint: disable=expression-not-assigned
  (metadata
   | 'SaveMetadata' >> io.SaveMetadata(
       os.path.join(args.output_dir, 'metadata.yaml')))

  # We turn off sharding of these feature files because the dataset is very
  # small.
  (train_features
   | 'SaveTrain' >> io.SaveFeatures(
       os.path.join(args.output_dir, 'features_train')))
  (eval_features
   | 'SaveEval' >> io.SaveFeatures(
       os.path.join(args.output_dir, 'features_eval')))
  (predict_features
   | 'SavePredict' >> io.SaveFeatures(
       os.path.join(args.output_dir, 'features_predict')))
  # pylint: enable=expression-not-assigned

  return metadata, train_features, eval_features, predict_features
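# This older revision reads its paths from a module-level `args` object rather
# than taking them as parameters. A minimal sketch of argument parsing that
# could supply it; the flag names are inferred from the attributes used above
# (args.training_data, args.eval_data, args.predict_data, args.output_dir),
# not taken from the original source.
import argparse

def parse_arguments():
  parser = argparse.ArgumentParser()
  parser.add_argument('--training_data', required=True)
  parser.add_argument('--eval_data', required=True)
  parser.add_argument('--predict_data', required=True)
  parser.add_argument('--output_dir', required=True)
  return parser.parse_args()

# e.g. at module scope, before preprocess() runs:
# args = parse_arguments()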