Code example #1
def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of Beam assumes that if the first arg to a ptransform_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> textio.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> textio.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = (
      [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

  # Shuffle the data.  Note that the data does in fact contain duplicate reviews
  # for reasons that are unclear.  This means that NUM_TRAIN_INSTANCES and
  # NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'RemoveDuplicates' >> beam.RemoveDuplicates()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_COLUMN: p[0], LABEL_COLUMN: p[1]})
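
`Shuffle` (and the `@beam.ptransform_fn` convention mentioned in the NOTE above) is not defined in this snippet. A minimal sketch that is consistent with how it is used here, assuming the usual random-key approach from the tf.Transform sentiment example; the decorator and implementation details are assumptions, not part of the original file:

import random

import apache_beam as beam


@beam.ptransform_fn
def Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection by pairing each element with a random key."""
  return (
      pcoll
      | 'PairWithRandom' >> beam.Map(lambda value: (random.random(), value))
      | 'GroupByRandom' >> beam.GroupByKey()
      | 'DropRandom' >> beam.FlatMap(lambda key_values: key_values[1]))
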
Code example #2
    def build_graph(self):
        # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for graph building.
        # num_lines = 0
        # for i in range(DATASET_NUM_SHARDS):
        #     _fname = '{}-{:05}-of-{:05}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
        #     num_lines += sum(1 for _ in open(_fname))
        #     _fname_marked = '{}-{:05}-of-{:05}.{}'.format(self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
        #                                                   PPGRAPH_EXT)
        #     shutil.move(_fname, _fname_marked)
        #     if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
        #         break

        # Set up the preprocessing pipeline for analyzing the dataset. The analyze call is not combined with the
        # transform call because we will parallelize the transform call later; when combined, this step
        # runs on a single core and tends to cause OOM issues.
        pipeline = beam.Pipeline(runner=DirectRunner())

        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # TODO: maybe only use train data (or a percentage of it) to build the graph
            raw_train_data = (
                pipeline
                | 'ReadTrainDataFile' >> textio.ReadFromText(
                    'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
                | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                    tft_coders.CsvCoder(
                        self.data_formatter.get_ordered_columns(),
                        self.data_formatter.get_raw_data_metadata().schema).
                    decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            # The preprocessing function is where vocabulary, scale_to_0_1, sparse_to_dense, etc. are applied.
            transform_fn = (
                (raw_train_data, self.data_formatter.get_raw_data_metadata())
                | beam_impl.AnalyzeDataset(
                    PreprocessingFunction().transform_to_tfrecord))

            # Write SavedModel and metadata to two subdirectories of working_dir, given by
            # `transform_fn_io.TRANSFORM_FN_DIR` and `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
            _ = (transform_fn
                 | 'WriteTransformGraph' >>
                 transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

        # Run the Beam preprocessing pipeline.
        st = time.time()
        result = pipeline.run()
        result.wait_until_finish()
        self.logger.info(
            'Transformation graph built and written in {:.2f} sec'.format(
                time.time() - st))
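
`MapAndFilterErrors` is used here (and in later examples) but not defined. A minimal sketch, modeled on the helper of the same name in the tf.Transform census example; the metric namespace and counter name are assumptions:

import apache_beam as beam
from apache_beam.metrics import Metrics


class MapAndFilterErrors(beam.PTransform):
    """Like beam.Map, but silently drops elements for which `fn` raises."""

    class _MapAndFilterErrorsDoFn(beam.DoFn):
        """Counts the bad elements via a Beam metric instead of failing."""

        def __init__(self, fn):
            self._fn = fn
            self._bad_elements_counter = Metrics.counter(
                'preprocessing', 'bad_elements')

        def process(self, element):
            try:
                yield self._fn(element)
            except Exception:  # pylint: disable=broad-except
                self._bad_elements_counter.inc(1)

    def __init__(self, fn):
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))
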
Code example #3
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(outputs[key])

    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature.  This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the feature
    # from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
      tft.vocabulary(inputs[key], vocab_filename=key)

    # For the label column we provide the mapping from string to index.
    table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K'])
    outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
      # Create a coder to read the census data with the schema.  To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      ordered_columns = [
          'age', 'workclass', 'fnlwgt', 'education', 'education-num',
          'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
          'label'
      ]
      converter = tft.coders.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

      # Read in raw data and convert using CSV converter.  Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
      # to get data into a format that the CSV converter can read, in particular
      # removing spaces after commas.
      #
      # We use MapAndFilterErrors instead of Map to filter out decode errors in
      # converter.decode, which should only occur for the trailing blank line.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
          | 'FixCommasTrainData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'DecodeTrainData' >> MapAndFilterErrors(converter.decode))

      # Combine data and schema into a dataset tuple.  Note that we already used
      # the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, RAW_DATA_METADATA)
      transformed_dataset, transform_fn = (
          raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
      transformed_data, transformed_metadata = transformed_dataset
      transformed_data_coder = tft.coders.ExampleProtoCoder(
          transformed_metadata.schema)

      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply transform function to test data.  In this case we remove the
      # trailing period at the end of each line, and also ignore the header line
      # that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> textio.ReadFromText(test_data_file,
                                                  skip_header_lines=1)
          | 'FixCommasTestData' >> beam.Map(
              lambda line: line.replace(', ', ','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> MapAndFilterErrors(converter.decode))

      raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

      transformed_test_dataset = (
          (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
      # Don't need transformed data schema, it's the same as before.
      transformed_test_data, _ = transformed_test_dataset

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.Map(transformed_data_coder.encode)
          | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
      # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >>
          transform_fn_io.WriteTransformFn(working_dir))
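
A sketch of how this function might be driven and how the written artifacts can be consumed afterwards; the census file names are placeholders, and `tft.TFTransformOutput` is only available in reasonably recent tf.Transform releases:

import tempfile

import tensorflow_transform as tft

# Hypothetical local copies of the UCI census files.
census_working_dir = tempfile.mkdtemp()
transform_data('adult.data', 'adult.test', census_working_dir)

# The artifacts written by WriteTransformFn can later be reloaded in the trainer.
tft_output = tft.TFTransformOutput(census_working_dir)
transformed_feature_spec = tft_output.transformed_feature_spec()
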
Code example #4
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "column_names" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "schema" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and return a Tensor with correct default value
        Args:
          tensor: tf.SparseTensor
        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values, default_value=default_value), axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor, vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(tf.as_string(outputs[key]),
                                                                vocab_filename='vocab_' + key)

        return outputs
    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata) | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check what the transform function (transform_fn) produced by the previous step contains
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE RAW INPUT METADATA (this writes tft_input_metadata, not the transformed metadata)
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
Code example #5
File: census_example.py  Project: miturchi/transform
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_KEY] = tft.apply_function(convert_label,
                                                inputs[LABEL_KEY])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with tft.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = tft.CsvCoder(ordered_columns, RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset | tft.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE),
                coder=tft.ExampleProtoCoder(transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | tft.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE),
                coder=tft.ExampleProtoCoder(transformed_metadata.schema))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft.WriteTransformFn(working_dir))
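
This snippet targets an older tf.Transform API (tft.string_to_int, tft.apply_function, tft.Context). A rough sketch of the same preprocessing_fn against later 1.x-era releases, where string_to_int became compute_and_apply_vocabulary and apply_function is no longer required, reusing the module-level constants assumed above (compare Code example #3):

import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn(inputs):
    """Newer-API sketch of the same preprocessing (TF 1.x era)."""
    outputs = {}
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = tft.scale_to_0_1(inputs[key])
    for key in CATEGORICAL_FEATURE_KEYS:
        # tft.string_to_int was renamed tft.compute_and_apply_vocabulary.
        outputs[key] = tft.compute_and_apply_vocabulary(inputs[key])
    # TF ops can be called directly; tft.apply_function is no longer needed.
    table = tf.contrib.lookup.index_table_from_tensor(['>50K', '<=50K'])
    outputs[LABEL_KEY] = table.lookup(inputs[LABEL_KEY])
    return outputs
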
Code example #6
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
  """
    raw_data_schema = {
        key:
        dataset_schema.ColumnSchema(tf.string, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in CATEGORICAL_COLUMNS
    }
    raw_data_schema.update({
        key:
        dataset_schema.ColumnSchema(tf.float32, [],
                                    dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_COLUMNS
    })
    raw_data_schema[LABEL_COLUMN] = dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation())
    raw_data_schema = dataset_schema.Schema(raw_data_schema)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)

    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.map(convert_label, inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                transformed_train_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                transformed_test_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
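
The raw schema above is built with dataset_schema.ColumnSchema, which later tf.Transform releases removed. A version-dependent sketch of the same schema built from a feature spec, reusing the CATEGORICAL_COLUMNS, NUMERIC_COLUMNS and LABEL_COLUMN constants assumed by the snippet:

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

raw_feature_spec = {
    key: tf.FixedLenFeature([], tf.string) for key in CATEGORICAL_COLUMNS}
raw_feature_spec.update({
    key: tf.FixedLenFeature([], tf.float32) for key in NUMERIC_COLUMNS})
raw_feature_spec[LABEL_COLUMN] = tf.FixedLenFeature([], tf.string)

raw_data_metadata = dataset_metadata.DatasetMetadata(
    dataset_schema.from_feature_spec(raw_feature_spec))
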
Code example #7
def run_transform(output_dir,
                  schema,
                  train_data_file,
                  eval_data_file,
                  project,
                  mode,
                  preprocessing_fn=None):
    """Writes a tft transform fn, and metadata files.
  Args:
    output_dir: output folder
    schema: schema list.
    train_data_file: training data file pattern.
    eval_data_file: eval data file pattern.
    project: the project to run dataflow in.
    mode: whether the job should run locally ('local') or on Dataflow ('cloud').
    preprocessing_fn: a function used to preprocess the raw data. If not
                      specified, a function will be automatically inferred
                      from the schema.
  """

    tft_input_metadata = make_tft_input_metadata(schema)
    temp_dir = os.path.join(output_dir, 'tmp')
    preprocessing_fn = preprocessing_fn or make_preprocessing_fn(schema)

    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name':
            'pipeline-tft-' +
            datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
            'temp_location':
            temp_dir,
            'project':
            project,
            'extra_packages': [
                'gs://ml-pipeline-playground/tensorflow-transform-0.6.0.dev0.tar.gz'
            ]
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        with beam_impl.Context(temp_dir=temp_dir):
            names = [x['name'] for x in schema]
            converter = CsvCoder(names, tft_input_metadata.schema)
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(train_data_file)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            train_dataset = (train_data, tft_input_metadata)
            transformed_dataset, transform_fn = (
                train_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # Writes the transformed_metadata and transform_fn folders
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(output_dir))

            # Write the raw_metadata
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           output_dir, 'metadata'))

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            eval_data = (p
                         |
                         'ReadEvalData' >> textio.ReadFromText(eval_data_file)
                         | 'DecodeEvalData' >> beam.Map(converter.decode))

            eval_dataset = (eval_data, tft_input_metadata)

            transformed_eval_dataset = ((eval_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            transformed_eval_data, transformed_metadata = transformed_eval_dataset

            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(output_dir, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))
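
A hypothetical call to run_transform; the schema is a list of dicts with at least a 'name' key (the 'type' values shown are an assumption about the component's convention), the bucket paths and project are placeholders, and make_tft_input_metadata / make_preprocessing_fn are expected to come from the same component module:

schema = [
    {'name': 'trip_miles', 'type': 'NUMBER'},
    {'name': 'payment_type', 'type': 'CATEGORY'},
    {'name': 'tips', 'type': 'NUMBER'},
]

run_transform(output_dir='gs://my-bucket/tft-output',
              schema=schema,
              train_data_file='gs://my-bucket/taxi/train.csv',
              eval_data_file='gs://my-bucket/taxi/eval.csv',
              project='my-gcp-project',
              mode='local')
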
Code example #8
def write_to_tfrecord(args):
    """
    This function is supposed to be called as a script.
    """
    # Decode arguments
    current_index, num_shards, train_split_fname_out, eval_split_fname_out, \
    exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord, working_dir, data_formatter_module_path = args

    # num_shards = "32"
    current_index, num_shards = int(current_index), int(num_shards)

    split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    data_formatter = import_from_uri(
        data_formatter_module_path).DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Read the raw data files (CSV, with columns ordered according to the `data_formatter`)
        # and decode them into a cleaned-up format.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                split_train_file_pattern, skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        raw_eval_data = (
            pipeline
            | 'ReadEvalDataFile' >> textio.ReadFromText(
                split_eval_file_pattern, skip_header_lines=0)
            | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        # Examples in tf-example format (for model analysis purposes).
        # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
        # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
        #
        # _ = (
        #         raw_eval_data
        #         | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        #         | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
        #     '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
        #     shard_name_template='', num_shards=1)
        # )

        # Read the transform graph (SavedModel and metadata) previously written to the two
        # subdirectories of working_dir given by `transform_fn_io.TRANSFORM_FN_DIR` and
        # `transform_fn_io.TRANSFORMED_METADATA_DIR`.
        transform_fn = (pipeline
                        | 'ReadTransformGraph' >>
                        transform_fn_io.ReadTransformFn(working_dir))

        # Applies the transformation `transform_fn` to the raw train dataset
        (transformed_train_data, transformed_metadata) = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Applies the transformation `transform_fn` to the raw eval dataset
        (transformed_eval_data, transformed_metadata) = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The schema of the transformed data is used to build a coder that encodes
        # transformed examples as tf.Example protos, ready to be written out as
        # TFRecords (TF's binary data format).
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (transformed_train_data
             | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteTrainDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_train_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

        _ = (transformed_eval_data
             | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteEvalDataTFRecord' >>
             tfrecordio.WriteToTFRecord('{}-{:05}-of-{:05}'.format(
                 exp_log_data_file_eval_tfrecord, current_index, num_shards),
                                        shard_name_template='',
                                        num_shards=1))

    result = pipeline.run()
    result.wait_until_finish()
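
Since this function expects a single flat `args` tuple (it is "supposed to be called as a script"), a hypothetical driver might assemble it as follows; every path and the data_formatter module path are placeholders:

args = (
    '0',                          # current_index
    '8',                          # num_shards
    'out/train-split',            # train_split_fname_out
    'out/eval-split',             # eval_split_fname_out
    'out/train.tfrecord',         # exp_log_data_file_train_tfrecord
    'out/eval.tfrecord',          # exp_log_data_file_eval_tfrecord
    'out/working_dir',            # working_dir holding the written transform fn
    'my_package.data_formatter',  # data_formatter_module_path
)
write_to_tfrecord(args)
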
Code example #9
def create_transform_fn(train_data_file, working_dir):
    """Create a transform function that can be run on-the-fly while training
  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 values indices, by creating a vocabulary for each
  category.
  Args:
    train_data_file: File containing training data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        outputs[LABEL_KEY] = inputs[LABEL_KEY]

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the mpg data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            converter = csv_coder.CsvCoder(ordered_columns,
                                           RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
            # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))


def transform_data(train_data_file, test_data_file, working_dir,
                   root_train_data_out, root_test_data_out, pipeline_options):
    """Transform the data and write out as a TFRecord of Example protos.
  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices, by creating a vocabulary for each
  category.
  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
    root_train_data_out: Root of file containing transform training data
    root_test_data_out: Root of file containing transform test data
    pipeline_options: beam.pipeline.PipelineOptions defining DataFlow options
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # bucketize numeric columns
        for key in TO_BE_BUCKETIZED_FEATURE:
            outputs[key + '_bucketized'] = tft.bucketize(
                inputs[key], TO_BE_BUCKETIZED_FEATURE[key])

        # For categorical columns with a small vocabulary
        for key in STRING_TO_INT_FEATURE_KEYS:
            outputs[key] = tft.string_to_int(inputs[key], vocab_filename=key)

        for key in HASH_STRING_FEATURE_KEYS:
            outputs[key] = tft.hash_strings(inputs[key],
                                            HASH_STRING_FEATURE_KEYS[key])

        # For the label column we map the label string to an index (0 or 1).
        def convert_label(label):
            """Parses a string tensor into the label tensor
      Args:
        label_string_tensor: Tensor of dtype string. Result of parsing the
        CSV column specified by LABEL_COLUMN
      Returns:
        A Tensor of the same shape as label_string_tensor, should return
        an int64 Tensor representing the label index for classification tasks
      """
            table = lookup.index_table_from_tensor(['<=50K', '>50K'])
            return table.lookup(label)

        outputs[LABEL_KEY] = tft.apply_function(convert_label,
                                                inputs[LABEL_KEY])
        return outputs

    def fix_comma_and_filter_third_column(line):
        # To avoid a NameError with DataflowRunner, the import of csv is done
        # locally; see https://cloud.google.com/dataflow/faq#how-do-i-handle-nameerrors
        import csv
        cols = list(csv.reader([line], skipinitialspace=True))[0]
        return ','.join(cols[0:2] + cols[3:])

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline(options=pipeline_options) as pipeline:
        tmp_dir = pipeline_options.get_all_options()['temp_location']
        with beam_impl.Context(tmp_dir):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.

            converter = csv_coder.CsvCoder(ORDERED_COLUMNS,
                                           RAW_DATA_METADATA.schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.

            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasAndRemoveFiledTrainData' >>
                        beam.Map(fix_comma_and_filter_third_column)
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, RAW_DATA_METADATA)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(working_dir, root_train_data_out),
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(lambda line: line)
                | 'FixCommasAndRemoveFiledTestData' >>
                beam.Map(fix_comma_and_filter_third_column)
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, RAW_DATA_METADATA)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # Don't need transformed data schema, it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                os.path.join(working_dir, root_test_data_out),
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by transform_fn_io.TRANSFORM_FN_DIR and
            # transform_fn_io.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(working_dir))
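
A hypothetical invocation of this transform_data with Dataflow pipeline options; project, bucket and region values are placeholders:

from apache_beam.options.pipeline_options import PipelineOptions

pipeline_options = PipelineOptions(
    runner='DataflowRunner',
    project='my-gcp-project',
    region='us-central1',
    temp_location='gs://my-bucket/tmp',
    staging_location='gs://my-bucket/staging')

transform_data(train_data_file='gs://my-bucket/census/adult.data',
               test_data_file='gs://my-bucket/census/adult.test',
               working_dir='gs://my-bucket/census/working_dir',
               root_train_data_out='train_transformed',
               root_test_data_out='test_transformed',
               pipeline_options=pipeline_options)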