Example #1
def build_pipeline(p, flags):
    """Sets up Apache Beam pipeline for execution."""

    raw_data = (
        p | 'QueryTable' >> beam.io.Read(
            beam.io.BigQuerySource(query=query.get_query(flags.bq_table),
                                   project=flags.project_id,
                                   use_standard_sql=True))
        # omit 'Generate data' step if working with real data
        | 'Generate data' >> beam.Map(_generate_fake_data)
        | 'Extract lifetime ' >> beam.Map(append_lifetime_duration)
        | 'Extract label' >> beam.Map(append_label)
        | 'Generate label array' >> beam.Map(combine_censorship_duration))
    raw_train, raw_eval, raw_test = (
        raw_data | 'RandomlySplitData' >> randomly_split(
            train_size=.7, validation_size=.15, test_size=.15))
    raw_metadata = features.get_raw_dataset_metadata()
    preprocess_fn = features.preprocess_fn
    transform_fn = ((raw_train, raw_metadata)
                    | 'AnalyzeTrain' >> tft_beam.AnalyzeDataset(preprocess_fn))
    (transform_fn
     | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(flags.output_dir))

    for dataset_type, dataset in [('Train', raw_train), ('Eval', raw_eval),
                                  ('Test', raw_test)]:
        transform_label = 'Transform{}'.format(dataset_type)
        t, metadata = (((dataset, raw_metadata), transform_fn)
                       | transform_label >> tft_beam.TransformDataset())
        if dataset_type == 'Train':
            (metadata | 'WriteMetadata' >> tft_beam_io.WriteMetadata(
                os.path.join(flags.output_dir, 'transformed_metadata'),
                pipeline=p))
        write_label = 'Write{}TFRecord'.format(dataset_type)
        t | write_label >> write_tfrecord(dataset_type, flags.output_dir,
                                          metadata)
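Not shown here is how build_pipeline gets invoked; a minimal driver sketch, mirroring the run() pattern in Example #10 below (the flags.cloud flag and the tft_beam.Context wrapper are assumptions, not part of the code above):

import os

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import tensorflow_transform.beam as tft_beam


def run(flags, pipeline_args):
    """Hypothetical driver: builds and runs the pipeline defined above."""
    options = PipelineOptions(flags=[], **pipeline_args)
    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'
    with beam.Pipeline(runner, options=options) as p:
        # tf.Transform analyzers need a temp location.
        with tft_beam.Context(temp_dir=os.path.join(flags.output_dir, 'tmp')):
            build_pipeline(p, flags)
    # The pipeline executes when the `with` block exits.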
Example #2
def preprocess(p, args):
    """Run preprocessing as pipeline."""
    train_eval_schema = _make_input_schema()

    train_eval_metadata = dataset_metadata.DatasetMetadata(
        schema=train_eval_schema)

    _ = (train_eval_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(os.path.join(
             args.output_dir, constants.RAW_METADATA_DIR),
                                                             pipeline=p))

    train_eval_data = (p | 'ReadDataFromBQ' >> beam.io.Read(
        beam.io.BigQuerySource(query=_get_query('bigquery-public-data',
                                                'samples', 'gsod'),
                               use_standard_sql=True)))

    train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
        DataValidator())

    (transformed_train_eval_data,
     transformed_train_eval_metadata), transform_fn = (
         (train_eval_data, train_eval_metadata)
         | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
             get_preprocessing_fn()))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

    transformed_train_eval_coder = coders.ExampleProtoCoder(
        transformed_train_eval_metadata.schema)

    transformed_train_data, transformed_eval_data = (
        transformed_train_eval_data
        | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

    (transformed_train_data
     |
     'SerializeTrainExamples' >> beam.Map(transformed_train_eval_coder.encode)
     | 'WriteTraining' >>
     beam.io.WriteToTFRecord(os.path.join(
         args.output_dir, constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                             file_name_suffix=constants.DATA_FILE_SUFFIX))

    (transformed_eval_data
     | 'SerializeEvalExamples' >> beam.Map(transformed_train_eval_coder.encode)
     | 'WriteEval' >>
     beam.io.WriteToTFRecord(os.path.join(
         args.output_dir, constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
                             file_name_suffix=constants.DATA_FILE_SUFFIX))
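get_partition_fn is defined elsewhere in this project; a minimal sketch of a compatible partition function for beam.Partition (the random-assignment strategy is an assumption):

import random


def get_partition_fn(train_fraction):
    """Returns a function usable with beam.Partition(fn, 2)."""

    def partition_fn(example, num_partitions):
        # Bucket 0 = train, bucket 1 = eval. num_partitions is supplied by
        # beam.Partition (2 in the example above); hashing a stable key instead
        # of calling random.random() would make the split deterministic.
        return 0 if random.random() < train_fraction else 1

    return partition_fn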
Example #3
def make_transform_graph(output_dir, schema, features):
    """Writes a tft transform fn, and metadata files.

  Args:
    output_dir: output folder
    schema: schema list
    features: features dict
  """

    tft_input_schema = make_tft_input_schema(
        schema, os.path.join(output_dir, STATS_FILE))
    tft_input_metadata = dataset_metadata.DatasetMetadata(
        schema=tft_input_schema)
    preprocessing_fn = make_preprocessing_fn(output_dir, features)

    # preprocessing_fn does not use any analyzer, so we can run a local beam job
    # to properly make and write the transform function.
    temp_dir = os.path.join(output_dir, 'tmp')
    with beam.Pipeline('DirectRunner', options=None) as p:
        with tft_impl.Context(temp_dir=temp_dir):

            # Not going to transform, so no data is needed.
            train_data = p | beam.Create([])

            transform_fn = (
                (train_data, tft_input_metadata)
                | 'BuildTransformFn'  # noqa
                >> tft_impl.AnalyzeDataset(preprocessing_fn))  # noqa

            # Writes the transformed_metadata and transform_fn folders
            _ = (transform_fn |
                 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir)
                 )  # noqa

            # Write the raw_metadata
            metadata_io.write_metadata(metadata=tft_input_metadata,
                                       path=os.path.join(
                                           output_dir, RAW_METADATA_DIR))
Example #4
  def _create_test_data(self):
    """Makes local test data.

    The following files and folders will be created in self.output_folder:

    self.output_folder/
        features.json
        img.png
        input.csv
        schema.json
        raw_metadata/
            (tft metadata files)
        transformed_metadata/
            (tft metadata files)
        transform_fn/
            (tft saved model file)
    """
    self.output_folder = tempfile.mkdtemp()

    # Make image file
    self.img_filepath = os.path.join(self.output_folder, 'img.png')
    image = Image.new('RGBA', size=(50, 50), color=(155, 0, 0))
    image.save(self.img_filepath, 'png')

    # Make csv input file
    self.csv_input_filepath = os.path.join(self.output_folder, 'input.csv')
    file_io.write_string_to_file(
        self.csv_input_filepath,
        '23.0,%s' % self.img_filepath)

    # Make schema file
    self.schema_filepath = os.path.join(self.output_folder, 'schema.json')
    file_io.write_string_to_file(
        self.schema_filepath,
        json.dumps([{'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}]))

    # Make features file
    self.features_filepath = os.path.join(self.output_folder, 'features.json')
    file_io.write_string_to_file(
        self.features_filepath,
        json.dumps({'num_col': {'transform': 'target'},
                    'img_col': {'transform': 'img_url_to_vec'}}))

    # Run a local beam job to make the transform_fn
    with beam.Pipeline('DirectRunner'):
      with tft_impl.Context(temp_dir=os.path.join(self.output_folder, 'tmp')):
        def preprocessing_fn(inputs):
          return {'img_col': tft.map(tf.decode_base64, inputs['img_col']),
                  'num_col': tft.map(lambda x: tf.add(x, 1), inputs['num_col'])}

        input_data = [{'img_col': base64.urlsafe_b64encode('abcd'), 'num_col': 3}]

        input_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.from_feature_spec(
                {'img_col': tf.FixedLenFeature(shape=[], dtype=tf.string),
                 'num_col': tf.FixedLenFeature(shape=[], dtype=tf.float32)}))

        (dataset, train_metadata), transform_fn = (
            (input_data, input_metadata)
            | 'AnalyzeAndTransform'  # noqa: W503
            >> tft_impl.AnalyzeAndTransformDataset(preprocessing_fn))  # noqa: W503

        # WriteTransformFn writes transform_fn and metadata
        _ = (transform_fn  # noqa: F841
             | 'WriteTransformFn'  # noqa: W503
             >> tft_beam_io.WriteTransformFn(self.output_folder))  # noqa: W503

        metadata_io.write_metadata(
            metadata=input_metadata,
            path=os.path.join(self.output_folder, 'raw_metadata'))
Example #5
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
    """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
    delimiter: the column delimiter for the CSV format.
  """
    # 1) The schema can be either defined in-memory or read from a configuration
    #    file, in this case we are creating the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a dictionary
    #    of key -> tensor_proto with the appropriate type derived from the
    #    input_schema.
    coder = criteo.make_csv_coder(input_schema, delimiter)

    # 3) Read from text using the coder.
    train_data = (pipeline
                  | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                  | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (pipeline
                     | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                     | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    (train_dataset, train_metadata), transform_fn = (
        (train_data, input_metadata)
        | 'AnalyzeAndTransform' >>
        tft.AnalyzeAndTransformDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    # TODO(b/34231369) Remember to eventually also save the statistics.

    (evaluate_dataset,
     evaluate_metadata) = (((evaluate_data, input_metadata), transform_fn)
                           | 'TransformEval' >> tft.TransformDataset())

    train_coder = coders.ExampleProtoCoder(train_metadata.schema)
    _ = (
        train_dataset
        | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
        | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteTraining' >>
        beam.io.WriteToTFRecord(os.path.join(
            output_dir, path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
                                file_name_suffix='.tfrecord.gz'))

    evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
    _ = (
        evaluate_dataset
        | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
        | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteEval' >>
        beam.io.WriteToTFRecord(os.path.join(
            output_dir, path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
                                file_name_suffix='.tfrecord.gz'))

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
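_Shuffle() is not shown in these examples; a common definition, written as a ptransform_fn so it can be applied without arguments (hence the pylint no-value-for-parameter suppressions above; treat the exact implementation as an assumption):

import random

import apache_beam as beam


@beam.ptransform_fn
def _Shuffle(pcoll):  # pylint: disable=invalid-name
    """Shuffles a PCollection by pairing each element with a random key."""
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda key_value: key_value[1]))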
Example #6
  _ = (eval_dataset_transformed
       | 'EncodeEval' >> beam.Map(eval_coder.encode)
       | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
       | 'WriteEval' >> beam.io.WriteToTFRecord(
           os.path.join(args.output_dir, 'features_eval'),
           file_name_suffix='.tfrecord.gz'))
  _ = (eval_data
       | 'EncodePrediction' >> beam.Map(prediction_coder.encode)
       | 'EncodeEvalAsB64Json' >> beam.Map(_encode_as_b64_json)
       | 'WritePredictDataAsText' >> beam.io.WriteToText(
           os.path.join(args.output_dir, 'features_eval'),
           file_name_suffix='.txt'))

  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))


def _encode_as_b64_json(serialized_example):
  import base64  # pylint: disable=g-import-not-at-top
  import json  # pylint: disable=g-import-not-at-top
  return json.dumps({'b64': base64.b64encode(serialized_example)})


def get_pipeline_name(runner, cloud):
  # Allow users to use a custom runner.
  if runner:
    return runner
  if cloud:
    return 'DataflowRunner'
  else:
    return 'DirectRunner'
Example #7
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """

  # 1) The schema can be either defined in-memory or read from a configuration
  #    file, in this case we are creating the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    serialized_examples = (pipeline
                           | 'ReadPredictData' >> _ReadData(
                               predict_data, mode=predict_mode)
                           # TODO(b/35194257) Obviate the need for this explicit
                           # serialization.
                           | 'EncodePredictData' >> beam.Map(
                               predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
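_ReadData is a project-specific helper (the comment above says it reads from BigQuery or from CSV); a hypothetical sketch of just the BigQuery path, with mode left unused (in the real helper it would control whether the label column is read):

import apache_beam as beam
import tensorflow as tf


@beam.ptransform_fn
def _ReadData(pcoll, table_name, mode=tf.contrib.learn.ModeKeys.TRAIN):  # pylint: disable=invalid-name
  """Hypothetical: reads rows of a BigQuery table as dict records."""
  del mode  # unused in this sketch
  return pcoll | 'ReadBigQuery' >> beam.io.Read(
      beam.io.BigQuerySource(table=table_name))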
Example #8
def preprocess_data(train_neg_file_pattern,
                    train_pos_file_pattern,
                    test_neg_file_pattern,
                    test_pos_file_pattern,
                    transformed_train_file_pattern,
                    transformed_test_file_pattern,
                    transformed_metadata_dir,
                    raw_metadata_dir,
                    transform_func_dir,
                    temp_dir,
                    vocab_size,
                    delimiters):
    """Transform the data and write out as a TFRecord of Example protos.
    Read in the data from the positive and negative examples on disk, and
    transform it using a preprocessing pipeline that removes punctuation,
    tokenizes and maps tokens to int64 values indices.

    Args:
    train_neg_filepattern: Filepattern for training data negative examples
    train_pos_filepattern: Filepattern for training data positive examples
    test_neg_filepattern: Filepattern for test data negative examples
    test_pos_filepattern: Filepattern for test data positive examples
    transformed_train_filebase: Base filename for transformed training data shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data should be written


    raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
        REVIEW_COLUMN: dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation()),
        LABEL_COLUMN: dataset_schema.ColumnSchema(tf.int64, [], dataset_schema.FixedColumnRepresentation()),
    }))
    """
    pipeline_name = 'DataflowRunner'
    options = {
        'job_name': ('cloud-ml-hazmat-preprocess-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
        'temp_location': temp_dir,
        'project': "stone-outpost-636",
        'max_num_workers': 8
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    #with beam.Pipeline(pipeline_name, options=pipeline_options) as pipeline:
    #    with beam_impl.Context(temp_dir=temp_dir):
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):

            train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData((train_neg_file_pattern, train_pos_file_pattern))
            test_data = pipeline | 'ReadTest' >> ReadAndShuffleData((test_neg_file_pattern, test_pos_file_pattern))
            preprocessing_fn = generate_preprocessing_fn(vocab_size, delimiters)

            (transformed_train_data, transformed_metadata), transform_fn = ((train_data, const.RAW_METADATA)
              | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            _ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(transform_func_dir))

            transformed_test_data, _ = (((test_data, const.RAW_METADATA), transform_fn)
              | 'Transform' >> beam_impl.TransformDataset())

            _ = (transformed_train_data
              | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(transformed_train_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_test_data
              | 'WriteTestData' >> tfrecordio.WriteToTFRecord(transformed_test_file_pattern,
                  coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))

            _ = (transformed_metadata
              | 'WriteTransformedMetadata' >> beam_metadata_io.WriteMetadata(transformed_metadata_dir, pipeline=pipeline))

            _ = (const.RAW_METADATA
              | 'WriteRawMetadata' >> beam_metadata_io.WriteMetadata(raw_metadata_dir, pipeline=pipeline))
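ReadAndShuffleData and const.RAW_METADATA come from this project's helper modules and are not shown; a hypothetical sketch of the reader, assuming plain-text review files, the column names 'review'/'label', and a random-key shuffle:

import random

import apache_beam as beam


@beam.ptransform_fn
def ReadAndShuffleData(pcoll, file_patterns):
    """Hypothetical: reads negative/positive reviews, labels them 0/1, shuffles."""
    neg_pattern, pos_pattern = file_patterns
    negatives = (pcoll.pipeline
                 | 'ReadNeg' >> beam.io.ReadFromText(neg_pattern)
                 | 'LabelNeg' >> beam.Map(lambda text: {'review': text, 'label': 0}))
    positives = (pcoll.pipeline
                 | 'ReadPos' >> beam.io.ReadFromText(pos_pattern)
                 | 'LabelPos' >> beam.Map(lambda text: {'review': text, 'label': 1}))
    return ((negatives, positives)
            | 'FlattenPosNeg' >> beam.Flatten()
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))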
Example #9
def transform_data(train_data_file, test_data_file, transformed_train_filebase,
                   transformed_test_filebase, transformed_metadata_dir,
                   transform_graph_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 value indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    transformed_train_filebase: Base filename for transformed training data
        shards
    transformed_test_filebase: Base filename for transformed test data shards
    transformed_metadata_dir: Directory where metadata for transformed data
        should be written
    transform_graph_dir: dir where the beam tf graph should be written
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        outputs = {}

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_COLUMNS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        # For all categorical columns except the label column, we use
        # tft.string_to_int which computes the set of unique values and uses this
        # to convert the strings to indices.
        for key in CATEGORICAL_COLUMNS:
            outputs[key] = tft.string_to_int(inputs[key])

        # For the label column we provide the mapping from string to index.
        def convert_label(label):
            table = lookup.string_to_index_table_from_tensor(['>50K', '<=50K'])
            return table.lookup(label)

        outputs[LABEL_COLUMN] = tft.apply_function(convert_label,
                                                   inputs[LABEL_COLUMN])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # Create a coder to read the census data with the schema.  To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            ordered_columns = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'label'
            ]
            converter = csv_coder.CsvCoder(ordered_columns, raw_data_schema)

            # Read in raw data and convert using CSV converter.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV converter can read, in particular
            # removing empty lines and removing spaces after commas.
            raw_data = (pipeline
                        |
                        'ReadTrainData' >> textio.ReadFromText(train_data_file)
                        | 'FilterTrainData' >> beam.Filter(lambda line: line)
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(', ', ','))
                        | 'DecodeTrainData' >> beam.Map(converter.decode))

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset
                | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

            # Write the Beam transform to disk if requested.
            if transform_graph_dir is not None:
                _ = (transform_fn
                     | 'WriteTransformFn' >>
                     tft_beam_io.WriteTransformFn(transform_graph_dir))

            transformed_data, transformed_metadata = transformed_dataset

            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                transformed_train_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # Now apply transform function to test data.  In this case we also remove
            # the header line from the CSV file and the trailing period at the end of
            # each line.
            raw_test_data = (
                pipeline
                | 'ReadTestData' >> textio.ReadFromText(test_data_file)
                | 'FilterTestData' >> beam.Filter(
                    lambda line: line and line != '|1x3 Cross validator')
                | 'FixCommasTestData' >>
                beam.Map(lambda line: line.replace(', ', ','))
                | 'RemoveTrailingPeriodsTestData' >>
                beam.Map(lambda line: line[:-1])
                | 'DecodeTestData' >> beam.Map(converter.decode))

            raw_test_dataset = (raw_test_data, raw_data_metadata)

            transformed_test_dataset = ((raw_test_dataset, transform_fn)
                                        | beam_impl.TransformDataset())
            # We don't need the transformed data schema; it's the same as before.
            transformed_test_data, _ = transformed_test_dataset

            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                transformed_test_filebase,
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            _ = (transformed_metadata
                 | 'WriteMetadata' >> beam_metadata_io.WriteMetadata(
                     transformed_metadata_dir, pipeline=pipeline))
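This example relies on module-level names not shown: raw_data_schema / raw_data_metadata plus the NUMERIC_COLUMNS, CATEGORICAL_COLUMNS and LABEL_COLUMN lists used in preprocessing_fn; a sketch consistent with the ordered_columns above and with the ColumnSchema style seen in Example #8 (the exact column typing is an assumption):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

CATEGORICAL_COLUMNS = ['workclass', 'education', 'marital-status', 'occupation',
                       'relationship', 'race', 'sex', 'native-country']
NUMERIC_COLUMNS = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                   'capital-loss', 'hours-per-week']
LABEL_COLUMN = 'label'

# String columns (categoricals and the label) and float columns share the
# same fixed-length, scalar representation.
raw_data_schema = dataset_schema.Schema(dict(
    [(name, dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation()))
     for name in CATEGORICAL_COLUMNS + [LABEL_COLUMN]] +
    [(name, dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation()))
     for name in NUMERIC_COLUMNS]))
raw_data_metadata = dataset_metadata.DatasetMetadata(raw_data_schema)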
Example #10
def run(flags, pipeline_args):
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis"""
    options = PipelineOptions(flags=[], **pipeline_args)
    options.view_as(WorkerOptions).machine_type = flags.machine_type
    temp_dir = os.path.join(flags.output_dir, 'tmp')
    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    files = tf.gfile.Glob(flags.input_dir + "*")
    if not flags.cloud:
        files = files[:20]  # if running locally for testing, process fewer files

    logging.warning("Number of files: " + str(len(files)))
    labels = get_labels_array(
        "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv"
    )

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):

            input_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

            filenames = (p | 'Create filenames' >> beam.Create(files))
            nii = (filenames | 'Read NII' >> beam.Map(read_nii))
            nii_with_labels = (
                nii
                | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

            raw_train, raw_eval, raw_test = (
                nii_with_labels | 'RandomlySplitData' >> randomly_split(
                    train_size=.7, validation_size=.15, test_size=.15))

            raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(
                lambda x: x[1])
            raw_eval = (raw_eval
                        | 'FlattenEval' >> beam.FlatMap(lambda x: x[1]))
            raw_test = (raw_test
                        | 'FlattenTest' >> beam.FlatMap(lambda x: x[1]))

            raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

            dataset_and_metadata, transform_fn = (
                (raw_train, input_metadata)
                | 'TransformData' >> tft_beam.AnalyzeAndTransformDataset(
                    features.preprocess))
            transform_fn = (
                (raw_train, input_metadata)
                |
                'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
                     flags.output_dir))
            for dataset_type, dataset in [('Train', raw_train),
                                          ('Eval', raw_eval),
                                          ('Predict', raw_test)]:

                transform_label = 'Transform{}'.format(dataset_type)
                t, metadata = (((dataset, input_metadata), transform_fn)
                               |
                               transform_label >> tft_beam.TransformDataset())
                if dataset_type == 'Train':
                    _ = (metadata
                         | 'WriteMetadata' >>
                         tft_beam_io.WriteMetadata(os.path.join(
                             flags.output_dir, 'transformed_metadata'),
                                                   pipeline=p))
                write_label = 'Write{}TFRecord'.format(dataset_type)
                _ = t | write_label >> WriteTFRecord(
                    dataset_type, flags.output_dir, metadata)
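randomly_split (also used in Example #1) is another project helper that is not shown; a hypothetical sketch that yields train/validation/test PCollections via beam.Partition (the random-bucket assignment is an assumption; the real helper may emit keyed elements, which would explain the FlattenTrain/FlattenEval/FlattenTest steps above):

import random

import apache_beam as beam


@beam.ptransform_fn
def randomly_split(pcoll, train_size, validation_size, test_size):
    """Hypothetical: splits a PCollection into train/validation/test."""
    assert abs(train_size + validation_size + test_size - 1.0) < 1e-6

    def assign_bucket(example, num_partitions):
        del num_partitions  # always 3 here
        r = random.random()
        if r < train_size:
            return 0
        if r < train_size + validation_size:
            return 1
        return 2

    split = pcoll | 'PartitionData' >> beam.Partition(assign_bucket, 3)
    return split[0], split[1], split[2]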