Example #1
def main():
    args = parse_args()
    pipeline_options = PipelineOptions(**vars(args))
    pipeline = beam.Pipeline(options=pipeline_options)
    train_files = glob.glob(os.path.join("./mnist_images", "train", "*.jpg"))
    eval_files = glob.glob(os.path.join("./mnist_images", "eval", "*.jpg"))
    _ = (
        pipeline
        | 'ListTrainFiles' >> beam.Create(train_files)
        | 'TrainReadFiles' >> beam.Map(read_from_path)
        | 'WriteToTrainTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join("mnist_tfrecords", "train", "train"),
            compression_type=beam.io.filesystem.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))
    _ = (
        pipeline
        | 'ListEvalFiles' >> beam.Create(eval_files)
        | 'EvalReadFiles' >> beam.Map(read_from_path)
        | 'WriteToEvalTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join("mnist_tfrecords", "eval", "eval"),
            compression_type=beam.io.filesystem.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))
    pipeline.run().wait_until_finish()
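The helpers tfrecord_schema() and read_from_path() are assumed by this example; a minimal sketch of what they might look like for MNIST JPEGs, using the dataset_schema.from_feature_spec API that other snippets in this collection rely on (the feature names and the label-from-filename convention are assumptions):

import os

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema


def tfrecord_schema():
    # Hypothetical schema matching the dict produced by read_from_path below.
    return dataset_schema.from_feature_spec({
        'image_raw': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    })


def read_from_path(path):
    # Hypothetical reader: raw JPEG bytes plus a label parsed from the file
    # name, assuming files are named like "<label>_<index>.jpg".
    with tf.io.gfile.GFile(path, 'rb') as f:
        image_bytes = f.read()
    label = int(os.path.basename(path).split('_')[0])
    return {'image_raw': image_bytes, 'label': label}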
@beam.ptransform_fn
def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
    pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
    (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                           | 'Transform' >> tft.TransformDataset())
    coder = coders.ExampleProtoCoder(metadata.schema)
    _ = (dataset
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir, path), file_name_suffix='.tfrecord.gz'))
 def process(self, element):
     element_spec = self._feature_spec.copy()
     for identity in self._optional_field_names:
         if identity not in element:
             del element_spec[identity]
     element_schema = Schema(element_spec)
     coder = coders.ExampleProtoCoder(element_schema)
     encoded_element = coder.encode(element)
     yield encoded_element
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline."""
    args = parse_arguments(sys.argv if argv is None else argv)
    if args.cloud:
        logging.info('Start running in the cloud')
        options = {
            'runner': 'DataflowRunner',
            'job_name': 'mlengine-boilerplate-{}'.format(
                datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
            'staging_location': os.path.join(args.output_dir, 'staging'),
            'temp_location': os.path.join(args.output_dir, 'tmp'),
            'project': args.project_id,
            'zone': 'europe-west1-d',
            'autoscaling_algorithm': 'THROUGHPUT_BASED',
            'save_main_session': True,
            'setup_file': './setup.py',
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        print(pipeline_options)
    else:
        pipeline_options = None

    train_coder = coders.ExampleProtoCoder(schema)

    p = beam.Pipeline(options=pipeline_options)

    examples = (p
                | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '/*',
                                                     skip_header_lines=1)
                | 'buildExamples' >> beam.FlatMap(buildExample))

    examples_split = examples | beam.Partition(partition_fn, 3)
    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    for part, examples in example_dict.items():
        _ = examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(args.output_dir, part + '_examples'),
            compression_type=filesystem.CompressionTypes.GZIP,
            coder=train_coder,
            file_name_suffix='.gz')

    p.run()
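partition_fn is referenced but not defined in this snippet; a minimal sketch, assuming a deterministic hash-based 80/10/10 split into train/validation/test (the ratios and the hashing scheme are assumptions):

import hashlib


def partition_fn(example, num_partitions):
    # Hypothetical split compatible with beam.Partition: hash the element and
    # route roughly 80% to partition 0 (train), 10% to 1 (validation) and
    # 10% to 2 (test).
    bucket = int(hashlib.md5(str(example).encode('utf-8')).hexdigest(), 16) % 10
    if bucket < 8:
        return 0
    if bucket == 8:
        return 1
    return 2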
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.

    Args:
        argv (list): list of arguments

    """
    logging.info('running main')
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options(args.project_id,
                                                      args.output_dir)
    else:
        pipeline_options = None

    pipeline = beam.Pipeline(options=pipeline_options)

    all_labels = (pipeline | 'ReadDictionary' >> beam.io.ReadFromText(
        'gs://cloud-ml-data/img/flower_photos/dict.txt',
        strip_trailing_newlines=True))

    examples = (pipeline
                | 'ReadData' >> beam.io.ReadFromText(
                    'gs://cloud-ml-data/img/flower_photos/train_set.csv',
                    strip_trailing_newlines=True)
                | 'Split' >> beam.FlatMap(select_files)
                | 'OneHotEncoding' >> beam.FlatMap(
                    one_hot_encoding, beam.pvalue.AsIter(all_labels))
                | 'ReadImage' >> beam.FlatMap(process_image)
                | 'BuildExamples' >> beam.FlatMap(build_example))

    examples_split = examples | beam.Partition(partition_fn, 3)

    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    train_coder = coders.ExampleProtoCoder(schema)

    for part, examples in example_dict.items():
        _ = (examples
             | part + '_writeExamples' >> beam.io.tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(
                     args.output_dir, part + '_examples'),
                 compression_type=beam.io.filesystem.CompressionTypes.GZIP,
                 coder=train_coder,
                 file_name_suffix='.tfrecord.gz'))

    logging.info('running pipeline')

    pipeline.run().wait_until_finish()
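get_cloud_pipeline_options() is assumed by this example; a sketch that mirrors the inline Dataflow options dict built in the previous snippet (the job name prefix and setup file path are assumptions):

import datetime
import os

import apache_beam as beam


def get_cloud_pipeline_options(project_id, output_dir):
    # Hypothetical helper mirroring the inline options dict shown above.
    options = {
        'runner': 'DataflowRunner',
        'job_name': 'preprocessing-{}'.format(
            datetime.datetime.now().strftime('%Y%m%d%H%M%S')),
        'staging_location': os.path.join(output_dir, 'staging'),
        'temp_location': os.path.join(output_dir, 'tmp'),
        'project': project_id,
        'autoscaling_algorithm': 'THROUGHPUT_BASED',
        'save_main_session': True,
        'setup_file': './setup.py',
    }
    return beam.pipeline.PipelineOptions(flags=[], **options)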
def preprocess(p, args):
    """Run preprocessing as pipeline."""
    train_eval_schema = _make_input_schema()

    train_eval_metadata = dataset_metadata.DatasetMetadata(
        schema=train_eval_schema)

    _ = (train_eval_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(args.output_dir, constants.RAW_METADATA_DIR),
             pipeline=p))

    train_eval_data = (p | 'ReadDataFromBQ' >> beam.io.Read(
        beam.io.BigQuerySource(query=_get_query('bigquery-public-data',
                                                'samples', 'gsod'),
                               use_standard_sql=True)))

    train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
        DataValidator())

    (transformed_train_eval_data,
     transformed_train_eval_metadata), transform_fn = (
         (train_eval_data, train_eval_metadata)
         | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
             get_preprocessing_fn()))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

    transformed_train_eval_coder = coders.ExampleProtoCoder(
        transformed_train_eval_metadata.schema)

    transformed_train_data, transformed_eval_data = (
        transformed_train_eval_data
        | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

    _ = (transformed_train_data
         | 'SerializeTrainExamples' >> beam.Map(
             transformed_train_eval_coder.encode)
         | 'WriteTraining' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir,
                          constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
             file_name_suffix=constants.DATA_FILE_SUFFIX))

    _ = (transformed_eval_data
         | 'SerializeEvalExamples' >> beam.Map(
             transformed_train_eval_coder.encode)
         | 'WriteEval' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir,
                          constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
             file_name_suffix=constants.DATA_FILE_SUFFIX))
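get_partition_fn(0.7) is not shown; a minimal sketch, assuming a random train/eval split at the requested fraction:

import random


def get_partition_fn(train_fraction):
    # Hypothetical factory: returns a function usable with beam.Partition that
    # sends roughly train_fraction of elements to partition 0 (train) and the
    # rest to partition 1 (eval).
    def partition_fn(element, num_partitions):
        return 0 if random.random() < train_fraction else 1
    return partition_fn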
def write_tfrecord(p, prefix, output_dir, metadata):
    """Shuffles and write the given pCollection as a TFRecord.

    Args:
        p: a pCollection.
        prefix: prefix for location tf-record will be written to.
        output_dir: the directory or bucket to write the json data.
        metadata: metadata of input data from tft_beam.TransformDataset(...)
    """

    coder = coders.ExampleProtoCoder(metadata.schema)
    prefix = str(prefix).lower()
    (p
     | 'ShuffleData' >> shuffle()
     | 'WriteTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(output_dir, 'data', prefix, prefix),
         coder=coder,
         file_name_suffix='.tfrecord'))
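The shuffle() transform used above (and the Shuffle() / _Shuffle() variants elsewhere in this collection) is typically a small composite that pairs every element with a random key, groups by it, and drops it again; a sketch under that assumption:

import random

import apache_beam as beam


@beam.ptransform_fn
def shuffle(pcoll):  # pylint: disable=invalid-name
    # Pair each element with a random key, group, then drop the key; this
    # redistributes the elements of the PCollection.
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))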
Example #8
    def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
        pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                               | 'Transform' >> tft.TransformDataset())
        #coder = criteo.make_csv_coder(input_schema, delimiter)
        #coder = coders.ExampleProtoCoder(metadata.schema)
        column_names = ['clicked']
        #for name in INTEGER_COLUMN_NAMES:
        #  column_names.append(name)
        #for name in CATEGORICAL_COLUMN_NAMES:
        #  column_names.append(name)

        #coder = coders.CsvCoder(column_names, metadata.schema, delimiter=",")
        coder = coders.ExampleProtoCoder(metadata.schema)
        _ = (dataset
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(output_dir, path),
                 file_name_suffix='.tfrecord.gz'))
Example #9
  def __init__(self,
               feature_spec,
               optional_field_names,
               rule_optional_fn=lambda x: x < 0):
    """Initialises a TF-Record decoder.

    Args:
      feature_spec: Dictionary from feature names to one of `FixedLenFeature`,
        `SparseFeature` or `VarLenFeature`. It contains all the features to
        parse (including optional ones).
      optional_field_names: list of optional fields.
      rule_optional_fn: function that takes the value of an optional field and
        returns True if the value is indicative of a default value (e.g.
        resulting from the default value of parsing FixedLenFeature). Current
        code requires that all optional_field_names share the rule_optional_fn.
    """
    self._schema = Schema(feature_spec)
    self._coder = coders.ExampleProtoCoder(self._schema)
    self._optional_field_names = optional_field_names
    self._rule_optional_fn = rule_optional_fn
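  # A plausible process() counterpart for this decoder (an assumption, not part
  # of the original snippet), valid for tf.Transform versions in which
  # ExampleProtoCoder still exposes decode(): optional fields whose decoded
  # value satisfies rule_optional_fn are treated as absent and dropped.
  def process(self, serialized_example):
    element = self._coder.decode(serialized_example)
    for name in self._optional_field_names:
      if name in element and self._rule_optional_fn(element[name]):
        del element[name]
    yield element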
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.

    Args:
        argv (list): list of arguments

    """
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options(args.project_id,
                                                      args.output_dir)
    else:
        pipeline_options = None

    pipeline = beam.Pipeline(options=pipeline_options)

    examples = (
        pipeline
        # | 'ReadData' >> beam.Create(open('data/test.csv')
        #                             .readlines()[1:])
        | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '*',
                                             skip_header_lines=1)
        | 'BuildExamples' >> beam.FlatMap(build_example))

    examples_split = examples | beam.Partition(partition_fn, 3)

    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    for part, examples in example_dict.items():
        examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(TFRECORD_DIR, part + '_examples'),
            compression_type=filesystem.CompressionTypes.GZIP,
            coder=coders.ExampleProtoCoder(schema),
            file_name_suffix='.tfrecord.gz')

    pipeline.run().wait_until_finish()
Example #11
def WriteOutput(p, prefix, output_dir, feature_spec, plain_text=False):
    """Writes the given pCollection as a TF-Record.

  Args:
    p: a pCollection.
    prefix: prefix for location tf-record will be written to.
    output_dir: the directory or bucket to write the json data.
    feature_spec: the feature spec of the tf-record to be written.
    plain_text: if true, write the output as plain text instead.
  """
    path = os.path.join(output_dir, prefix)
    shuffled = p | "ShuffleData" >> Shuffle()  # pylint: disable=no-value-for-parameter

    if plain_text:
        shuffled | "WriteToText" >> beam.io.WriteToText(
            path, file_name_suffix=".txt")
        return

    schema = dataset_schema.from_feature_spec(feature_spec)
    coder = coders.ExampleProtoCoder(schema)
    shuffled | "WriteTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
        path, coder=coder, file_name_suffix=".tfrecord")
Example #12
def WriteTFRecord(p, prefix, output_dir, metadata):
    """Shuffles and write the given pCollection as a TF-Record.
    Args:
        p: a pCollection.
        prefix: prefix for location tf-record will be written to.
        output_dir: the directory or bucket to write the json data.
        metadata
    """
    coder = coders.ExampleProtoCoder(metadata.schema)
    prefix = str(prefix).lower()
    out_dir = os.path.join(output_dir, 'data', prefix, prefix)

    # Examples are large, so we should ensure the TFRecords are relatively small
    num_shards = 60 if prefix == 'train' else 20
    logging.warning("writing TFRecords to " + out_dir)
    _ = (
        p
        | "ShuffleData" >> shuffle()  # pylint: disable=no-value-for-parameter
        | "WriteTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
            out_dir,
            coder=coder,
            num_shards=num_shards,
            file_name_suffix=".tfrecord"))
Example #13
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold, delimiter):
    """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
    delimiter: the column delimiter for the CSV format.
  """
    # 1) The schema can be either defined in-memory or read from a configuration
    #    file, in this case we are creating the schema in-memory.
    input_schema = criteo.make_input_schema()

    # 2) Configure the coder to map the source file column names to a dictionary
    #    of key -> tensor_proto with the appropriate type derived from the
    #    input_schema.
    coder = criteo.make_csv_coder(input_schema, delimiter)

    # 3) Read from text using the coder.
    train_data = (pipeline
                  | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
                  | 'ParseTrainingCsv' >> beam.Map(coder.decode))

    evaluate_data = (pipeline
                     | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
                     | 'ParseEvalCsv' >> beam.Map(coder.decode))

    input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
    _ = (input_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
             pipeline=pipeline))

    preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
    transform_fn = ((train_data, input_metadata)
                    | 'Analyze' >> tft.AnalyzeDataset(preprocessing_fn))

    # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
    # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
    # path_constants.TRANSFORMED_METADATA_DIR.
    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

    @beam.ptransform_fn
    def TransformAndWrite(pcoll, path):  # pylint: disable=invalid-name
        pcoll |= 'Shuffle' >> _Shuffle()  # pylint: disable=no-value-for-parameter
        (dataset, metadata) = (((pcoll, input_metadata), transform_fn)
                               | 'Transform' >> tft.TransformDataset())
        #coder = criteo.make_csv_coder(input_schema, delimiter)
        #coder = coders.ExampleProtoCoder(metadata.schema)
        column_names = ['clicked']
        #for name in INTEGER_COLUMN_NAMES:
        #  column_names.append(name)
        #for name in CATEGORICAL_COLUMN_NAMES:
        #  column_names.append(name)

        #coder = coders.CsvCoder(column_names, metadata.schema, delimiter=",")
        coder = coders.ExampleProtoCoder(metadata.schema)
        _ = (dataset
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(output_dir, path),
                 file_name_suffix='.tfrecord.gz'))

    _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX)

    _ = evaluate_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX)

    # TODO(b/35300113) Remember to eventually also save the statistics.

    if predict_data:
        predict_mode = tf.contrib.learn.ModeKeys.INFER
        predict_schema = criteo.make_input_schema(mode=predict_mode)
        csv_coder = criteo.make_csv_coder(predict_schema, mode=predict_mode)
        predict_coder = coders.ExampleProtoCoder(predict_schema)
        serialized_examples = (
            pipeline
            | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
            | 'ParsePredictCsv' >> beam.Map(csv_coder.decode)
            # TODO(b/35194257) Obviate the need for this explicit serialization.
            | 'EncodePredictData' >> beam.Map(predict_coder.encode))
        _ = (serialized_examples
             | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.tfrecord.gz'))
        _ = (serialized_examples
             | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
             | 'WritePredictDataAsText' >> beam.io.WriteToText(
                 os.path.join(
                     output_dir,
                     path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
                 file_name_suffix='.txt'))
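_encode_as_b64_json is used here and appears only as a truncated stub later in this collection; a sketch consistent with the base64/JSON envelope built by the cus_input helper further below (the exact payload shape is an assumption):

import base64
import json


def _encode_as_b64_json(serialized_example):
    # Hypothetical helper: wrap a serialized tf.Example in the b64 JSON
    # envelope commonly expected by online prediction services.
    return json.dumps(
        {'b64': base64.b64encode(serialized_example).decode('utf-8')})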
Example #14
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: file paths to input csv files.
    eval_data: file paths to input csv files.
    predict_data: file paths to input csv files.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  # 1) The schema can be either defined in-memory or read from a configuration
  #    file, in this case we are creating the schema in-memory.
  input_schema = criteo.make_input_schema()

  # 2) Configure the coder to map the source file column names to a dictionary
  #    of key -> tensor_proto with the appropriate type derived from the
  #    input_schema.
  coder = criteo.make_tsv_coder(input_schema)

  # 3) Read from text using the coder.
  train_data = (
      pipeline
      | 'ReadTrainingData' >> beam.io.ReadFromText(training_data)
      | 'ParseTrainingCsv' >> beam.Map(coder.decode))

  evaluate_data = (
      pipeline
      | 'ReadEvalData' >> beam.io.ReadFromText(eval_data)
      | 'ParseEvalCsv' >> beam.Map(coder.decode))

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)
  _ = (input_metadata
       | 'WriteInputMetadata' >> io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  # TODO(b/33688220) should the transform functions take shuffle as an optional
  # argument?
  # TODO(b/33688275) Should the transform functions have more user friendly
  # names?
  preprocessing_fn = criteo.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

  # TODO(b/34231369) Remember to eventually also save the statistics.

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  _ = (train_dataset
       | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
       | 'WriteTraining'
       >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  _ = (evaluate_dataset
       | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
       | 'WriteEval'
       >> beam.io.WriteToTFRecord(
           os.path.join(output_dir,
                        path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
           file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = criteo.make_input_schema(mode=predict_mode)
    tsv_coder = criteo.make_tsv_coder(predict_schema, mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)
    serialized_examples = (
        pipeline
        | 'ReadPredictData' >> beam.io.ReadFromText(predict_data)
        | 'ParsePredictCsv' >> beam.Map(tsv_coder.decode)
        # TODO(b/35194257) Obviate the need for this explicit serialization.
        | 'EncodePredictData' >> beam.Map(predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
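criteo.make_csv_coder / make_tsv_coder are project-specific helpers; a generic sketch of such a coder factory built on tf.Transform's CsvCoder, which other snippets in this collection use directly (the argument order is an assumption):

from tensorflow_transform import coders


def make_csv_coder(schema, column_names, delimiter=','):
    # Hypothetical factory: a CsvCoder that maps delimited text lines to a
    # dict keyed by column name, typed according to the schema. Pass
    # delimiter='\t' for the TSV variant.
    return coders.CsvCoder(column_names, schema, delimiter=delimiter)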
def preprocess(pipeline, args):
  input_metadata = metadata_io.read_metadata(
      os.path.join(args.analyze_output_dir, RAW_METADATA_DIR))

  schema = json.loads(file_io.read_file_to_string(
      os.path.join(args.analyze_output_dir, SCHEMA_FILE)).decode())
  features = json.loads(file_io.read_file_to_string(
      os.path.join(args.analyze_output_dir, FEATURES_FILE)).decode())

  column_names = [col['name'] for col in schema]

  exclude_outputs = None
  if not args.target:
    for name, transform in six.iteritems(features):
      if transform['transform'] == TARGET_TRANSFORM:
        target_name = name
        column_names.remove(target_name)
        exclude_outputs = [target_name]
        del input_metadata.schema.column_schemas[target_name]
        break

  if args.csv_file_pattern:
    coder = coders.CsvCoder(column_names, input_metadata.schema, delimiter=',')
    raw_data = (
        pipeline
        | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
        | 'ParseCsvData' >> beam.Map(coder.decode))
  else:
    columns = ', '.join(column_names)
    query = 'SELECT {columns} FROM `{table}`'.format(columns=columns,
                                                     table=args.bigquery_table)
    raw_data = (
        pipeline
        | 'ReadBiqQueryData'
        >> beam.io.Read(beam.io.BigQuerySource(query=query,
                                               use_standard_sql=True)))

  # Note that prepare_image_transforms does not make embeddings, it just reads
  # the image files and converts them to base64 strings. tft.TransformDataset()
  # will apply the saved model that makes the image embeddings.
  image_columns = image_transform_columns(features)
  raw_data = (
      raw_data
      | 'PreprocessTransferredLearningTransformations'
      >> beam.Map(prepare_image_transforms, image_columns))

  if args.shuffle:
    raw_data = raw_data | 'ShuffleData' >> shuffle()

  transform_fn = (
      pipeline
      | 'ReadTransformFn'
      >> tft_beam_io.ReadTransformFn(args.analyze_output_dir))

  (transformed_data, transform_metadata) = (
      ((raw_data, input_metadata), transform_fn)
      | 'ApplyTensorflowPreprocessingGraph' 
      >> tft.TransformDataset(exclude_outputs))

  tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
  _ = (transformed_data
       | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
       | 'WriteExamples'
       >> beam.io.WriteToTFRecord(
           os.path.join(args.output_dir, args.output_filename_prefix),
           file_name_suffix='.tfrecord.gz'))
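image_transform_columns(features) is assumed; a sketch consistent with how the features dict is inspected above (the IMAGE_TRANSFORM constant is an assumption, analogous to TARGET_TRANSFORM):

import six


def image_transform_columns(features):
  # Hypothetical helper: names of the columns whose configured transform is an
  # image transform, assuming an IMAGE_TRANSFORM constant analogous to
  # TARGET_TRANSFORM above.
  return [name for name, transform in six.iteritems(features)
          if transform['transform'] == IMAGE_TRANSFORM]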
Example #16
def _make_proto_coder(schema):
    raw_feature_spec = _get_raw_feature_spec(schema)
    raw_schema = schema_utils.schema_from_feature_spec(raw_feature_spec)
    return tft_coders.ExampleProtoCoder(raw_schema)
Example #17
def _make_proto_coder(schema):
    raw_feature_spec = _get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    return tft_coders.ExampleProtoCoder(raw_schema)
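The two _make_proto_coder variants differ only in the schema helper they call: schema_utils.schema_from_feature_spec comes from newer tf.Transform releases, dataset_schema.from_feature_spec from older ones. A minimal usage sketch, assuming a schema object and a PCollection parsed_rows of feature dicts already exist in scope:

import apache_beam as beam

# Hypothetical usage: serialize parsed feature dicts and write them out.
proto_coder = _make_proto_coder(schema)
_ = (parsed_rows
     | 'SerializeExamples' >> beam.Map(proto_coder.encode)
     | 'WriteExamples' >> beam.io.WriteToTFRecord(
         'output/examples', file_name_suffix='.tfrecord.gz'))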
Example #18
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """
  work_dir = os.path.join(output_dir, path_constants.TEMP_DIR)

  # 1) The schema can be either defined in-memory or read from a configuration
  #    file, in this case we are creating the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  # TODO(b/33688220) should the transform functions take shuffle as an optional
  # argument?
  # TODO(b/33688275) Should the transform functions have more user friendly
  # names?
  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn, work_dir))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn | 'WriteTransformFn' >> io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    # TODO(b/35653662): Simplify once tf.transform 0.1.5 is released.
    def encode_predict_data(d):
      try:
        return predict_coder.encode(d)
      except Exception:  # pylint: disable=broad-except
        # Compatibility path for tf.transform < 0.1.5
        return predict_coder.encode({
            k: v.encode('utf-8') if isinstance(v, unicode) else v
            for k, v in d.items()
        })

    serialized_examples = (pipeline
                           | 'ReadPredictData' >> _ReadData(
                               predict_data, mode=predict_mode)
                           # TODO(b/35194257) Obviate the need for this explicit
                           # serialization.
                           | 'EncodePredictData' >> beam.Map(
                               encode_predict_data))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))
  _ = (raw_metadata
       | 'WriteRawMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'raw_metadata'), pipeline))

  preprocessing_fn = movielens.make_preprocessing_fn()
  train_features_transformed, transform_fn = (
      (train_data, raw_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  eval_features_transformed = (
      ((eval_data, raw_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  train_dataset_transformed, train_metadata = train_features_transformed
  training_coder = tft_coders.ExampleProtoCoder(train_metadata.schema)
  _ = (train_dataset_transformed
       | 'EncodeTraining' >> beam.Map(training_coder.encode)
       | 'ShuffleTraining' >> (
           _Shuffle())  # pylint: disable=no-value-for-parameter
       | 'WriteTraining' >> beam.io.WriteToTFRecord(
           os.path.join(args.output_dir, 'features_train'),
           file_name_suffix='.tfrecord.gz'))
  _ = (train_metadata
       | 'WriteTransformedMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(args.output_dir, 'transformed_metadata'), pipeline))

  eval_dataset_transformed, eval_metadata = eval_features_transformed
  eval_coder = tft_coders.ExampleProtoCoder(eval_metadata.schema)

  prediction_schema = movielens.make_prediction_schema()
Example #20
         | 'SerializeExamples' >> beam.Map(coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir, path),
             file_name_suffix='.tfrecord.gz'))

  _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      'features_train')

  _ = eval_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
      'features_eval')

  # TODO(b/35300113) Remember to eventually also save the statistics.

  # Save files for online and batch prediction.
  prediction_schema = movielens.make_prediction_schema()
  prediction_coder = tft_coders.ExampleProtoCoder(prediction_schema)
  prediction_data = (
      eval_data
      | 'EncodePrediction' >> beam.Map(prediction_coder.encode))
  _ = (prediction_data
       | 'EncodePredictionAsB64Json' >> beam.Map(_encode_as_b64_json)
       | 'WritePredictDataAsText' >> beam.io.WriteToText(
           os.path.join(args.output_dir, 'features_predict'),
           file_name_suffix='.txt'))
  _ = (prediction_data
       | 'WritePredictDataAsTfRecord' >> beam.io.WriteToTFRecord(
           os.path.join(args.output_dir, 'features_predict'),
           file_name_suffix='.tfrecord.gz'))


def _encode_as_b64_json(serialized_example):
Example #21
        for key in NUMERIC_FEATURE_KEYS
    })
    feature_spec.update({
        key: tf.io.FixedLenFeature([], tf.int64)
        for key in NUMERIC_FEATURE_KEYS_INT
    })

    raw_data_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    return raw_data_metadata


RAW_DATA_METADATA = _create_raw_metadata()

csv_coder_ = csv_coder.CsvCoder(ORDERED_COLUMNS, RAW_DATA_METADATA.schema)
proto_coder = tft_coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)


def cus_input(one_line):
    one_example = csv_coder_.decode(one_line)
    serialized_example = proto_coder.encode(one_example)
    json_example = {"inputs": {
        "b64": base64.b64encode(serialized_example).decode()}}
    return json_example


def predict_json(request):
    """
    You need to hardcode the project, model and version.
    """
    project = 'eeeooosss'
Example #22
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir,
               frequency_threshold):
  """Run pre-processing step as a pipeline.

  Args:
    pipeline: beam pipeline
    training_data: the name of the table to train on.
    eval_data: the name of the table to evaluate on.
    predict_data: the name of the table to predict on.
    output_dir: file path to where to write all the output files.
    frequency_threshold: frequency threshold to use for categorical values.
  """

  # 1) The schema can be either defined in-memory or read from a configuration
  #    file, in this case we are creating the schema in-memory.
  input_schema = reddit.make_input_schema()

  # 2) Read from BigQuery or from CSV.
  train_data = pipeline | 'ReadTrainingData' >> _ReadData(training_data)
  evaluate_data = pipeline | 'ReadEvalData' >> _ReadData(eval_data)

  input_metadata = dataset_metadata.DatasetMetadata(schema=input_schema)

  _ = (input_metadata
       | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
           os.path.join(output_dir, path_constants.RAW_METADATA_DIR),
           pipeline=pipeline))

  preprocessing_fn = reddit.make_preprocessing_fn(frequency_threshold)
  (train_dataset, train_metadata), transform_fn = (
      (train_data, input_metadata)
      | 'AnalyzeAndTransform' >> tft.AnalyzeAndTransformDataset(
          preprocessing_fn))

  # WriteTransformFn writes transform_fn and metadata to fixed subdirectories
  # of output_dir, which are given by path_constants.TRANSFORM_FN_DIR and
  # path_constants.TRANSFORMED_METADATA_DIR.
  _ = (transform_fn
       | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))

  (evaluate_dataset, evaluate_metadata) = (
      ((evaluate_data, input_metadata), transform_fn)
      | 'TransformEval' >> tft.TransformDataset())

  # pylint: disable=expression-not-assigned
  # TODO(b/34231369) Remember to eventually also save the statistics and the
  # metadata.

  train_coder = coders.ExampleProtoCoder(train_metadata.schema)
  (train_dataset
   | 'SerializeTrainExamples' >> beam.Map(train_coder.encode)
   | 'ShuffleTraining' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteTraining' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  evaluate_coder = coders.ExampleProtoCoder(evaluate_metadata.schema)
  (evaluate_dataset
   | 'SerializeEvalExamples' >> beam.Map(evaluate_coder.encode)
   | 'ShuffleEval' >> _Shuffle()  # pylint: disable=no-value-for-parameter
   | 'WriteEval' >> beam.io.WriteToTFRecord(
       os.path.join(output_dir,
                    path_constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
       file_name_suffix='.tfrecord.gz'))

  if predict_data:
    predict_mode = tf.contrib.learn.ModeKeys.INFER
    predict_schema = reddit.make_input_schema(mode=predict_mode)
    predict_coder = coders.ExampleProtoCoder(predict_schema)

    serialized_examples = (pipeline
                           | 'ReadPredictData' >> _ReadData(
                               predict_data, mode=predict_mode)
                           # TODO(b/35194257) Obviate the need for this explicit
                           # serialization.
                           | 'EncodePredictData' >> beam.Map(
                               predict_coder.encode))
    _ = (serialized_examples
         | 'WritePredictDataAsTFRecord' >> beam.io.WriteToTFRecord(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.tfrecord.gz'))
    _ = (serialized_examples
         | 'EncodePredictAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(output_dir,
                          path_constants.TRANSFORMED_PREDICT_DATA_FILE_PREFIX),
             file_name_suffix='.txt'))