Example #1
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline, input_dict, exec_properties):  # pylint: disable=unused-argument
  """Read a CSV file and transform it to TF examples.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input-base: input dir that contains csv data. csv files must have a
        header line.
    exec_properties: A dict of execution properties.

  Returns:
    PCollection of TF examples.
  """
  input_base = types.get_single_instance(input_dict['input-base'])
  input_base_uri = input_base.uri
  csv_uri = io_utils.get_only_uri_in_dir(input_base_uri)
  tf.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_uri))

  return (
      pipeline
      | 'ReadFromText' >> beam.io.ReadFromText(csv_uri, skip_header_lines=1)
      | 'ParseCSV' >> csv_decoder.DecodeCSV(
          io_utils.load_csv_column_names(csv_uri))
      | 'ToTFExample' >> beam.Map(_dict_to_example))
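
The _dict_to_example helper used by this pipeline (and by Example #4 and Example #30 below) is not shown on this page. Below is a minimal sketch of what it plausibly does, assuming DecodeCSV emits dicts mapping column names to small numpy arrays (or None for missing values); the actual TFX implementation may differ.

import tensorflow as tf

def _dict_to_example(instance):
  """Converts one decoded CSV row (a dict of numpy arrays) to tf.train.Example."""
  feature = {}
  for key, value in instance.items():
    if value is None:
      feature[key] = tf.train.Feature()  # feature present but with no value
    elif value.dtype.kind in ('i', 'u'):
      feature[key] = tf.train.Feature(
          int64_list=tf.train.Int64List(value=value.tolist()))
    elif value.dtype.kind == 'f':
      feature[key] = tf.train.Feature(
          float_list=tf.train.FloatList(value=value.tolist()))
    else:
      feature[key] = tf.train.Feature(
          bytes_list=tf.train.BytesList(value=value.tolist()))
  return tf.train.Example(features=tf.train.Features(feature=feature))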
  def test_csv_decoder_with_schema(self):
    input_lines = ['1,1,2.0,hello',
                   '5,5,12.34,world']
    column_names = ['int_feature_parsed_as_float', 'int_feature',
                    'float_feature', 'str_feature']
    schema = text_format.Parse(
        """
        feature { name: "int_feature_parsed_as_float" type: FLOAT }
        feature { name: "int_feature" type: INT }
        feature { name: "float_feature" type: FLOAT }
        feature { name: "str_feature" type: BYTES }
        """, schema_pb2.Schema())
    expected_result = [
        {'int_feature_parsed_as_float': np.array([1], dtype=np.float32),
         'int_feature': np.array([1], dtype=np.int64),
         'float_feature': np.array([2.0], dtype=np.float32),
         'str_feature': np.array([b'hello'], dtype=np.object)},
        {'int_feature_parsed_as_float': np.array([5], dtype=np.float32),
         'int_feature': np.array([5], dtype=np.int64),
         'float_feature': np.array([12.34], dtype=np.float32),
         'str_feature': np.array([b'world'], dtype=np.object)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names, schema=schema,
                                      infer_type_from_schema=True))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_with_schema(self):
    input_lines = ['1,1,2.0,hello',
                   '5,5,12.34,world']
    column_names = ['int_feature_parsed_as_float', 'int_feature',
                    'float_feature', 'str_feature']
    schema = text_format.Parse(
        """
        feature { name: "int_feature_parsed_as_float" type: FLOAT }
        feature { name: "int_feature" type: INT }
        feature { name: "float_feature" type: FLOAT }
        feature { name: "str_feature" type: BYTES }
        """, schema_pb2.Schema())
    expected_result = [
        pa.RecordBatch.from_arrays([
            pa.array([[1], [5]], pa.list_(pa.float32())),
            pa.array([[1], [5]], pa.list_(pa.int64())),
            pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
            pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
        ], [
            'int_feature_parsed_as_float', 'int_feature', 'float_feature',
            'str_feature'
        ])
    ]

    with beam.Pipeline() as p:
      result = (
          p | beam.Create(input_lines, reshuffle=False) | csv_decoder.DecodeCSV(
              column_names=column_names,
              schema=schema,
              infer_type_from_schema=True))
      util.assert_that(
          result,
          test_util.make_arrow_record_batches_equal_fn(self, expected_result))
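
test_util.make_arrow_record_batches_equal_fn comes from TFDV's test utilities. Below is a rough sketch of the matcher shape that util.assert_that expects, assuming an order-preserving comparison via pa.RecordBatch.equals; the real helper may be more lenient about ordering.

import pyarrow as pa

def make_arrow_record_batches_equal_fn(test, expected_batches):
  """Returns a callable for util.assert_that comparing RecordBatches."""
  def _matcher(actual_batches):
    # Beam materializes the PCollection and passes its contents as a list.
    test.assertEqual(len(actual_batches), len(expected_batches))
    for actual, expected in zip(actual_batches, expected_batches):
      test.assertTrue(actual.equals(expected),
                      '{} != {}'.format(actual, expected))
  return _matcher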
Example #4
def _CsvToSerializedExample(  # pylint: disable=invalid-name
    pipeline, csv_uri):
  """Read a csv file and transform it to serialized TF examples."""
  return (pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(csv_uri, skip_header_lines=1)
          | 'ParseCSV' >> csv_decoder.DecodeCSV(
              io_utils.load_csv_column_names(csv_uri))
          | 'ToSerializedTFExample' >> beam.Map(_dict_to_example))
    def test_csv_decoder_empty_csv(self):
        input_lines = []
        expected_result = []

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=[]))
            util.assert_that(
                result, _make_example_dict_equal_fn(self, expected_result))
    def test_csv_decoder_skip_blank_line_single_column(self):
        input_lines = ['', '1']
        column_names = ['int_feature']
        expected_result = [{'int_feature': np.array([1], dtype=np.int64)}]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result, _make_example_dict_equal_fn(self, expected_result))
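
_make_example_dict_equal_fn (like test_util.make_example_dict_equal_fn in other examples) builds the assertion callable these tests hand to util.assert_that. A minimal sketch, assuming each element is a dict of numpy arrays and that element order is preserved in these small test pipelines; the real TFDV helper may compare order-independently.

import numpy as np

def _make_example_dict_equal_fn(test, expected):
  """Returns a matcher verifying decoded example dicts against expected."""
  def _matcher(actual):
    test.assertEqual(len(actual), len(expected))
    for actual_dict, expected_dict in zip(actual, expected):
      test.assertCountEqual(actual_dict.keys(), expected_dict.keys())
      for key, expected_value in expected_dict.items():
        if expected_value is None:
          test.assertIsNone(actual_dict[key])
        else:
          np.testing.assert_array_equal(actual_dict[key], expected_value)
  return _matcher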
Example #7
    def test_csv_decoder_empty_csv(self):
        input_lines = []
        expected_result = []

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines, reshuffle=False)
                      | csv_decoder.DecodeCSV(column_names=[]))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
    def test_csv_decoder_invalid_row(self):
        input_lines = ['1,2.0,hello', '5,12.34']
        column_names = ['int_feature', 'float_feature', 'str_feature']

        with self.assertRaisesRegex(
                ValueError, '.*Columns do not match specified csv headers.*'):
            with beam.Pipeline() as p:
                result = (p | beam.Create(input_lines)
                          | csv_decoder.DecodeCSV(column_names=column_names))
                util.assert_that(
                    result, test_util.make_example_dict_equal_fn(self, None))
    def test_csv_decoder_invalid_row(self):
        input_lines = ['1,2.0,hello', '5,12.34']
        column_names = ['int_feature', 'float_feature', 'str_feature']

        with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
                ValueError, '.*Columns do not match specified csv headers.*'):
            with beam.Pipeline() as p:
                result = (p | beam.Create(input_lines, reshuffle=False)
                          | csv_decoder.DecodeCSV(column_names=column_names))
                util.assert_that(
                    result,
                    test_util.make_arrow_record_batches_equal_fn(self, None))
  def test_csv_decoder_negative_values(self):
    input_lines = ['-34', '45']
    column_names = ['feature']
    expected_result = [
        {'feature': np.array([-34], dtype=np.int64)},
        {'feature': np.array([45], dtype=np.int64)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_int64_max(self):
    input_lines = ['34', str(sys.maxsize)]
    column_names = ['feature']
    expected_result = [
        {'feature': np.array([34], dtype=np.int64)},
        {'feature': np.array([sys.maxsize], dtype=np.int64)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
def compute_stats(input_handle,
                  stats_path,
                  max_rows=None,
                  for_eval=False,
                  pipeline_args=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery.
    for_eval: Query for eval set rows from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_handle.lower().endswith('csv'):
      raw_data = (pipeline
                  | 'ReadData' >> beam.io.textio.ReadFromText(
                      file_pattern=input_handle, skip_header_lines=1)
                  | 'DecodeData' >> csv_decoder.DecodeCSV(
                      column_names=taxi.CSV_COLUMN_NAMES))
    else:
      query = taxi.make_sql(
          table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'ConvertToTFDVInput' >> beam.Map(
              lambda x: {
                  key: np.asarray([x[key]])  # pylint: disable=g-long-lambda
                  for key in x if x[key] is not None
              }))
      # TODO(pachristopher): Remove this once TFDV 0.14 is released.
      (major, minor, _) = tfdv.__version__.split('.')
      if int(major) > 0 or int(minor) >= 14:
        raw_data |= ('BatchExamplesToArrowTables' >>
                     batch_util.BatchExamplesToArrowTables())

    _ = (raw_data
         | 'GenerateStatistics' >> tfdv.GenerateStatistics()
         | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
             stats_path,
             shard_name_template='',
             coder=beam.coders.ProtoCoder(
                 statistics_pb2.DatasetFeatureStatisticsList)))
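
A hypothetical invocation of compute_stats for the CSV branch; the paths and runner flag below are illustrative, not part of the original example.

compute_stats(
    input_handle='data/train/data.csv',
    stats_path='data/train/stats.tfrecord',
    pipeline_args=['--runner=DirectRunner'])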
    def test_csv_decoder_consider_blank_line_single_column(self):
        input_lines = ['', '1']
        column_names = ['float_feature']
        expected_result = [{
            'float_feature': None
        }, {
            'float_feature': np.array([1.0], dtype=np.float32)
        }]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, skip_blank_lines=False))
            util.assert_that(
                result, _make_example_dict_equal_fn(self, expected_result))
Example #14
    def test_csv_decoder_consider_blank_line_single_column(self):
        input_lines = ['', '1']
        column_names = ['int_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([None, [1]], pa.list_(pa.int64())),
            ], ['int_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, skip_blank_lines=False))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #15
    def test_csv_decoder_int64_max(self):
        input_lines = ['34', str(sys.maxsize)]
        column_names = ['feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[34], [sys.maxsize]], pa.list_(pa.int64())),
            ], ['feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #16
    def test_csv_decoder_negative_values(self):
        input_lines = ['-34', '45']
        column_names = ['feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[-34], [45]], pa.list_(pa.int64())),
            ], ['feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #17
    def test_csv_decoder_large_int_categorical_neg(self):
        input_lines = ['34', str(-(sys.maxsize + 2))]
        column_names = ['feature']
        expected_result = [{
            'feature': np.array(['34'], dtype=np.object)
        }, {
            'feature':
            np.array([str(-(sys.maxsize + 2))], dtype=np.object)
        }]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result, _make_example_dict_equal_fn(self, expected_result))
Example #18
    def test_csv_decoder_skip_blank_line(self):
        input_lines = ['', '1,2']
        column_names = ['int_feature1', 'int_feature2']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1]], pa.list_(pa.int64())),
                pa.array([[2]], pa.list_(pa.int64())),
            ], ['int_feature1', 'int_feature2'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines, reshuffle=False)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #19
    def test_csv_decoder_with_int_and_float_in_same_column(self):
        input_lines = ['2,1.5', '1.5,2']
        column_names = ['float_feature1', 'float_feature2']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[2.0], [1.5]], pa.list_(pa.float32())),
                pa.array([[1.5], [2.0]], pa.list_(pa.float32())),
            ], ['float_feature1', 'float_feature2'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #20
    def test_csv_decoder_with_tab_delimiter(self):
        input_lines = ['1\t"this is a \ttext"', '5\t']
        column_names = ['int_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], [5]], pa.list_(pa.int64())),
                pa.array([[b'this is a \ttext'], None], pa.list_(pa.binary())),
            ], ['int_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines) | csv_decoder.DecodeCSV(
                column_names=column_names, delimiter='\t'))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
Example #21
    def test_csv_decoder_with_float_and_string_in_same_column(self):
        input_lines = ['2.3,abc', 'abc,2.3']
        column_names = ['str_feature1', 'str_feature2']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[b'2.3'], [b'abc']], pa.list_(pa.binary())),
                pa.array([[b'abc'], [b'2.3']], pa.list_(pa.binary())),
            ], ['str_feature1', 'str_feature2'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
  def test_csv_decoder_csv_record_with_quotes(self):
    input_lines = ['1,"ab,cd,ef"',
                   '5,"wx,xy,yz"']
    column_names = ['int_feature', 'str_feature']
    expected_result = [
        {'int_feature': np.array([1], dtype=np.int64),
         'str_feature': np.array([b'ab,cd,ef'], dtype=np.object)},
        {'int_feature': np.array([5], dtype=np.int64),
         'str_feature': np.array([b'wx,xy,yz'], dtype=np.object)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
Example #23
    def test_csv_decoder_large_int_categorical_neg(self):
        input_lines = ['34', str(-(sys.maxsize + 2))]
        column_names = ['feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[b'34'], [str(-(sys.maxsize + 2)).encode('utf-8')]],
                         pa.list_(pa.binary())),
            ], ['feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
  def test_csv_decoder_with_float_and_string_in_same_column(self):
    input_lines = ['2.3,abc',
                   'abc,2.3']
    column_names = ['str_feature1', 'str_feature2']
    expected_result = [
        {'str_feature1': np.array([b'2.3'], dtype=np.object),
         'str_feature2': np.array([b'abc'], dtype=np.object)},
        {'str_feature1': np.array([b'abc'], dtype=np.object),
         'str_feature2': np.array([b'2.3'], dtype=np.object)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_with_int_and_float_in_same_column(self):
    input_lines = ['2,1.5',
                   '1.5,2']
    column_names = ['float_feature1', 'float_feature2']
    expected_result = [
        {'float_feature1': np.array([2.0], dtype=np.float32),
         'float_feature2': np.array([1.5], dtype=np.float32)},
        {'float_feature1': np.array([1.5], dtype=np.float32),
         'float_feature2': np.array([2.0], dtype=np.float32)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
Example #26
    def test_csv_decoder_missing_values(self):
        input_lines = ['1,,hello', ',12.34,']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        expected_result = [
            pa.Table.from_arrays([
                pa.array([[1], None], pa.list_(pa.int64())),
                pa.array([None, [12.34]], pa.list_(pa.float32())),
                pa.array([[b'hello'], None], pa.list_(pa.binary())),
            ], ['int_feature', 'float_feature', 'str_feature'])
        ]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_result))
  def test_csv_decoder_large_int_categorical_pos(self):
    input_lines = ['34', str(sys.maxsize+1)]
    column_names = ['feature']
    expected_result = [
        pa.RecordBatch.from_arrays([
            pa.array([[b'34'], [str(sys.maxsize + 1).encode('utf-8')]],
                     pa.list_(pa.binary())),
        ], ['feature'])
    ]

    with beam.Pipeline() as p:
      result = (
          p | beam.Create(input_lines, reshuffle=False)
          | csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_arrow_record_batches_equal_fn(self, expected_result))
  def test_csv_decoder_with_tab_delimiter(self):
    input_lines = ['1\t"this is a \ttext"',
                   '5\t']
    column_names = ['int_feature', 'str_feature']
    expected_result = [
        {'int_feature': np.array([1], dtype=np.int64),
         'str_feature': np.array([b'this is a \ttext'], dtype=np.object)},
        {'int_feature': np.array([5], dtype=np.int64),
         'str_feature': None}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names,
                                      delimiter='\t'))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_with_unicode(self):
    input_lines = [u'1,שקרכלשהו,22.34,text field']
    column_names = ['int_feature', 'unicode_feature',
                    'float_feature', 'str_feature']
    expected_result = [
        {'int_feature': np.array([1], dtype=np.int64),
         'unicode_feature': np.array([u'שקרכלשהו'.encode('utf-8')],
                                     dtype=np.object),
         'float_feature': np.array([22.34], dtype=np.float32),
         'str_feature': np.array([b'text field'], dtype=np.object)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
Example #30
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.TfxArtifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read CSV files and transform them to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have a
        header line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a glob-style file pattern
      relative to the root directory given by input_base that maps to input
      files.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  input_base_uri = types.get_single_uri(input_dict['input_base'])
  csv_pattern = os.path.join(input_base_uri, split_pattern)
  tf.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_pattern))

  csv_files = tf.gfile.Glob(csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(csv_pattern))

  # All files in the same split must share the header of the first file.
  column_names = io_utils.load_csv_column_names(csv_files[0])
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(csv_pattern))

  return (pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                   skip_header_lines=1)
          | 'ParseCSV' >> csv_decoder.DecodeCSV(column_names)
          | 'ToTFExample' >> beam.Map(_dict_to_example))
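
Both _CsvToExample variants rely on io_utils.load_csv_column_names to read the header row. A minimal sketch of that utility, assuming the first line of the file is a comma-separated header; the real TFX implementation may differ.

import tensorflow as tf

def load_csv_column_names(csv_file):
  """Reads a CSV file's header line and returns the column names."""
  with tf.gfile.Open(csv_file) as f:
    return f.readline().strip().split(',')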