def test_csv_decoder_with_schema(self):
    """Checks CSV decoding when feature types are taken from a schema.

    The first column contains integer text but is declared FLOAT in the
    schema, so the expected output holds it as float32.
    """
    input_lines = ['1,1,2.0,hello',
                   '5,5,12.34,world']
    column_names = ['int_feature_parsed_as_float', 'int_feature',
                    'float_feature', 'str_feature']
    schema = text_format.Parse(
        """
        feature { name: "int_feature_parsed_as_float" type: FLOAT }
        feature { name: "int_feature" type: INT }
        feature { name: "float_feature" type: FLOAT }
        feature { name: "str_feature" type: BYTES }
        """, schema_pb2.Schema())
    # np.object_ is used instead of the np.object alias, which was removed
    # in NumPy 1.24; both denote the same object dtype.
    expected_result = [
        {'int_feature_parsed_as_float': np.array([1], dtype=np.float32),
         'int_feature': np.array([1], dtype=np.int64),
         'float_feature': np.array([2.0], dtype=np.float32),
         'str_feature': np.array([b'hello'], dtype=np.object_)},
        {'int_feature_parsed_as_float': np.array([5], dtype=np.float32),
         'int_feature': np.array([5], dtype=np.int64),
         'float_feature': np.array([12.34], dtype=np.float32),
         'str_feature': np.array([b'world'], dtype=np.object_)}]

    with beam.Pipeline() as p:
        result = (p | beam.Create(input_lines) |
                  csv_decoder.DecodeCSV(column_names=column_names,
                                        schema=schema,
                                        infer_type_from_schema=True))
        util.assert_that(
            result,
            test_util.make_example_dict_equal_fn(self, expected_result))
Exemple #2
0
 def test_decode_example_with_beam_pipeline(self):
     """Decodes one serialized tf.train.Example through a Beam pipeline.

     The example carries single-valued and multi-valued int64, float and
     bytes features; each is expected to come back as a numpy array.
     """
     example_proto_text = """
 features {
   feature { key: "int_feature_1"
             value { int64_list { value: [ 0 ] } } }
   feature { key: "int_feature_2"
             value { int64_list { value: [ 1, 2, 3 ] } } }
   feature { key: "float_feature_1"
             value { float_list { value: [ 4.0 ] } } }
   feature { key: "float_feature_2"
             value { float_list { value: [ 5.0, 6.0 ] } } }
   feature { key: "str_feature_1"
             value { bytes_list { value: [ 'female' ] } } }
   feature { key: "str_feature_2"
             value { bytes_list { value: [ 'string', 'list' ] } } }
 }
 """
     # np.object_ is used instead of the np.object alias, which was removed
     # in NumPy 1.24; both denote the same object dtype.
     expected_decoded = {
         'int_feature_1': np.array([0], dtype=np.integer),
         'int_feature_2': np.array([1, 2, 3], dtype=np.integer),
         'float_feature_1': np.array([4.0], dtype=np.floating),
         'float_feature_2': np.array([5.0, 6.0], dtype=np.floating),
         'str_feature_1': np.array([b'female'], dtype=np.object_),
         'str_feature_2': np.array([b'string', b'list'], dtype=np.object_),
     }
     example = tf.train.Example()
     text_format.Merge(example_proto_text, example)
     with beam.Pipeline() as p:
         result = (p
                   | beam.Create([example.SerializeToString()])
                   | tf_example_decoder.DecodeTFExample())
         util.assert_that(
             result,
             test_util.make_example_dict_equal_fn(self, [expected_decoded]))
    def test_csv_decoder_empty_csv(self):
        """Empty input with no column names is expected to give no examples."""
        no_rows = []
        no_examples = []

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(no_rows)
            decoded = lines | csv_decoder.DecodeCSV(column_names=[])
            util.assert_that(
                decoded,
                test_util.make_example_dict_equal_fn(self, no_examples))
Exemple #4
0
 def test_decode_example_with_beam_pipeline(self, example_proto_text,
                                            decoded_example):
     """Decodes a text-format tf.train.Example and compares the result.

     Args:
       example_proto_text: Text-format proto merged into a tf.train.Example.
       decoded_example: The single decoded example the pipeline should emit.
     """
     proto = tf.train.Example()
     text_format.Merge(example_proto_text, proto)
     with beam.Pipeline() as pipeline:
         serialized = pipeline | beam.Create([proto.SerializeToString()])
         decoded = serialized | tf_example_decoder.DecodeTFExample()
         util.assert_that(
             decoded,
             test_util.make_example_dict_equal_fn(self, [decoded_example]))
    def test_csv_decoder_invalid_row(self):
        """A row with fewer fields than column names must raise ValueError."""
        input_lines = ['1,2.0,hello', '5,12.34']
        column_names = ['int_feature', 'float_feature', 'str_feature']

        # assertRaisesRegex is the supported name; the assertRaisesRegexp
        # alias was deprecated in Python 3.2 and removed in Python 3.12.
        with self.assertRaisesRegex(
                ValueError, '.*Columns do not match specified csv headers.*'):
            with beam.Pipeline() as p:
                result = (p | beam.Create(input_lines)
                          | csv_decoder.DecodeCSV(column_names=column_names))
                # The expected value is irrelevant: the pipeline should fail
                # before the matcher can succeed.
                util.assert_that(
                    result, test_util.make_example_dict_equal_fn(self, None))
    def test_csv_decoder_skip_blank_line_single_column(self):
        """With default options, a blank line in a one-column CSV is dropped."""
        rows = ['', '1']
        columns = ['int_feature']
        only_example = {'int_feature': np.array([1], dtype=np.integer)}
        expected = [only_example]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(rows)
            decoded = lines | csv_decoder.DecodeCSV(column_names=columns)
            util.assert_that(
                decoded,
                test_util.make_example_dict_equal_fn(self, expected))
  def test_csv_decoder_int64_max(self):
    """A column containing sys.maxsize is still expected to decode as int64."""
    values = (34, sys.maxsize)
    rows = ['34', str(sys.maxsize)]
    columns = ['feature']
    expected = [{'feature': np.array([value], dtype=np.int64)}
                for value in values]

    with beam.Pipeline() as pipeline:
      lines = pipeline | beam.Create(rows)
      decoded = lines | csv_decoder.DecodeCSV(column_names=columns)
      util.assert_that(
          decoded,
          test_util.make_example_dict_equal_fn(self, expected))
  def test_csv_decoder_negative_values(self):
    """Negative integer text is expected to decode as int64."""
    rows = ['-34', '45']
    columns = ['feature']
    expected = [{'feature': np.array([value], dtype=np.int64)}
                for value in (-34, 45)]

    with beam.Pipeline() as pipeline:
      lines = pipeline | beam.Create(rows)
      decoded = lines | csv_decoder.DecodeCSV(column_names=columns)
      util.assert_that(
          decoded,
          test_util.make_example_dict_equal_fn(self, expected))
    def test_csv_decoder_consider_blank_line_single_column(self):
        """With skip_blank_lines=False a blank line becomes a missing value."""
        rows = ['', '1']
        columns = ['float_feature']
        blank_example = {'float_feature': None}
        value_example = {
            'float_feature': np.array([1.0], dtype=np.floating)
        }
        expected = [blank_example, value_example]

        with beam.Pipeline() as pipeline:
            lines = pipeline | beam.Create(rows)
            decoded = lines | csv_decoder.DecodeCSV(
                column_names=columns, skip_blank_lines=False)
            util.assert_that(
                decoded,
                test_util.make_example_dict_equal_fn(self, expected))
  def test_csv_decoder_with_int_and_float_in_same_column(self):
    """Columns mixing int and float text are expected to decode as float32."""
    rows = ['2,1.5', '1.5,2']
    columns = ['float_feature1', 'float_feature2']
    first_example = {
        'float_feature1': np.array([2.0], dtype=np.float32),
        'float_feature2': np.array([1.5], dtype=np.float32),
    }
    second_example = {
        'float_feature1': np.array([1.5], dtype=np.float32),
        'float_feature2': np.array([2.0], dtype=np.float32),
    }
    expected = [first_example, second_example]

    with beam.Pipeline() as pipeline:
      lines = pipeline | beam.Create(rows)
      decoded = lines | csv_decoder.DecodeCSV(column_names=columns)
      util.assert_that(
          decoded,
          test_util.make_example_dict_equal_fn(self, expected))
    def test_csv_decoder_large_int_categorical_pos(self):
        """A value above sys.maxsize is expected to make the column bytes."""
        input_lines = ['34', str(sys.maxsize + 1)]
        column_names = ['feature']
        # np.object_ is used instead of the np.object alias, which was
        # removed in NumPy 1.24; both denote the same object dtype.
        expected_result = [{
            'feature': np.array([b'34'], dtype=np.object_)
        }, {
            'feature':
            np.array([str(sys.maxsize + 1).encode('utf-8')], dtype=np.object_)
        }]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_with_float_and_string_in_same_column(self):
    """Columns mixing float and string text are expected to decode as bytes."""
    input_lines = ['2.3,abc',
                   'abc,2.3']
    column_names = ['str_feature1', 'str_feature2']
    # np.object_ is used instead of the np.object alias, which was removed
    # in NumPy 1.24; both denote the same object dtype.
    expected_result = [
        {'str_feature1': np.array([b'2.3'], dtype=np.object_),
         'str_feature2': np.array([b'abc'], dtype=np.object_)},
        {'str_feature1': np.array([b'abc'], dtype=np.object_),
         'str_feature2': np.array([b'2.3'], dtype=np.object_)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_csv_record_with_quotes(self):
    """Commas inside a double-quoted field must not split the field."""
    input_lines = ['1,"ab,cd,ef"',
                   '5,"wx,xy,yz"']
    column_names = ['int_feature', 'str_feature']
    # np.object_ is used instead of the np.object alias, which was removed
    # in NumPy 1.24; both denote the same object dtype.
    expected_result = [
        {'int_feature': np.array([1], dtype=np.int64),
         'str_feature': np.array([b'ab,cd,ef'], dtype=np.object_)},
        {'int_feature': np.array([5], dtype=np.int64),
         'str_feature': np.array([b'wx,xy,yz'], dtype=np.object_)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_with_tab_delimiter(self):
    """Decodes tab-separated rows, including a quoted tab and an empty field."""
    input_lines = ['1\t"this is a \ttext"',
                   '5\t']
    column_names = ['int_feature', 'str_feature']
    # np.object_ is used instead of the np.object alias, which was removed
    # in NumPy 1.24; both denote the same object dtype.
    expected_result = [
        {'int_feature': np.array([1], dtype=np.int64),
         'str_feature': np.array([b'this is a \ttext'], dtype=np.object_)},
        # The trailing empty field is expected to decode as a missing value.
        {'int_feature': np.array([5], dtype=np.int64),
         'str_feature': None}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names,
                                      delimiter='\t'))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_with_unicode(self):
    """Non-ASCII text is expected to come back as UTF-8 encoded bytes."""
    input_lines = [u'1,שקרכלשהו,22.34,text field']
    column_names = ['int_feature', 'unicode_feature',
                    'float_feature', 'str_feature']
    # np.object_ is used instead of the np.object alias, which was removed
    # in NumPy 1.24; both denote the same object dtype.
    expected_result = [
        {'int_feature': np.array([1], dtype=np.int64),
         'unicode_feature': np.array([u'שקרכלשהו'.encode('utf-8')],
                                     dtype=np.object_),
         'float_feature': np.array([22.34], dtype=np.float32),
         'str_feature': np.array([b'text field'], dtype=np.object_)}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
  def test_csv_decoder_missing_values(self):
    """Empty fields are expected to decode as None for that feature."""
    input_lines = ['1,,hello',
                   ',12.34,']
    column_names = ['int_feature', 'float_feature', 'str_feature']
    # np.object_ is used instead of the np.object alias, which was removed
    # in NumPy 1.24; both denote the same object dtype.
    expected_result = [
        # The integer column has a missing value in the second row and is
        # expected as float32 here.
        {'int_feature': np.array([1.0], dtype=np.float32),
         'float_feature': None,
         'str_feature': np.array([b'hello'], dtype=np.object_)},
        {'int_feature': None,
         'float_feature': np.array([12.34], dtype=np.float32),
         'str_feature': None}]

    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      util.assert_that(
          result,
          test_util.make_example_dict_equal_fn(self, expected_result))
    def test_csv_decoder_empty_row(self):
        """A row of only delimiters is expected to decode to all-None values."""
        input_lines = [',,', '1,2.0,hello']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        # np.object_ is used instead of the np.object alias, which was
        # removed in NumPy 1.24. The string value is spelled as bytes so it
        # matches the other decoder tests on Python 3, where 'hello' and
        # b'hello' are not equal.
        expected_result = [{
            'int_feature': None,
            'float_feature': None,
            'str_feature': None
        }, {
            'int_feature': np.array([1.0], dtype=np.floating),
            'float_feature': np.array([2.0], dtype=np.floating),
            'str_feature': np.array([b'hello'], dtype=np.object_)
        }]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_example_dict_equal_fn(self, expected_result))
    def test_csv_decoder(self):
        """Decodes two rows with one int, one float and one string column."""
        input_lines = ['1,2.0,hello', '5,12.34,world']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        # np.object_ is used instead of the np.object alias, which was
        # removed in NumPy 1.24; both denote the same object dtype.
        expected_result = [{
            'int_feature': np.array([1], dtype=np.integer),
            'float_feature': np.array([2.0], dtype=np.floating),
            'str_feature': np.array([b'hello'], dtype=np.object_)
        }, {
            'int_feature':
            np.array([5], dtype=np.integer),
            'float_feature':
            np.array([12.34], dtype=np.floating),
            'str_feature':
            np.array([b'world'], dtype=np.object_)
        }]

        with beam.Pipeline() as p:
            result = (p | beam.Create(input_lines)
                      | csv_decoder.DecodeCSV(column_names=column_names))
            util.assert_that(
                result,
                test_util.make_example_dict_equal_fn(self, expected_result))