def _CsvToExample(  # pylint: disable=invalid-name
    pipeline,
    input_dict,
    exec_properties):  # pylint: disable=unused-argument
  """Reads a CSV file and converts its rows to TF examples.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input-base: input dir that contains csv data. The csv file must have
        a header line; it is skipped when reading and is used to obtain the
        column names.
    exec_properties: A dict of execution properties (unused).

  Returns:
    PCollection of TF examples.
  """
  artifact = types.get_single_instance(input_dict['input-base'])
  csv_uri = io_utils.get_only_uri_in_dir(artifact.uri)
  tf.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_uri))

  # Each stage is named separately; the transform labels are unchanged.
  raw_lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
      csv_uri, skip_header_lines=1)
  parsed_rows = raw_lines | 'ParseCSV' >> csv_decoder.DecodeCSV(
      io_utils.load_csv_column_names(csv_uri))
  return parsed_rows | 'ToTFExample' >> beam.Map(_dict_to_example)
def test_csv_decoder_with_schema(self):
  """Decodes CSV lines using feature types supplied by a schema.

  The first two columns hold the same text, but the schema declares the first
  as FLOAT and the second as INT; the expected dicts carry float32 and int64
  arrays respectively.
  """
  input_lines = ['1,1,2.0,hello', '5,5,12.34,world']
  column_names = ['int_feature_parsed_as_float', 'int_feature',
                  'float_feature', 'str_feature']
  schema = text_format.Parse(
      ' feature { name: "int_feature_parsed_as_float" type: FLOAT }'
      ' feature { name: "int_feature" type: INT }'
      ' feature { name: "float_feature" type: FLOAT }'
      ' feature { name: "str_feature" type: BYTES } ',
      schema_pb2.Schema())
  # `np.object` was only an alias of the builtin `object` and no longer exists
  # in recent NumPy releases; the builtin names the same dtype everywhere.
  expected_result = [
      {'int_feature_parsed_as_float': np.array([1], dtype=np.float32),
       'int_feature': np.array([1], dtype=np.int64),
       'float_feature': np.array([2.0], dtype=np.float32),
       'str_feature': np.array([b'hello'], dtype=object)},
      {'int_feature_parsed_as_float': np.array([5], dtype=np.float32),
       'int_feature': np.array([5], dtype=np.int64),
       'float_feature': np.array([12.34], dtype=np.float32),
       'str_feature': np.array([b'world'], dtype=object)}]

  with beam.Pipeline() as p:
    result = (p | beam.Create(input_lines) |
              csv_decoder.DecodeCSV(column_names=column_names, schema=schema,
                                    infer_type_from_schema=True))
    util.assert_that(
        result,
        test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_with_schema(self):
  """Decodes CSV lines into one record batch typed by an explicit schema."""
  rows = ['1,1,2.0,hello', '5,5,12.34,world']
  names = [
      'int_feature_parsed_as_float', 'int_feature', 'float_feature',
      'str_feature'
  ]
  schema_text = (
      ' feature { name: "int_feature_parsed_as_float" type: FLOAT }'
      ' feature { name: "int_feature" type: INT }'
      ' feature { name: "float_feature" type: FLOAT }'
      ' feature { name: "str_feature" type: BYTES } ')
  schema = text_format.Parse(schema_text, schema_pb2.Schema())

  # One list-typed Arrow column per CSV column, in the same order as `names`.
  float_from_int_col = pa.array([[1], [5]], pa.list_(pa.float32()))
  int_col = pa.array([[1], [5]], pa.list_(pa.int64()))
  float_col = pa.array([[2.0], [12.34]], pa.list_(pa.float32()))
  bytes_col = pa.array([[b'hello'], [b'world']], pa.list_(pa.binary()))
  want = [
      pa.RecordBatch.from_arrays(
          [float_from_int_col, int_col, float_col, bytes_col], [
              'int_feature_parsed_as_float', 'int_feature', 'float_feature',
              'str_feature'
          ])
  ]

  with beam.Pipeline() as p:
    source = p | beam.Create(rows, reshuffle=False)
    decoded = source | csv_decoder.DecodeCSV(
        column_names=names, schema=schema, infer_type_from_schema=True)
    matcher = test_util.make_arrow_record_batches_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def _CsvToSerializedExample(  # pylint: disable=invalid-name
    pipeline, csv_uri):
  """Reads the csv file at `csv_uri` and transforms its rows to tf examples.

  The header line is skipped when reading; column names are loaded separately
  from the same file.
  """
  text_lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
      csv_uri, skip_header_lines=1)
  decoded_rows = text_lines | 'ParseCSV' >> csv_decoder.DecodeCSV(
      io_utils.load_csv_column_names(csv_uri))
  return decoded_rows | 'ToSerializedTFExample' >> beam.Map(_dict_to_example)
def test_csv_decoder_empty_csv(self):
  """Decoding no lines with no column names is expected to yield nothing."""
  rows = []
  want = []
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=[])
    matcher = _make_example_dict_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_skip_blank_line_single_column(self):
  """A blank line in a single-column CSV produces no record by default."""
  rows = ['', '1']
  names = ['int_feature']
  # Only the non-blank line is expected to show up in the output.
  want = [{'int_feature': np.array([1], dtype=np.integer)}]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = _make_example_dict_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_empty_csv(self):
  """Decoding no lines with no column names is expected to yield no tables."""
  rows = []
  want = []
  with beam.Pipeline() as p:
    source = p | beam.Create(rows, reshuffle=False)
    decoded = source | csv_decoder.DecodeCSV(column_names=[])
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_invalid_row(self):
  """A row with fewer cells than declared columns makes the pipeline fail.

  The second input line has two cells for three column names; running the
  pipeline is expected to raise a ValueError whose message mentions the
  header mismatch.
  """
  input_lines = ['1,2.0,hello', '5,12.34']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  # `assertRaisesRegexp` is a deprecated alias that newer Python versions no
  # longer provide; `assertRaisesRegex` is the supported spelling.
  with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
      ValueError, '.*Columns do not match specified csv headers.*'):
    with beam.Pipeline() as p:
      result = (p | beam.Create(input_lines) |
                csv_decoder.DecodeCSV(column_names=column_names))
      # The expected value is irrelevant: the pipeline should fail first.
      util.assert_that(
          result, test_util.make_example_dict_equal_fn(self, None))
def test_csv_decoder_invalid_row(self):
  """A row with too few cells is expected to raise a header-mismatch error."""
  rows = ['1,2.0,hello', '5,12.34']
  names = ['int_feature', 'float_feature', 'str_feature']
  error_pattern = '.*Columns do not match specified csv headers.*'
  with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
      ValueError, error_pattern):
    with beam.Pipeline() as p:
      source = p | beam.Create(rows, reshuffle=False)
      decoded = source | csv_decoder.DecodeCSV(column_names=names)
      # No expected batches are given; the failure is what is being checked.
      matcher = test_util.make_arrow_record_batches_equal_fn(self, None)
      util.assert_that(decoded, matcher)
def test_csv_decoder_negative_values(self):
  """A column containing a negative integer is decoded as int64 values."""
  rows = ['-34', '45']
  names = ['feature']
  want = [{'feature': np.array([value], dtype=np.int64)}
          for value in (-34, 45)]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_example_dict_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_int64_max(self):
  """A value equal to sys.maxsize is still decoded as an int64."""
  largest = sys.maxsize
  rows = ['34', str(largest)]
  names = ['feature']
  want = [{'feature': np.array([value], dtype=np.int64)}
          for value in (34, largest)]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_example_dict_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def compute_stats(input_handle,
                  stats_path,
                  max_rows=None,
                  for_eval=False,
                  pipeline_args=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data. Any value ending in 'csv'
      (case-insensitive) is treated as a csv path.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery
    for_eval: Query for eval set rows from BigQuery
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
  """
  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_handle.lower().endswith('csv'):
      raw_data = (
          pipeline
          | 'ReadData' >> beam.io.textio.ReadFromText(
              file_pattern=input_handle, skip_header_lines=1)
          | 'DecodeData' >> csv_decoder.DecodeCSV(
              column_names=taxi.CSV_COLUMN_NAMES))
    else:
      query = taxi.make_sql(
          table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
      # Each BigQuery row becomes a dict of single-element arrays; columns
      # whose value is None are dropped from the dict.
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'ConvertToTFDVInput' >> beam.Map(
              lambda x: {  # pylint: disable=g-long-lambda
                  key: np.asarray([x[key]])
                  for key in x if x[key] is not None
              }))
      # NOTE(review): the batching step is applied to the BigQuery branch
      # only, on the assumption that DecodeCSV output needs no further
      # batching -- confirm against the TFDV version in use.
      # TODO(pachristopher): Remove this once TFDV 0.14 is released.
      # Only the first two components are needed; slicing avoids a ValueError
      # when the version string does not have exactly three dotted parts.
      major, minor = tfdv.__version__.split('.')[:2]
      if int(major) > 0 or int(minor) >= 14:
        raw_data |= ('BatchExamplesToArrowTables' >>
                     batch_util.BatchExamplesToArrowTables())

    _ = (
        raw_data
        | 'GenerateStatistics' >> tfdv.GenerateStatistics()
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            stats_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))
def test_csv_decoder_consider_blank_line_single_column(self):
  """With skip_blank_lines=False a blank line yields a record with None."""
  rows = ['', '1']
  names = ['float_feature']
  blank_record = {'float_feature': None}
  value_record = {'float_feature': np.array([1.0], dtype=np.floating)}
  want = [blank_record, value_record]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(
        column_names=names, skip_blank_lines=False)
    matcher = _make_example_dict_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_consider_blank_line_single_column(self):
  """With skip_blank_lines=False a blank line becomes a null table entry."""
  rows = ['', '1']
  names = ['int_feature']
  # The blank line is expected as a null entry ahead of the parsed value.
  int_col = pa.array([None, [1]], pa.list_(pa.int64()))
  want = [pa.Table.from_arrays([int_col], ['int_feature'])]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(
        column_names=names, skip_blank_lines=False)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_int64_max(self):
  """A value equal to sys.maxsize is still decoded into an int64 column."""
  rows = ['34', str(sys.maxsize)]
  names = ['feature']
  int_col = pa.array([[34], [sys.maxsize]], pa.list_(pa.int64()))
  want = [pa.Table.from_arrays([int_col], ['feature'])]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_negative_values(self):
  """A column containing a negative integer decodes into an int64 column."""
  rows = ['-34', '45']
  names = ['feature']
  int_col = pa.array([[-34], [45]], pa.list_(pa.int64()))
  want = [pa.Table.from_arrays([int_col], ['feature'])]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_large_int_categorical_neg(self):
  """An integer below -(sys.maxsize + 1) turns the column into strings.

  The second value is -(sys.maxsize + 2); the expected output keeps both
  values of the column as their original text rather than as integers.
  """
  too_small = str(-(sys.maxsize + 2))
  input_lines = ['34', too_small]
  column_names = ['feature']
  # `np.object` was only an alias of the builtin `object` and no longer exists
  # in recent NumPy releases; the builtin names the same dtype everywhere.
  expected_result = [{
      'feature': np.array(['34'], dtype=object)
  }, {
      'feature': np.array([too_small], dtype=object)
  }]
  with beam.Pipeline() as p:
    result = (p | beam.Create(input_lines) |
              csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result, _make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_skip_blank_line(self):
  """A blank line in a two-column CSV produces no row by default."""
  rows = ['', '1,2']
  names = ['int_feature1', 'int_feature2']
  first_col = pa.array([[1]], pa.list_(pa.int64()))
  second_col = pa.array([[2]], pa.list_(pa.int64()))
  want = [
      pa.Table.from_arrays([first_col, second_col],
                           ['int_feature1', 'int_feature2'])
  ]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows, reshuffle=False)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_with_int_and_float_in_same_column(self):
  """Columns mixing integer and float text decode as float32 columns."""
  rows = ['2,1.5', '1.5,2']
  names = ['float_feature1', 'float_feature2']
  first_col = pa.array([[2.0], [1.5]], pa.list_(pa.float32()))
  second_col = pa.array([[1.5], [2.0]], pa.list_(pa.float32()))
  want = [
      pa.Table.from_arrays([first_col, second_col],
                           ['float_feature1', 'float_feature2'])
  ]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_with_tab_delimiter(self):
  """Tab-delimited input keeps a quoted tab and nulls an empty cell."""
  rows = ['1\t"this is a \ttext"', '5\t']
  names = ['int_feature', 'str_feature']
  int_col = pa.array([[1], [5]], pa.list_(pa.int64()))
  # The second row has nothing after its delimiter, hence the null entry.
  str_col = pa.array([[b'this is a \ttext'], None], pa.list_(pa.binary()))
  want = [
      pa.Table.from_arrays([int_col, str_col],
                           ['int_feature', 'str_feature'])
  ]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(
        column_names=names, delimiter='\t')
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_with_float_and_string_in_same_column(self):
  """Columns mixing float text and plain text decode as binary columns."""
  rows = ['2.3,abc', 'abc,2.3']
  names = ['str_feature1', 'str_feature2']
  first_col = pa.array([[b'2.3'], [b'abc']], pa.list_(pa.binary()))
  second_col = pa.array([[b'abc'], [b'2.3']], pa.list_(pa.binary()))
  want = [
      pa.Table.from_arrays([first_col, second_col],
                           ['str_feature1', 'str_feature2'])
  ]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_csv_record_with_quotes(self):
  """Commas inside a double-quoted cell are kept as part of the value."""
  input_lines = ['1,"ab,cd,ef"', '5,"wx,xy,yz"']
  column_names = ['int_feature', 'str_feature']
  # `np.object` was only an alias of the builtin `object` and no longer exists
  # in recent NumPy releases; the builtin names the same dtype everywhere.
  expected_result = [
      {'int_feature': np.array([1], dtype=np.int64),
       'str_feature': np.array([b'ab,cd,ef'], dtype=object)},
      {'int_feature': np.array([5], dtype=np.int64),
       'str_feature': np.array([b'wx,xy,yz'], dtype=object)}]

  with beam.Pipeline() as p:
    result = (p | beam.Create(input_lines) |
              csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_large_int_categorical_neg(self):
  """An integer below -(sys.maxsize + 1) makes the column a binary column."""
  rows = ['34', str(-(sys.maxsize + 2))]
  names = ['feature']
  # Both values are expected back as their original text, UTF-8 encoded.
  bytes_col = pa.array(
      [[b'34'], [str(-(sys.maxsize + 2)).encode('utf-8')]],
      pa.list_(pa.binary()))
  want = [pa.Table.from_arrays([bytes_col], ['feature'])]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_with_float_and_string_in_same_column(self):
  """Columns mixing float text and plain text decode as byte strings."""
  input_lines = ['2.3,abc', 'abc,2.3']
  column_names = ['str_feature1', 'str_feature2']
  # `np.object` was only an alias of the builtin `object` and no longer exists
  # in recent NumPy releases; the builtin names the same dtype everywhere.
  expected_result = [
      {'str_feature1': np.array([b'2.3'], dtype=object),
       'str_feature2': np.array([b'abc'], dtype=object)},
      {'str_feature1': np.array([b'abc'], dtype=object),
       'str_feature2': np.array([b'2.3'], dtype=object)}]

  with beam.Pipeline() as p:
    result = (p | beam.Create(input_lines) |
              csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_with_int_and_float_in_same_column(self):
  """Columns mixing integer and float text decode as float32 values."""
  rows = ['2,1.5', '1.5,2']
  names = ['float_feature1', 'float_feature2']
  first_record = {
      'float_feature1': np.array([2.0], dtype=np.float32),
      'float_feature2': np.array([1.5], dtype=np.float32),
  }
  second_record = {
      'float_feature1': np.array([1.5], dtype=np.float32),
      'float_feature2': np.array([2.0], dtype=np.float32),
  }
  want = [first_record, second_record]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_example_dict_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_missing_values(self):
  """Empty cells become null entries in the column they belong to."""
  rows = ['1,,hello', ',12.34,']
  names = ['int_feature', 'float_feature', 'str_feature']
  int_col = pa.array([[1], None], pa.list_(pa.int64()))
  float_col = pa.array([None, [12.34]], pa.list_(pa.float32()))
  str_col = pa.array([[b'hello'], None], pa.list_(pa.binary()))
  want = [
      pa.Table.from_arrays([int_col, float_col, str_col],
                           ['int_feature', 'float_feature', 'str_feature'])
  ]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_tables_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_large_int_categorical_pos(self):
  """An integer above sys.maxsize makes the column a binary column."""
  rows = ['34', str(sys.maxsize+1)]
  names = ['feature']
  # Both values are expected back as their original text, UTF-8 encoded.
  bytes_col = pa.array(
      [[b'34'], [str(sys.maxsize + 1).encode('utf-8')]],
      pa.list_(pa.binary()))
  want = [pa.RecordBatch.from_arrays([bytes_col], ['feature'])]
  with beam.Pipeline() as p:
    source = p | beam.Create(rows, reshuffle=False)
    decoded = source | csv_decoder.DecodeCSV(column_names=names)
    matcher = test_util.make_arrow_record_batches_equal_fn(self, want)
    util.assert_that(decoded, matcher)
def test_csv_decoder_with_tab_delimiter(self):
  """Tab-delimited input keeps a quoted tab and maps an empty cell to None."""
  input_lines = ['1\t"this is a \ttext"', '5\t']
  column_names = ['int_feature', 'str_feature']
  # `np.object` was only an alias of the builtin `object` and no longer exists
  # in recent NumPy releases; the builtin names the same dtype everywhere.
  expected_result = [
      {'int_feature': np.array([1], dtype=np.int64),
       'str_feature': np.array([b'this is a \ttext'], dtype=object)},
      # The second row has nothing after its delimiter.
      {'int_feature': np.array([5], dtype=np.int64),
       'str_feature': None}]

  with beam.Pipeline() as p:
    result = (p | beam.Create(input_lines) |
              csv_decoder.DecodeCSV(column_names=column_names,
                                    delimiter='\t'))
    util.assert_that(
        result,
        test_util.make_example_dict_equal_fn(self, expected_result))
def test_csv_decoder_with_unicode(self):
  """A non-ASCII cell is expected back as its UTF-8 encoded bytes."""
  input_lines = [u'1,שקרכלשהו,22.34,text field']
  column_names = ['int_feature', 'unicode_feature',
                  'float_feature', 'str_feature']
  # `np.object` was only an alias of the builtin `object` and no longer exists
  # in recent NumPy releases; the builtin names the same dtype everywhere.
  expected_result = [
      {'int_feature': np.array([1], dtype=np.int64),
       'unicode_feature': np.array([u'שקרכלשהו'.encode('utf-8')],
                                   dtype=object),
       'float_feature': np.array([22.34], dtype=np.float32),
       'str_feature': np.array([b'text field'], dtype=object)}]

  with beam.Pipeline() as p:
    result = (p | beam.Create(input_lines) |
              csv_decoder.DecodeCSV(column_names=column_names))
    util.assert_that(
        result,
        test_util.make_example_dict_equal_fn(self, expected_result))
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.TfxArtifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have
        header line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  input_base_uri = types.get_single_uri(input_dict['input_base'])
  csv_pattern = os.path.join(input_base_uri, split_pattern)
  tf.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_pattern))

  csv_files = tf.gfile.Glob(csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(csv_pattern))

  # The first file's header is the reference; every other file in the split
  # must carry the same column names because all files are decoded together.
  column_names = io_utils.load_csv_column_names(csv_files[0])
  # A distinct loop variable keeps `csv_files` bound to the list of paths.
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(
              csv_pattern))

  return (pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              file_pattern=csv_pattern, skip_header_lines=1)
          | 'ParseCSV' >> csv_decoder.DecodeCSV(column_names)
          | 'ToTFExample' >> beam.Map(_dict_to_example))