Example #1
    def test_parse_csv_lines(self,
                             input_lines,
                             column_names,
                             expected_csv_cells,
                             expected_types,
                             skip_blank_lines=False,
                             delimiter=','):
        def _check_csv_cells(actual):
            self.assertEqual(expected_csv_cells, actual)

        def _check_types(actual):
            self.assertLen(actual, 1)
            self.assertCountEqual([
                csv_decoder.ColumnInfo(n, t)
                for n, t in zip(column_names, expected_types)
            ], actual[0])

        with beam.Pipeline() as p:
            parsed_csv_cells = (
                p | beam.Create(input_lines, reshuffle=False)
                | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=delimiter)))
            inferred_types = parsed_csv_cells | beam.CombineGlobally(
                csv_decoder.ColumnTypeInferrer(
                    column_names, skip_blank_lines=skip_blank_lines))

            beam_test_util.assert_that(parsed_csv_cells,
                                       _check_csv_cells,
                                       label='check_parsed_csv_cells')
            beam_test_util.assert_that(inferred_types,
                                       _check_types,
                                       label='check_types')
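The test above exercises the two core pieces of tfx_bsl's csv_decoder module: ParseCSVLine splits each text line into CSV cells, and ColumnTypeInferrer combines all rows into a single list of ColumnInfo(name, type) entries. Below is a minimal standalone sketch of the same pattern; the import path, sample lines, and column names are assumptions for illustration, and newer tfx_bsl releases make ParseCSVLine yield (cells, raw_record) pairs, which is why the getattr guard (also used in Examples #4 and #7) inserts a beam.Keys() step.

import apache_beam as beam
from tfx_bsl.coders import csv_decoder  # assumed import path

# Hypothetical sample data and column names, for illustration only.
input_lines = ['1,2.5,hello', '2,3.5,world']
column_names = ['int_col', 'float_col', 'str_col']

with beam.Pipeline() as p:
    parsed = (
        p
        | 'CreateLines' >> beam.Create(input_lines, reshuffle=False)
        | 'ParseCSVLine' >> beam.ParDo(
            csv_decoder.ParseCSVLine(delimiter=',')))
    # On releases where ParseCSVLine yields (cells, raw_record) pairs,
    # keep only the parsed cells before inferring column types.
    if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
        parsed |= 'ExtractParsedCSVLines' >> beam.Keys()
    # Produces a single-element PCollection holding one ColumnInfo per column.
    _ = parsed | 'InferColumnTypes' >> beam.CombineGlobally(
        csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=False))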
Example #2
    def expand(self, lines: beam.pvalue.PCollection):
        """Decodes the input CSV records into an in-memory dict representation.

    Args:
      lines: A PCollection of strings representing the lines in the CSV file.

    Returns:
      A PCollection of dicts representing the CSV records.
    """
        csv_lines = (lines | 'ParseCSVLines' >> beam.ParDo(
            csv_decoder.ParseCSVLine(self._delimiter)))

        if self._infer_type_from_schema:
            column_infos = _get_feature_types_from_schema(
                self._schema, self._column_names)
        else:
            # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
            # Do first pass to infer the feature types.
            column_infos = beam.pvalue.AsSingleton(
                csv_lines | 'InferColumnTypes' >> beam.CombineGlobally(
                    csv_decoder.ColumnTypeInferrer(
                        column_names=self._column_names,
                        skip_blank_lines=self._skip_blank_lines)))

        # Do second pass to generate the in-memory dict representation.
        return (
            csv_lines
            | 'BatchCSVLines' >> beam.BatchElements(
                **batch_util.GetBeamBatchKwargs(self._desired_batch_size))
            | 'BatchedCSVRowsToArrow' >> beam.ParDo(
                _BatchedCSVRowsToArrow(
                    skip_blank_lines=self._skip_blank_lines), column_infos))
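The expand method above is a two-pass decode: the first pass infers column types, and beam.pvalue.AsSingleton turns that one-element result into a side input consumed by the second pass. Here is a minimal sketch of the same side-input mechanics, with a hypothetical _NameCells step standing in for the real batch conversion; it assumes ColumnInfo exposes a name attribute and reuses the version guard from the sketch under Example #1.

import apache_beam as beam
from tfx_bsl.coders import csv_decoder  # assumed import path

column_names = ['id', 'label']  # hypothetical
with beam.Pipeline() as p:
    cells = (
        p
        | beam.Create(['1,x', '2,y'], reshuffle=False)
        | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
    if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
        cells |= beam.Keys()
    # First pass: infer one ColumnInfo per column and wrap it as a side input.
    column_infos = beam.pvalue.AsSingleton(
        cells | beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names, skip_blank_lines=True)))
    # Second pass: every element sees the inferred column infos.
    _ = cells | '_NameCells' >> beam.Map(
        lambda row, infos: dict(zip([info.name for info in infos], row)),
        column_infos)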
Example #3
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have header
        line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    absl.logging.info(
        'Processing input csv data {} to TFExample.'.format(csv_pattern))

    csv_files = tf.io.gfile.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(
            file_pattern=csv_pattern, skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(
            csv_decoder.ParseCSVLine(delimiter=',')))
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names,
                                           skip_blank_lines=True)))

    return (parsed_csv_lines
            | 'ToTFExample' >> beam.ParDo(
                _ParsedCsvToTfExample(), column_infos))
Example #4
def _CsvToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV must have header line.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    logging.info('Processing input csv data %s to TFExample.', csv_pattern)

    csv_files = tf.io.gfile.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(
            file_pattern=csv_pattern, skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(
            csv_decoder.ParseCSVLine(delimiter=',')))
    # TODO(b/155997704) clean this up once tfx_bsl makes a release.
    if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
        # parsed_csv_lines is the following tuple (parsed_lines, raw_records)
        # we only want the parsed_lines.
        parsed_csv_lines |= 'ExtractParsedCSVLines' >> beam.Keys()
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names,
                                           skip_blank_lines=True)))

    return (parsed_csv_lines
            | 'ToTFExample' >> beam.ParDo(
                _ParsedCsvToTfExample(), column_infos))
Example #5
def read_files_from_disk(pipeline: beam.Pipeline,
                         base_path: Text) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to read data from a collection of CSV files
    on a local file system.
    Args:
        pipeline: Input beam.Pipeline object coming from a TFX Executor.
        base_path: Base path pointing either to the directory containing the
         CSV files, or to a (single) CSV file.

    Returns:
        A beam.PCollection of data points. Each row in the collection of
         CSV files represents a single data point.

    """
    wildcard_qualifier = "*"
    file_pattern = os.path.join(base_path, wildcard_qualifier)

    if path_utils.is_dir(base_path):
        csv_files = path_utils.list_dir(base_path)
        if not csv_files:
            raise RuntimeError(
                'Split pattern {} does not match any files.'.format(
                    file_pattern))
    else:
        if path_utils.file_exists(base_path):
            csv_files = [base_path]
        else:
            raise RuntimeError(f'{base_path} does not exist.')

    # Filter out files whose extensions are not supported.
    allowed_file_exts = [".csv", ".txt"]  # ".dat"
    csv_files = [
        uri for uri in csv_files
        if os.path.splitext(uri)[1] in allowed_file_exts
    ]

    logger.info(f'Matched {len(csv_files)} files: {csv_files}')

    # Always use header from file
    logger.info(f'Using header from file: {csv_files[0]}.')
    column_names = path_utils.load_csv_header(csv_files[0])
    logger.info(f'Header: {column_names}.')

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(
            # Use the wildcard pattern when base_path is a directory; a bare
            # directory path would not match any files.
            file_pattern=(file_pattern if path_utils.is_dir(base_path)
                          else base_path),
            skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | 'ExtractParsedCSVLines' >> beam.Map(
            lambda x: dict(zip(column_names, x[0]))))

    return parsed_csv_lines
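A hedged usage sketch of the helper above: it wires read_files_from_disk into a pipeline exactly as the signature suggests, with a made-up directory path; each emitted element is a dict mapping the header's column names to one row's cell values.

import apache_beam as beam

# Hypothetical invocation; '/data/csvs' is a made-up path.
with beam.Pipeline() as p:
    rows = read_files_from_disk(p, '/data/csvs')
    # Each element is a dict such as {'col_a': '1', 'col_b': 'x'}.
    _ = rows | 'LogRows' >> beam.Map(print)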
Example #6
    def test_invalid_row(self):
        input_lines = ['1,2.0,hello', '5,12.34']
        column_names = ['int_feature', 'float_feature', 'str_feature']
        with self.assertRaisesRegexp(
                ValueError, '.*Columns do not match specified csv headers.*'):
            with beam.Pipeline() as p:
                result = (p | beam.Create(input_lines, reshuffle=False)
                          | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
                          | beam.CombineGlobally(
                              csv_decoder.ColumnTypeInferrer(
                                  column_names, skip_blank_lines=False)))
                beam_test_util.assert_that(result, lambda _: None)
Example #7
    def convert_csv_to_tf_examples(self, csv_path, tfrecords_output_path):
        """Runs a Beam pipeline to convert the CSV file into a TFRecords file.

    This is needed because the conversion is orders of magnitude more
    time-consuming than the functions we want to benchmark, so instead of
    doing the conversion each time, we do it once to generate a converted
    dataset and use that for the benchmark instead.

    Args:
      csv_path: Path to CSV file containing examples.
      tfrecords_output_path: Path to output TFRecords file containing parsed
        examples.
    """
        # Copied from CSV example gen.
        with open(csv_path, "r") as fp:
            column_names = next(fp).strip().split(",")

        with beam.Pipeline() as p:
            parsed_csv_lines = (p
                                | "ReadFromText" >> beam.io.ReadFromText(
                                    file_pattern=csv_path, skip_header_lines=1)
                                | "ParseCSVLine" >> beam.ParDo(
                                    csv_decoder.ParseCSVLine(delimiter=",")))
            # TODO(b/155997704) clean this up once tfx_bsl makes a release.
            if getattr(csv_decoder, "PARSE_CSV_LINE_YIELDS_RAW_RECORDS",
                       False):
                # parsed_csv_lines is the following tuple (parsed_lines, raw_records)
                # we only want the parsed_lines.
                parsed_csv_lines |= "ExtractParsedCSVLines" >> beam.Keys()

            column_infos = beam.pvalue.AsSingleton(
                parsed_csv_lines
                | "InferColumnTypes" >> beam.CombineGlobally(
                    csv_decoder.ColumnTypeInferrer(column_names,
                                                   skip_blank_lines=True)))
            _ = (
                parsed_csv_lines
                | "ToTFExample" >> beam.ParDo(
                    csv_exgen._ParsedCsvToTfExample(),  # pylint: disable=protected-access
                    column_infos)
                | "Serialize" >> beam.Map(lambda x: x.SerializeToString())
                | "WriteToTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
                    file_path_prefix=tfrecords_output_path,
                    shard_name_template="",
                    compression_type=beam.io.filesystem.CompressionTypes.GZIP))
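Once the benchmark data has been written, it can be read back with standard TensorFlow APIs. A small hedged sketch of that read-back, assuming eager execution; the output path is made up and mirrors the GZIP compression used by the writer above.

import tensorflow as tf

# Hypothetical path matching tfrecords_output_path from the method above.
dataset = tf.data.TFRecordDataset(
    '/tmp/converted_examples.tfrecord', compression_type='GZIP')
for serialized in dataset.take(2):
    # Each record is a serialized tf.train.Example written by the pipeline.
    example = tf.train.Example.FromString(serialized.numpy())
    print(example)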
Example #8
    def expand(
            self, pipeline: beam.Pipeline
    ) -> beam.pvalue.PCollection[tf.train.Example]:
        logging.info('Processing input csv data %s to TFExample.',
                     self._csv_pattern)

        csv_files = fileio.glob(self._csv_pattern)
        if not csv_files:
            raise RuntimeError(
                'Split pattern {} does not match any files.'.format(
                    self._csv_pattern))

        column_names = io_utils.load_csv_column_names(csv_files[0])
        for csv_file in csv_files[1:]:
            if io_utils.load_csv_column_names(csv_file) != column_names:
                raise RuntimeError(
                    'Files in same split {} have different header.'.format(
                        self._csv_pattern))

        # Read each CSV file while maintaining order. This is done in order to group
        # together multi-line string fields.
        parsed_csv_lines = (
            pipeline
            | 'CreateFilenames' >> beam.Create(csv_files)
            | 'ReadFromText' >> beam.ParDo(_ReadCsvRecordsFromTextFile())
            | 'ParseCSVLine' >> beam.ParDo(
                csv_decoder.ParseCSVLine(delimiter=','))
            | 'ExtractParsedCSVLines' >> beam.Keys())
        column_infos = beam.pvalue.AsSingleton(
            parsed_csv_lines
            | 'InferColumnTypes' >> beam.CombineGlobally(
                csv_decoder.ColumnTypeInferrer(column_names,
                                               skip_blank_lines=True)))

        return (
            parsed_csv_lines
            | 'ToTFExample' >> beam.ParDo(
                _ParsedCsvToTfExample(), column_infos))
Example #9
    def test_parse_csv_lines(self,
                             input_lines,
                             column_names,
                             expected_csv_cells,
                             expected_types,
                             expected_record_batch,
                             skip_blank_lines=False,
                             schema=None,
                             delimiter=',',
                             multivalent_columns=None,
                             secondary_delimiter=None,
                             raw_record_column_name=None):
        def _check_csv_cells(actual):
            for i in range(len(actual)):
                self.assertEqual(expected_csv_cells[i], actual[i][0])
                self.assertEqual(input_lines[i], actual[i][1])

        def _check_types(actual):
            self.assertLen(actual, 1)
            self.assertCountEqual([
                csv_decoder.ColumnInfo(n, t)
                for n, t in zip(column_names, expected_types)
            ], actual[0])

        def _check_record_batches(actual):
            """Compares a list of pa.RecordBatch."""
            if actual:
                self.assertTrue(actual[0].equals(expected_record_batch))
            else:
                self.assertEqual(expected_record_batch, actual)

        def _check_arrow_schema(actual):
            for record_batch in actual:
                expected_arrow_schema = csv_decoder.GetArrowSchema(
                    column_names, schema, raw_record_column_name)
                self.assertEqual(record_batch.schema, expected_arrow_schema)

        with beam.Pipeline() as p:
            parsed_csv_cells_and_raw_records = (
                p | beam.Create(input_lines, reshuffle=False)
                | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=delimiter)))
            inferred_types = (
                parsed_csv_cells_and_raw_records
                | beam.Keys()
                | beam.CombineGlobally(
                    csv_decoder.ColumnTypeInferrer(
                        column_names,
                        skip_blank_lines=skip_blank_lines,
                        multivalent_columns=multivalent_columns,
                        secondary_delimiter=secondary_delimiter)))

            beam_test_util.assert_that(parsed_csv_cells_and_raw_records,
                                       _check_csv_cells,
                                       label='check_parsed_csv_cells')
            beam_test_util.assert_that(inferred_types,
                                       _check_types,
                                       label='check_types')

            record_batches = (
                parsed_csv_cells_and_raw_records
                | beam.BatchElements(min_batch_size=1000) | beam.ParDo(
                    csv_decoder.BatchedCSVRowsToRecordBatch(
                        skip_blank_lines=skip_blank_lines,
                        multivalent_columns=multivalent_columns,
                        secondary_delimiter=secondary_delimiter,
                        raw_record_column_name=raw_record_column_name),
                    beam.pvalue.AsSingleton(inferred_types)))
            beam_test_util.assert_that(record_batches,
                                       _check_record_batches,
                                       label='check_record_batches')
            if schema:
                beam_test_util.assert_that(record_batches,
                                           _check_arrow_schema,
                                           label='check_arrow_schema')

        # Testing CSVToRecordBatch
        with beam.Pipeline() as p:
            record_batches = (
                p
                | 'CreatingPColl' >> beam.Create(input_lines, reshuffle=False)
                | 'CSVToRecordBatch' >> csv_decoder.CSVToRecordBatch(
                    column_names=column_names,
                    delimiter=delimiter,
                    skip_blank_lines=skip_blank_lines,
                    desired_batch_size=1000,
                    schema=schema,
                    multivalent_columns=multivalent_columns,
                    secondary_delimiter=secondary_delimiter,
                    raw_record_column_name=raw_record_column_name))
            beam_test_util.assert_that(record_batches,
                                       _check_record_batches,
                                       label='check_record_batches')
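Outside the test harness, the CSVToRecordBatch transform exercised in the second pipeline above goes from raw CSV lines to Arrow record batches in a single step. A minimal sketch follows, assuming the tfx_bsl import path, made-up lines and column names, and that the optional arguments shown in the test (schema, multivalent_columns, secondary_delimiter, raw_record_column_name) may simply be omitted.

import apache_beam as beam
from tfx_bsl.coders import csv_decoder  # assumed import path

# Hypothetical inputs for illustration.
input_lines = ['1,2.5,hello', '2,3.5,world']
column_names = ['int_col', 'float_col', 'str_col']

with beam.Pipeline() as p:
    record_batches = (
        p
        | 'CreateLines' >> beam.Create(input_lines, reshuffle=False)
        | 'CSVToRecordBatch' >> csv_decoder.CSVToRecordBatch(
            column_names=column_names,
            delimiter=',',
            skip_blank_lines=True,
            desired_batch_size=1000))
    # Each element is a pyarrow.RecordBatch with one column per CSV column.
    _ = record_batches | 'CountBatches' >> beam.combiners.Count.Globally()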