Code example #1
  def expand(self, uri_to_content):

    # Compute the total number of documents, and prepare a singleton
    # PCollection to use as side input.
    total_documents = (
        uri_to_content
        | 'GetUris 1' >> beam.Keys()
        | 'GetUniqueUris' >> beam.RemoveDuplicates()
        | 'CountUris' >> beam.combiners.Count.Globally())

    # Create a collection of pairs mapping a URI to each of the words
    # in the document associated with that URI.

    def split_into_words(uri_line):
      uri, line = uri_line
      return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

    uri_to_words = (
        uri_to_content
        | 'SplitWords' >> beam.FlatMap(split_into_words))

    # Compute a mapping from each word to the total number of documents
    # in which it appears.
    word_to_doc_count = (
        uri_to_words
        | 'GetUniqueWordsPerDoc' >> beam.RemoveDuplicates()
        | 'GetWords' >> beam.Values()
        | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

    # Compute a mapping from each URI to the total number of words in the
    # document associated with that URI.
    uri_to_word_total = (
        uri_to_words
        | 'GetUris 2' >> beam.Keys()
        | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

    # Count, for each (URI, word) pair, the number of occurrences of that word
    # in the document associated with the URI.
    uri_and_word_to_count = (
        uri_to_words
        | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

    # Adjust the above collection to a mapping from (URI, word) pairs to counts
    # into an isomorphic mapping from URI to (word, count) pairs, to prepare
    # for a join by the URI key.
    uri_to_word_and_count = (
        uri_and_word_to_count
        | 'ShiftKeys' >> beam.Map(
            lambda pair: (pair[0][0], (pair[0][1], pair[1]))))
Code example #2
    def expand(
        self, sliced_record_batchs: beam.pvalue.PCollection
    ) -> beam.pvalue.PCollection:
        # Compute P(Y=y)
        # _SlicedYKey(slice, y), _YRate(y_count, example_count)
        y_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
            self._y_path, self._y_boundaries, self._weight_column_name)
        y_keys = y_rates | 'ExtractYKeys' >> beam.Keys()

        # Compute P(Y=y | X=x)
        # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
        conditional_y_rates = (
            (sliced_record_batchs, y_keys)
            | 'GetConditionalYRates' >> _GetConditionalYRates(
                self._y_path, self._y_boundaries, self._x_paths,
                self._min_x_count, self._weight_column_name))

        return ({
            'conditional_y_rate': conditional_y_rates,
            'y_rate': y_rates
        }
                | 'CoGroupByForLift' >> beam.CoGroupByKey()
                | 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
                | 'FilterLifts' >> _FilterLifts(self._top_k_per_y,
                                                self._bottom_k_per_y)
                | 'GroupLiftsForOutput' >> beam.GroupByKey()
                | 'MakeProtos' >> beam.Map(
                    _make_dataset_feature_stats_proto, self._y_path,
                    self._y_boundaries, self._weight_column_name is not None,
                    self._output_custom_stats))
Code example #3
    def test_records_traverse_transform_with_mocks(self):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job

        bq_client.jobs.Insert.return_value = result_job

        transform = bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            coder=CustomRowCoder())

        # Need to test this with the DirectRunner to avoid serializing mocks
        with TestPipeline('DirectRunner') as p:
            outputs = p | beam.Create(_ELEMENTS) | transform

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_job = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

            jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> beam.combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())

            # All files exist
            _ = (files | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # One file per destination
            assert_that(files | beam.combiners.Count.Globally(),
                        equal_to([1]),
                        label='CountFiles')

            assert_that(destinations,
                        equal_to([destination]),
                        label='CheckDestinations')

            assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
Code example #4
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the user_score pipeline."""
    parser = argparse.ArgumentParser()

    # The --input flag is required (it has no default); it should point to the
    # log data to process.
    parser.add_argument('--input',
                        type=str,
                        required=True,
                        help='Path to the log bucket')

    parser.add_argument('--output',
                        type=str,
                        required=True,
                        help='Path to the output file(s).')

    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=options) as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'ReadInputText' >> beam.io.ReadFromText(args.input)
            | 'ParseLogEntry' >> beam.ParDo(ParseLogEntry())
            | 'Group' >> beam.GroupByKey()
            | 'getKeys' >> beam.Keys()
            | 'Write' >> beam.io.WriteToText(args.output))
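
ParseLogEntry is referenced but not included in the snippet above. Below is a minimal hypothetical sketch of a DoFn that would fit this pipeline, emitting key/value pairs so the downstream GroupByKey and Keys steps make sense; the log field layout is an assumption, not taken from the original project.

import apache_beam as beam

class ParseLogEntry(beam.DoFn):
    """Hypothetical parser: emits (user, value) pairs from comma-separated log lines."""

    def process(self, line):
        parts = line.split(',')
        if len(parts) < 2:
            return  # skip malformed lines
        user, payload = parts[0], parts[1]
        try:
            yield user, int(payload)
        except ValueError:
            pass  # skip lines whose payload is not numeric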
Code example #5
def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    num_bootstrap_samples: Optional[int] = 1,
    random_seed_for_testing: Optional[int] = None
) -> Tuple[beam.pvalue.DoOutputsTuple, beam.pvalue.PCollection]:
    """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including any
      additional metrics (see EvalSharedModel for more information on how to
      configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Set to value > 1 to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    Tuple of Tuple[PCollection of (slice key, metrics),
    PCollection of (slice key, plot metrics)] and
    PCollection of (slice_key and its example count).
  """

    _ = (extracts.pipeline
         | counter_util.IncrementMetricsComputationCounters(
             eval_shared_model.add_metrics_callbacks))

    # pylint: disable=no-value-for-parameter
    slices = (
        extracts

        # Input: one example at a time, with slice keys in extracts.
        # Output: one fpl example per slice key (notice that the example turns
        #         into n logical examples, references to which are replicated once
        #         per applicable slice key).
        | 'FanoutSlices' >> slicer.FanoutSlices())

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    aggregated_metrics = (
        slices
        # Metrics are computed per slice key.
        # Output: Multi-outputs, a dict of slice key to computed metrics, and
        # plots if applicable.
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model,
            desired_batch_size=desired_batch_size,
            num_bootstrap_samples=num_bootstrap_samples,
            random_seed_for_testing=random_seed_for_testing))
    return (aggregated_metrics, slices_count)
Code example #6
File: executor.py  Project: meixinzhang/tfx
def _CsvToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV must have header line.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    logging.info('Processing input csv data %s to TFExample.', csv_pattern)

    csv_files = tf.io.gfile.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                 skip_header_lines=1)
        |
        'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
    # TODO(b/155997704) clean this up once tfx_bsl makes a release.
    if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
        # parsed_csv_lines is the following tuple (parsed_lines, raw_records)
        # we only want the parsed_lines.
        parsed_csv_lines |= 'ExtractParsedCSVLines' >> beam.Keys()
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names,
                                           skip_blank_lines=True)))

    return (parsed_csv_lines
            |
            'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
Code example #7
    def _add_metadata(
            self, rows: beam.pvalue.PCollection[Row]
    ) -> beam.pvalue.PCollection[Row]:
        """Add ip metadata to a collection of roundtrip rows.

    Args:
      rows: beam.PCollection[Row]

    Returns:
      PCollection[Row]
      The same rows as above with additional metadata columns added.
    """

        # PCollection[Tuple[DateIpKey,Row]]
        rows_keyed_by_ip_and_date = (
            rows
            | 'key by ips and dates' >> beam.Map(lambda row: (make_date_ip_key(
                row), row)).with_output_types(Tuple[DateIpKey, Row]))

        # PCollection[DateIpKey]
        # pylint: disable=no-value-for-parameter
        ips_and_dates = (rows_keyed_by_ip_and_date
                         | 'get ip and date keys per row' >>
                         beam.Keys().with_output_types(DateIpKey))

        # PCollection[DateIpKey]
        deduped_ips_and_dates = (
            # pylint: disable=no-value-for-parameter
            ips_and_dates
            | 'dedup' >> beam.Distinct().with_output_types(DateIpKey))

        # PCollection[Tuple[date,List[ip]]]
        grouped_ips_by_dates = (
            deduped_ips_and_dates | 'group by date' >>
            beam.GroupByKey().with_output_types(Tuple[str, Iterable[str]]))

        # PCollection[Tuple[DateIpKey,Row]]
        ips_with_metadata = (grouped_ips_by_dates
                             | 'get ip metadata' >> beam.FlatMapTuple(
                                 self._add_ip_metadata).with_output_types(
                                     Tuple[DateIpKey, Row]))

        # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
        grouped_metadata_and_rows = (
            ({
                IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
                ROWS_PCOLLECION_NAME: rows_keyed_by_ip_and_date
            }) | 'group by keys' >> beam.CoGroupByKey())

        # PCollection[Row]
        rows_with_metadata = (
            grouped_metadata_and_rows | 'merge metadata with rows' >>
            beam.FlatMapTuple(merge_metadata_with_rows).with_output_types(Row))

        return rows_with_metadata
Code example #8
def _TrackDistinctSliceKeys(  # pylint: disable=invalid-name
    slice_keys_and_values: beam.PCollection[types.SlicedRecordBatch]
) -> beam.pvalue.PCollection[int]:
  """Gathers slice key telemetry post slicing."""

  return (slice_keys_and_values
          | 'ExtractSliceKeys' >> beam.Keys()
          | 'RemoveDuplicates' >> beam.Distinct()
          | 'Size' >> beam.combiners.Count.Globally()
          | 'IncrementCounter' >> beam.Map(
              lambda x: _increment_counter('num_distinct_slice_keys', x)))
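
For reference, a minimal self-contained sketch of the same Keys -> Distinct -> Count.Globally chain on toy data (not from the original project), showing what the final counter receives:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('slice_a', 1), ('slice_a', 2), ('slice_b', 3)])
         | 'ExtractSliceKeys' >> beam.Keys()
         | 'RemoveDuplicates' >> beam.Distinct()
         | 'Size' >> beam.combiners.Count.Globally()
         | beam.Map(print))  # prints 2: there are two distinct slice keys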
Code example #9
 def test_invalid_row(self):
     input_lines = ['1,2.0,hello', '5,12.34']
     column_names = ['int_feature', 'float_feature', 'str_feature']
     with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
             ValueError, '.*Columns do not match specified csv headers.*'):
         with beam.Pipeline() as p:
             result = (p | beam.Create(input_lines, reshuffle=False)
                       | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
                       | beam.Keys()
                       | beam.CombineGlobally(
                           csv_decoder.ColumnTypeInferrer(
                               column_names, skip_blank_lines=False)))
             beam_test_util.assert_that(result, lambda _: None)
Code example #10
def _TrackDistinctSliceKeys(  # pylint: disable=invalid-name
    slice_keys_and_values: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    """Gathers slice key telemetry post slicing."""
    def increment_counter(element):  # pylint: disable=invalid-name
        num_distinct_slice_keys = beam.metrics.Metrics.counter(
            constants.METRICS_NAMESPACE, 'num_distinct_slice_keys')
        num_distinct_slice_keys.inc(element)
        return element

    return (slice_keys_and_values
            | 'ExtractSliceKeys' >> beam.Keys()
            | 'RemoveDuplicates' >> beam.Distinct()
            | 'Size' >> beam.combiners.Count.Globally()
            | 'IncrementCounter' >> beam.Map(increment_counter))
Code example #11
def IncrementSliceSpecCounters(pipeline: beam.Pipeline):
    """To track count of all slicing spec computed using TFMA."""
    def _MakeAndIncrementCounters(slice_list):
        for slice_key, slice_value in slice_list:
            # LINT.IfChange
            slice_name = 'slice_computed_%s_%s' % (slice_key, slice_value)
            # LINT.ThenChange(../../../../learning/fairness/infra/plx/scripts/tfma_metrics_computed_tracker_macros.sql)
            slice_counter = beam.metrics.Metrics.counter(
                constants.METRICS_NAMESPACE, slice_name)
            slice_counter.inc(1)

    return (pipeline
            | 'GetSliceCountKeys' >> beam.Keys()
            | 'Count' >> beam.Map(_MakeAndIncrementCounters))
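
A sketch (not part of the original snippet) of how counters incremented this way can be read back once the enclosing pipeline has run, using Beam's metrics query API; constants.METRICS_NAMESPACE is the same constant used above:

from apache_beam.metrics.metric import MetricsFilter

result = pipeline.run()
result.wait_until_finish()
query = result.metrics().query(
    MetricsFilter().with_namespace(constants.METRICS_NAMESPACE))
for counter in query['counters']:
    print(counter.key.metric.name, counter.committed)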
Code example #12
def prevent_fusion(collection):
    '''
    Wraps the provided collection with a 3-step process to prevent fusion:
    https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#fusion-optimization
    1. Wraps with a dummy tuple (we only care about the key and are throwing away the value)
    2. Groups by key
    3. Throw away value

    The key point is that the intermediate GroupByKey forces a shuffle, which
    stops Dataflow from fusing the surrounding steps into one serial stage.

    :param collection: PCollection
    :return: PCollection
    '''
    return (collection
            | 'Prevent Fusion -- Dummy' >> beam.ParDo(PairWithOne())
            | 'Prevent Fusion -- Group' >> beam.GroupByKey()
            | 'Prevent Fusion -- Split' >> beam.Keys())
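
PairWithOne is not defined in the snippet. A minimal hypothetical version follows, under the assumption that it simply pairs each element with a throwaway value so the element itself becomes the key:

import apache_beam as beam

class PairWithOne(beam.DoFn):
    """Hypothetical DoFn: keep the element as the key, attach a dummy value."""

    def process(self, element):
        yield element, 1

# Usage sketch: break fusion after a high-fanout step.
# expanded = some_pcollection | 'FanOut' >> beam.FlatMap(generate_many)
# expanded = prevent_fusion(expanded)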
Code example #13
File: dataset.py  Project: tinally/tfx
    def convert_csv_to_tf_examples(self, csv_path, tfrecords_output_path):
        """Runs a Beam pipeline to convert the CSV file into a TFRecords file.

    This is needed because the conversion is orders of magnitude more
    time-consuming than the functions we want to benchmark, so instead of
    doing the conversion each time, we do it once to generate a converted
    dataset and use that for the benchmark instead.

    Args:
      csv_path: Path to CSV file containing examples.
      tfrecords_output_path: Path to output TFRecords file containing parsed
        examples.
    """
        # Copied from CSV example gen.
        with open(csv_path, "r") as fp:
            column_names = next(fp).strip().split(",")

        with beam.Pipeline() as p:
            parsed_csv_lines = (p
                                | "ReadFromText" >> beam.io.ReadFromText(
                                    file_pattern=csv_path, skip_header_lines=1)
                                | "ParseCSVLine" >> beam.ParDo(
                                    csv_decoder.ParseCSVLine(delimiter=",")))
            # TODO(b/155997704) clean this up once tfx_bsl makes a release.
            if getattr(csv_decoder, "PARSE_CSV_LINE_YIELDS_RAW_RECORDS",
                       False):
                # parsed_csv_lines is the following tuple (parsed_lines, raw_records)
                # we only want the parsed_lines.
                parsed_csv_lines |= "ExtractParsedCSVLines" >> beam.Keys()

            column_infos = beam.pvalue.AsSingleton(
                parsed_csv_lines
                | "InferColumnTypes" >> beam.CombineGlobally(
                    csv_decoder.ColumnTypeInferrer(column_names,
                                                   skip_blank_lines=True)))
            _ = (
                parsed_csv_lines
                | "ToTFExample" >> beam.ParDo(
                    csv_exgen._ParsedCsvToTfExample(),  # pylint: disable=protected-access
                    column_infos)
                | "Serialize" >> beam.Map(lambda x: x.SerializeToString())
                | "WriteToTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
                    file_path_prefix=tfrecords_output_path,
                    shard_name_template="",
                    compression_type=beam.io.filesystem.CompressionTypes.GZIP))
Code example #14
File: keys.py  Project: spellbring/beam
def keys(test=None):
    # [START keys]
    import apache_beam as beam

    with beam.Pipeline() as pipeline:
        icons = (pipeline
                 | 'Garden plants' >> beam.Create([
                     ('🍓', 'Strawberry'),
                     ('🥕', 'Carrot'),
                     ('🍆', 'Eggplant'),
                     ('🍅', 'Tomato'),
                     ('🥔', 'Potato'),
                 ])
                 | 'Keys' >> beam.Keys()
                 | beam.Map(print))
        # [END keys]
        if test:
            test(icons)
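
For comparison, a minimal sketch on the same garden-plants data (not part of the original snippet) using beam.Values() to take the second element of each pair instead of the first:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    names = (pipeline
             | 'Garden plants' >> beam.Create([
                 ('🍓', 'Strawberry'),
                 ('🥕', 'Carrot'),
                 ('🍆', 'Eggplant'),
             ])
             | 'Values' >> beam.Values()  # emits 'Strawberry', 'Carrot', 'Eggplant'
             | beam.Map(print))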
Code example #15
def filter_by_std_dev(pipeline, file_path):
    with pipeline as p:

        def std_dev_map(entry, means_and_counts):
            # Creating variables probably slows the pipeline a lot
            # but keep it for readability for now.
            ip, entries = entry
            mean = means_and_counts[ip][0]
            count = means_and_counts[ip][1]
            # Use numpy for array-wise operations
            # TODO: Loading python for variance is probably not the best way to go
            # TODO: Feels like this can be a separate CombinePerKey
            variance = np.sum(np.square(np.array(entries) - mean)) / count
            return ip, np.sqrt(variance if variance > 0 else 0)

        def filter_by_stddev(entry, means_and_counts, std_devs):
            # Refraining from creating new variables during pipeline
            return entry[1] > \
                   (means_and_counts[entry[0]][0] + std_devs[entry[0]])

        fields = (p
                  | 'ReadInputText' >> beam.io.ReadFromText(file_path)
                  | 'ParseLogs' >> NetworkUsage())

        # Combine mean and count to a single pipeline
        mean_and_count_per_key = beam.pvalue.AsDict(
            fields
            | 'GetMeanAndCountPerKey' >> beam.CombinePerKey(MeanAndCount()))

        # TODO: This part needs improvements
        std_dev_per_key = beam.pvalue.AsDict(
            fields
            | 'GroupByKey' >> beam.GroupByKey()
            | 'GetDifferencesPerKey' >> beam.Map(std_dev_map,
                                                 mean_and_count_per_key))

        return (fields
                | 'FilterZeroBytes' >> beam.Filter(lambda entry: entry[1] > 0)
                | 'FilterByStdDev' >> beam.Filter(
                    filter_by_stddev, mean_and_count_per_key, std_dev_per_key)
                | 'GetIPs' >> beam.Keys()
                | 'GetDistinctIPs' >> beam.Distinct())
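
MeanAndCount and NetworkUsage are not shown in the snippet. A minimal hypothetical sketch of what the MeanAndCount combiner could look like, assuming each value is a numeric byte count keyed by IP:

import apache_beam as beam

class MeanAndCount(beam.CombineFn):
    """Hypothetical CombineFn producing a (mean, count) tuple per key."""

    def create_accumulator(self):
        return 0.0, 0  # running sum, element count

    def add_input(self, accumulator, value):
        total, count = accumulator
        return total + value, count + 1

    def merge_accumulators(self, accumulators):
        totals, counts = zip(*accumulators)
        return sum(totals), sum(counts)

    def extract_output(self, accumulator):
        total, count = accumulator
        return (total / count if count else 0.0, count)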
Code example #16
 def expand(self, pcoll):
   """Computes number of unique values for string features."""
   # Count the number of appearance of each feature_value. Output is a
   # pcollection of DatasetFeatureStatistics protos
   return (
       pcoll
       | 'Uniques_FilterIrrelevantFeatures' >>
       (beam.FlatMap(self._filter_irrelevant_features).with_output_types(
           beam.typehints.KV[types.BeamFeatureName, np.ndarray]))
       | 'Uniques_FlattenToFeatureNameValueTuples' >>
       beam.FlatMap(lambda name_and_value_list:  # pylint: disable=g-long-lambda
                    [(name_and_value_list[0], value)
                     for value in name_and_value_list[1]])
       | 'Uniques_RemoveDuplicateFeatureNameValueTuples' >>
       beam.RemoveDuplicates()
       # Drop the values to only have the feature_name with each repeated the
       # number of unique values times.
       | 'Uniques_DropValues' >> beam.Keys()
       | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement()
       | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
           _make_dataset_feature_stats_proto_with_single_feature,
           categorical_features=self._categorical_features))
Code example #17
File: executor.py  Project: jay90099/tfx
    def expand(
            self, pipeline: beam.Pipeline
    ) -> beam.pvalue.PCollection[tf.train.Example]:
        logging.info('Processing input csv data %s to TFExample.',
                     self._csv_pattern)

        csv_files = fileio.glob(self._csv_pattern)
        if not csv_files:
            raise RuntimeError(
                'Split pattern {} does not match any files.'.format(
                    self._csv_pattern))

        column_names = io_utils.load_csv_column_names(csv_files[0])
        for csv_file in csv_files[1:]:
            if io_utils.load_csv_column_names(csv_file) != column_names:
                raise RuntimeError(
                    'Files in same split {} have different header.'.format(
                        self._csv_pattern))

        # Read each CSV file while maintaining order. This is done in order to group
        # together multi-line string fields.
        parsed_csv_lines = (
            pipeline
            | 'CreateFilenames' >> beam.Create(csv_files)
            | 'ReadFromText' >> beam.ParDo(_ReadCsvRecordsFromTextFile())
            | 'ParseCSVLine' >> beam.ParDo(
                csv_decoder.ParseCSVLine(delimiter=','))
            | 'ExtractParsedCSVLines' >> beam.Keys())
        column_infos = beam.pvalue.AsSingleton(
            parsed_csv_lines
            | 'InferColumnTypes' >> beam.CombineGlobally(
                csv_decoder.ColumnTypeInferrer(column_names,
                                               skip_blank_lines=True)))

        return (
            parsed_csv_lines
            |
            'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
Code example #18
 def keys(self, col, stage_name: str):
     return col | stage_name >> beam.Keys()
Code example #19
File: csv_decoder.py  Project: rose-rong-liu/tfx-bsl
def CSVToRecordBatch(lines: beam.pvalue.PCollection,
                     column_names: List[Text],
                     desired_batch_size: Optional[int],
                     delimiter: Text = ",",
                     skip_blank_lines: bool = True,
                     schema: Optional[schema_pb2.Schema] = None,
                     multivalent_columns: Optional[List[Union[Text,
                                                              bytes]]] = None,
                     secondary_delimiter: Optional[Union[Text, bytes]] = None,
                     raw_record_column_name: Optional[Text] = None,
                     produce_large_types: bool = False):
  """Decodes CSV records into Arrow RecordBatches.

  Args:
    lines: The pcollection of raw records (csv lines).
    column_names: List of feature names. Order must match the order in the CSV
      file.
    desired_batch_size: Batch size. The output Arrow RecordBatches will have as
      many rows as the `desired_batch_size`. If None, the batch size is auto
      tuned by beam.
    delimiter: A one-character string used to separate fields.
    skip_blank_lines: A boolean to indicate whether to skip over blank lines
      rather than interpreting them as missing values.
    schema: An optional schema of the input data. If this is provided, it must
      contain all columns.
    multivalent_columns: Columns that can contain multiple values. If
      secondary_delimiter is provided, this must also be provided.
    secondary_delimiter: Delimiter used for parsing multivalent columns. If
      multivalent_columns is provided, this must also be provided.
    raw_record_column_name: Optional name for a column containing the raw csv
      lines. If this is None, then this column will not be produced. This will
      always be the last column in the record batch.
    produce_large_types: If True, will output record batches with columns that
      are large_list types.

  Returns:
    RecordBatches of the CSV lines.

  Raises:
    ValueError:
      * If the columns do not match the specified csv headers.
      * If the schema has invalid feature types.
      * If the schema does not contain all columns.
      * If raw_record_column_name exists in column_names
  """
  if (raw_record_column_name is not None and
      raw_record_column_name in column_names):
    raise ValueError(
        "raw_record_column_name: {} is already an existing column name. "
        "Please choose a different name.".format(raw_record_column_name))

  csv_lines_and_raw_records = (
      lines | "ParseCSVLines" >> beam.ParDo(ParseCSVLine(delimiter)))

  if schema is not None:
    column_infos = _GetColumnInfosFromSchema(schema, column_names)
  else:
    # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
    # Do first pass to infer the feature types.
    column_infos = beam.pvalue.AsSingleton(
        csv_lines_and_raw_records
        | "ExtractParsedCSVLines" >> beam.Keys()
        | "InferColumnTypes" >> beam.CombineGlobally(
            ColumnTypeInferrer(
                column_names=column_names,
                skip_blank_lines=skip_blank_lines,
                multivalent_columns=multivalent_columns,
                secondary_delimiter=secondary_delimiter)))

  # Do second pass to generate the RecordBatches.
  return (csv_lines_and_raw_records
          | "BatchCSVLines" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(desired_batch_size))
          | "BatchedCSVRowsToArrow" >> beam.ParDo(
              BatchedCSVRowsToRecordBatch(
                  skip_blank_lines=skip_blank_lines,
                  multivalent_columns=multivalent_columns,
                  secondary_delimiter=secondary_delimiter,
                  raw_record_column_name=raw_record_column_name,
                  produce_large_types=produce_large_types), column_infos))
Code example #20
  def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:

    def _sum_pairwise(
        iter_of_pairs: Iterator[Tuple[Union[int, float], Union[int, float]]]
    ) -> Tuple[Union[int, float], Union[int, float]]:
      """Computes sum of counts and weights."""
      # We take advantage of the fact that constructing a np array from a list
      # is much faster as the length is known beforehand.
      if isinstance(iter_of_pairs, list):
        arr = np.array(
            iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
      else:
        arr = np.fromiter(
            iter_of_pairs, dtype=[('c', np.int64), ('w', np.float64)])
      return arr['c'].sum(), arr['w'].sum()

    has_any_weight = bool(self._example_weight_map.all_weight_features())
    if has_any_weight:
      sum_fn = _sum_pairwise
    else:
      # For non-weighted case, use sum combine fn over integers to allow Beam
      # to use Cython combiner.
      sum_fn = sum
    top_k_tuples_combined = (
        pcoll
        | 'ToTopKTuples' >> beam.FlatMap(
            _to_topk_tuples,
            bytes_features=self._bytes_features,
            categorical_features=self._categorical_features,
            example_weight_map=self._example_weight_map)
        | 'CombineCountsAndWeights' >> beam.CombinePerKey(sum_fn)
        | 'Rearrange' >> beam.MapTuple(lambda k, v: ((k[0], k[1]), (v, k[2]))))
    # (slice_key, feature_path_steps), (count_and_maybe_weight, value)

    top_k = top_k_tuples_combined
    if has_any_weight:
      top_k |= 'Unweighted_DropWeightsAndRearrange' >> beam.MapTuple(
          lambda k, v: (k, (v[0][0], v[1])))
      # (slice_key, feature_path_steps), (count, value)

    top_k = (
        top_k
        | 'Unweighted_TopK' >> beam.combiners.Top().PerKey(
            max(self._num_top_values, self._num_rank_histogram_buckets))
        | 'Unweighted_ToFeatureValueCount' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (k, [top_k_uniques_stats_util.FeatureValueCount(
                t[1], t[0]) for t in v])
            # pylint: enable=g-long-lambda
            )
        | 'Unweighted_ToProto' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (
                k[0],
                top_k_uniques_stats_util.
                make_dataset_feature_stats_proto_topk_single(
                    feature_path_tuple=k[1],
                    value_count_list=v,
                    categorical_features=self._categorical_features,
                    is_weighted_stats=False,
                    num_top_values=self._num_top_values,
                    frequency_threshold=self._frequency_threshold,
                    num_rank_histogram_buckets=self._num_rank_histogram_buckets)
                )
            # pylint: enable=g-long-lambda
            ))
        # (slice_key, DatasetFeatureStatistics)

    uniques = (
        top_k_tuples_combined
        | 'Uniques_Keys' >> beam.Keys()
        | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement()
        | 'Uniques_ConvertToSingleFeatureStats' >> beam.MapTuple(
            # pylint: disable=g-long-lambda
            lambda k, v: (
                k[0],
                top_k_uniques_stats_util.
                make_dataset_feature_stats_proto_unique_single(
                    feature_path_tuple=k[1],
                    num_uniques=v,
                    categorical_features=self._categorical_features))
            # pylint: enable=g-long-lambda
            ))
        # (slice_key, DatasetFeatureStatistics)

    result_protos = [top_k, uniques]

    if has_any_weight:
      weighted_top_k = (
          top_k_tuples_combined
          | 'Weighted_DropCountsAndRearrange'
          >> beam.MapTuple(lambda k, v: (k, (v[0][1], v[1])))
          # (slice_key, feature), (weight, value)
          | 'Weighted_TopK' >> beam.combiners.Top().PerKey(
              max(self._num_top_values, self._num_rank_histogram_buckets))
          | 'Weighted_ToFeatureValueCount' >> beam.MapTuple(
              # pylint: disable=g-long-lambda
              lambda k, v: (k, [top_k_uniques_stats_util.FeatureValueCount(
                  t[1], t[0]) for t in v])
              # pylint: enable=g-long-lambda
              )
          | 'Weighted_ToProto' >> beam.MapTuple(
              # pylint: disable=g-long-lambda
              lambda k, v:
              (k[0],
               top_k_uniques_stats_util.
               make_dataset_feature_stats_proto_topk_single(
                   feature_path_tuple=k[1],
                   value_count_list=v,
                   categorical_features=self._categorical_features,
                   is_weighted_stats=True,
                   num_top_values=self._num_top_values,
                   frequency_threshold=self._weighted_frequency_threshold,
                   num_rank_histogram_buckets=self._num_rank_histogram_buckets
               ))
              # pylint: enable=g-long-lambda
              ))
      # (slice_key, DatasetFeatureStatistics)

      result_protos.append(weighted_top_k)

    return (result_protos
            | 'FlattenTopKUniquesFeatureStatsProtos' >> beam.Flatten())
Code example #21
File: bigquery_file_loads_test.py  Project: ziel/beam
    def test_multiple_partition_files(self):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = mock.Mock()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job

        bq_client.jobs.Insert.return_value = result_job
        bq_client.tables.Delete.return_value = None

        with TestPipeline('DirectRunner') as p:
            outputs = (p
                       | beam.Create(_ELEMENTS)
                       | bqfl.BigQueryBatchFileLoads(
                           destination,
                           custom_gcs_temp_location=self._new_tempdir(),
                           test_client=bq_client,
                           validate=False,
                           coder=CustomRowCoder(),
                           max_file_size=45,
                           max_partition_size=80,
                           max_files_per_partition=2))

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_load_jobs = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
            dest_copy_jobs = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]

            load_jobs = dest_load_jobs | "GetLoadJobs" >> beam.Map(
                lambda x: x[1])
            copy_jobs = dest_copy_jobs | "GetCopyJobs" >> beam.Map(
                lambda x: x[1])

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())

            # All files exist
            _ = (files | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # One file per destination
            assert_that(files | "CountFiles" >> combiners.Count.Globally(),
                        equal_to([6]),
                        label='CheckFileCount')

            assert_that(destinations,
                        equal_to([destination]),
                        label='CheckDestinations')

            assert_that(load_jobs
                        | "CountLoadJobs" >> combiners.Count.Globally(),
                        equal_to([6]),
                        label='CheckLoadJobCount')
            assert_that(copy_jobs
                        | "CountCopyJobs" >> combiners.Count.Globally(),
                        equal_to([6]),
                        label='CheckCopyJobCount')
Code example #22
File: tfidf.py  Project: AfterShip/aftership-beam
    def expand(self, uri_to_content):

        # Compute the total number of documents, and prepare a singleton
        # PCollection to use as side input.
        total_documents = (uri_to_content
                           | 'GetUris 1' >> beam.Keys()
                           | 'GetUniqueUris' >> beam.Distinct()
                           | 'CountUris' >> beam.combiners.Count.Globally())

        # Create a collection of pairs mapping a URI to each of the words
        # in the document associated with that URI.

        def split_into_words(uri_line):
            (uri, line) = uri_line
            return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

        uri_to_words = (uri_to_content
                        | 'SplitWords' >> beam.FlatMap(split_into_words))

        # Compute a mapping from each word to the total number of documents
        # in which it appears.
        word_to_doc_count = (
            uri_to_words
            | 'GetUniqueWordsPerDoc' >> beam.Distinct()
            | 'GetWords' >> beam.Values()
            | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

        # Compute a mapping from each URI to the total number of words in the
        # document associated with that URI.
        uri_to_word_total = (
            uri_to_words
            | 'GetUris 2' >> beam.Keys()
            | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

        # Count, for each (URI, word) pair, the number of occurrences of that word
        # in the document associated with the URI.
        uri_and_word_to_count = (
            uri_to_words
            | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

        # Adjust the above collection to a mapping from (URI, word) pairs to counts
        # into an isomorphic mapping from URI to (word, count) pairs, to prepare
        # for a join by the URI key.
        def shift_keys(uri_word_count):
            return (uri_word_count[0][0], (uri_word_count[0][1],
                                           uri_word_count[1]))

        uri_to_word_and_count = (uri_and_word_to_count
                                 | 'ShiftKeys' >> beam.Map(shift_keys))

        # Perform a CoGroupByKey (a sort of pre-join) on the prepared
        # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and
        # 'word counts' strings. This yields a mapping from URI to a dictionary
        # that maps the above mentioned tag strings to an iterable containing the
        # word total for that URI and word and count respectively.
        #
        # A diagram (in which '[]' just means 'iterable'):
        #
        #   URI: {'word totals': [count],  # Total words within this URI's document.
        #         'word counts': [(word, count),  # Counts of specific words
        #                         (word, count),  # within this URI's document.
        #                         ... ]}
        uri_to_word_and_count_and_total = (
            {
                'word totals': uri_to_word_total,
                'word counts': uri_to_word_and_count
            }
            | 'CoGroupByUri' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, term frequency) pair for each
        # URI. A word's term frequency for a document is simply the number of times
        # that word occurs in the document divided by the total number of words in
        # the document.

        def compute_term_frequency(uri_count_and_total):
            (uri, count_and_total) = uri_count_and_total
            word_and_count = count_and_total['word counts']
            # We have an iterable for one element that we want extracted.
            [word_total] = count_and_total['word totals']
            for word, count in word_and_count:
                yield word, (uri, float(count) / word_total)

        word_to_uri_and_tf = (
            uri_to_word_and_count_and_total
            | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

        # Compute a mapping from each word to its document frequency.
        # A word's document frequency in a corpus is the number of
        # documents in which the word appears divided by the total
        # number of documents in the corpus.
        #
        # This calculation uses a side input, a Dataflow-computed auxiliary value
        # presented to each invocation of our MapFn lambda. The second argument to
        # the function (called total---note that the first argument is a tuple)
        # receives the value we listed after the lambda in Map(). Additional side
        # inputs (and ordinary Python values, too) can be provided to MapFns and
        # DoFns in this way.
        def div_word_count_by_total(word_count, total):
            (word, count) = word_count
            return (word, float(count) / total)

        word_to_df = (
            word_to_doc_count
            | 'ComputeDocFrequencies' >> beam.Map(
                div_word_count_by_total, AsSingleton(total_documents)))

        # Join the term frequency and document frequency collections,
        # each keyed on the word.
        word_to_uri_and_tf_and_df = (
            {
                'tf': word_to_uri_and_tf,
                'df': word_to_df
            }
            | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

        # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
        # There are a variety of definitions of TF-IDF
        # ("term frequency - inverse document frequency") score; here we use a
        # basic version that is the term frequency divided by the log of the
        # document frequency.

        def compute_tf_idf(word_tf_and_df):
            (word, tf_and_df) = word_tf_and_df
            [docf] = tf_and_df['df']
            for uri, tf in tf_and_df['tf']:
                yield word, (uri, tf * math.log(1 / docf))

        word_to_uri_and_tfidf = (
            word_to_uri_and_tf_and_df
            | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

        return word_to_uri_and_tfidf
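
A minimal usage sketch, assuming the expand() above belongs to a PTransform class named TfIdf (as in the Apache Beam tfidf example; the class declaration is not shown in the snippet):

import apache_beam as beam

with beam.Pipeline() as p:
    uri_to_content = p | 'CreateDocs' >> beam.Create([
        ('doc1.txt', 'the quick brown fox'),
        ('doc2.txt', 'the lazy dog'),
    ])
    word_to_uri_and_tfidf = uri_to_content | 'ComputeTfIdf' >> TfIdf()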
Code example #23
    def test_triggering_frequency(self, is_streaming, with_auto_sharding):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job
        bq_client.jobs.Insert.return_value = result_job

        # Insert a fake clock to work with auto-sharding which needs a processing
        # time timer.
        class _FakeClock(object):
            def __init__(self, now=time.time()):
                self._now = now

            def __call__(self):
                return self._now

        start_time = timestamp.Timestamp(0)
        bq_client.test_clock = _FakeClock(now=start_time)

        triggering_frequency = 20 if is_streaming else None
        transform = bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            temp_file_format=bigquery_tools.FileFormat.JSON,
            is_streaming_pipeline=is_streaming,
            triggering_frequency=triggering_frequency,
            with_auto_sharding=with_auto_sharding)

        # Need to test this with the DirectRunner to avoid serializing mocks
        with TestPipeline(
                runner='BundleBasedDirectRunner',
                options=StandardOptions(streaming=is_streaming)) as p:
            if is_streaming:
                _SIZE = len(_ELEMENTS)
                first_batch = [
                    TimestampedValue(value, start_time + i + 1)
                    for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
                ]
                second_batch = [
                    TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
                    for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
                ]
                # Advance processing time between batches of input elements to fire the
                # user triggers. Intentionally advance the processing time twice for the
                # auto-sharding case since we need to first fire the timer and then
                # fire the trigger.
                test_stream = (
                    TestStream().advance_watermark_to(start_time).add_elements(
                        first_batch).advance_processing_time(30).
                    advance_processing_time(30).add_elements(second_batch).
                    advance_processing_time(30).advance_processing_time(
                        30).advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)
            outputs = input | transform

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_job = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())
            jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

            # Check that all files exist.
            _ = (files
                 | beam.Map(
                     lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # Expect two load jobs are generated in the streaming case due to the
            # triggering frequency. Grouping is per trigger so we expect two entries
            # in the output as opposed to one.
            file_count = files | combiners.Count.Globally().without_defaults()
            expected_file_count = [1, 1] if is_streaming else [1]
            expected_destinations = [destination, destination
                                     ] if is_streaming else [destination]
            expected_jobs = [job_reference, job_reference
                             ] if is_streaming else [job_reference]
            assert_that(file_count,
                        equal_to(expected_file_count),
                        label='CountFiles')
            assert_that(destinations,
                        equal_to(expected_destinations),
                        label='CheckDestinations')
            assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')
Code example #24
    def test_parse_csv_lines(self,
                             input_lines,
                             column_names,
                             expected_csv_cells,
                             expected_types,
                             expected_record_batch,
                             skip_blank_lines=False,
                             schema=None,
                             delimiter=',',
                             multivalent_columns=None,
                             secondary_delimiter=None,
                             raw_record_column_name=None):
        def _check_csv_cells(actual):
            for i in range(len(actual)):
                self.assertEqual(expected_csv_cells[i], actual[i][0])
                self.assertEqual(input_lines[i], actual[i][1])

        def _check_types(actual):
            self.assertLen(actual, 1)
            self.assertCountEqual([
                csv_decoder.ColumnInfo(n, t)
                for n, t in zip(column_names, expected_types)
            ], actual[0])

        def _check_record_batches(actual):
            """Compares a list of pa.RecordBatch."""
            if actual:
                self.assertTrue(actual[0].equals(expected_record_batch))
            else:
                self.assertEqual(expected_record_batch, actual)

        def _check_arrow_schema(actual):
            for record_batch in actual:
                expected_arrow_schema = csv_decoder.GetArrowSchema(
                    column_names, schema, raw_record_column_name)
                self.assertEqual(record_batch.schema, expected_arrow_schema)

        with beam.Pipeline() as p:
            parsed_csv_cells_and_raw_records = (
                p | beam.Create(input_lines, reshuffle=False)
                | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=delimiter)))
            inferred_types = (
                parsed_csv_cells_and_raw_records
                | beam.Keys()
                | beam.CombineGlobally(
                    csv_decoder.ColumnTypeInferrer(
                        column_names,
                        skip_blank_lines=skip_blank_lines,
                        multivalent_columns=multivalent_columns,
                        secondary_delimiter=secondary_delimiter)))

            beam_test_util.assert_that(parsed_csv_cells_and_raw_records,
                                       _check_csv_cells,
                                       label='check_parsed_csv_cells')
            beam_test_util.assert_that(inferred_types,
                                       _check_types,
                                       label='check_types')

            record_batches = (
                parsed_csv_cells_and_raw_records
                | beam.BatchElements(min_batch_size=1000) | beam.ParDo(
                    csv_decoder.BatchedCSVRowsToRecordBatch(
                        skip_blank_lines=skip_blank_lines,
                        multivalent_columns=multivalent_columns,
                        secondary_delimiter=secondary_delimiter,
                        raw_record_column_name=raw_record_column_name),
                    beam.pvalue.AsSingleton(inferred_types)))
            beam_test_util.assert_that(record_batches,
                                       _check_record_batches,
                                       label='check_record_batches')
            if schema:
                beam_test_util.assert_that(record_batches,
                                           _check_arrow_schema,
                                           label='check_arrow_schema')

        # Testing CSVToRecordBatch
        with beam.Pipeline() as p:
            record_batches = (
                p
                | 'CreatingPColl' >> beam.Create(input_lines, reshuffle=False)
                | 'CSVToRecordBatch' >> csv_decoder.CSVToRecordBatch(
                    column_names=column_names,
                    delimiter=delimiter,
                    skip_blank_lines=skip_blank_lines,
                    desired_batch_size=1000,
                    schema=schema,
                    multivalent_columns=multivalent_columns,
                    secondary_delimiter=secondary_delimiter,
                    raw_record_column_name=raw_record_column_name))
            beam_test_util.assert_that(record_batches,
                                       _check_record_batches,
                                       label='check_record_batches')
Code example #25
def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    random_seed_for_testing: Optional[int] = None
) -> Tuple[beam.pvalue.DoOutputsTuple, beam.pvalue.PCollection]:
  """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including any
      additional metrics (see EvalSharedModel for more information on how to
      configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    compute_confidence_intervals: Set to True to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    Tuple of Tuple[PCollection of (slice key, metrics),
    PCollection of (slice key, plot metrics)] and
    PCollection of (slice_key and its example count).
  """
  # pylint: disable=no-value-for-parameter

  _ = (
      extracts.pipeline
      | counter_util.IncrementMetricsComputationCounters(
          eval_shared_model.add_metrics_callbacks))

  slices = (
      extracts
      # Downstream computation only cares about FPLs, so we prune before fanout.
      # Note that fanout itself will prune the slice keys.
      # TODO(b/130032676, b/111353165): Prune FPLs to contain only the necessary
      # set for the calculation of post_export_metrics if possible.
      | 'PruneExtracts' >> extractor.Filter(include=[
          constants.FEATURES_PREDICTIONS_LABELS_KEY,
          constants.SLICE_KEY_TYPES_KEY,
          constants.INPUT_KEY,
      ])
      # Input: one example at a time, with slice keys in extracts.
      # Output: one fpl example per slice key (notice that the example turns
      #         into n logical examples, references to which are replicated once
      #         per applicable slice key).
      | 'FanoutSlices' >> slicer.FanoutSlices())

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  aggregated_metrics = (
      slices
      # Metrics are computed per slice key.
      # Output: Multi-outputs, a dict of slice key to computed metrics, and
      # plots if applicable.
      | 'ComputePerSliceMetrics' >>
      poisson_bootstrap.ComputeWithConfidenceIntervals(
          aggregate.ComputePerSliceMetrics,
          num_bootstrap_samples=(poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES
                                 if compute_confidence_intervals else 1),
          random_seed_for_testing=random_seed_for_testing,
          eval_shared_model=eval_shared_model,
          desired_batch_size=desired_batch_size)
      | 'SeparateMetricsAndPlots' >> beam.ParDo(
          _SeparateMetricsAndPlotsFn()).with_outputs(
              _SeparateMetricsAndPlotsFn.OUTPUT_TAG_PLOTS,
              main=_SeparateMetricsAndPlotsFn.OUTPUT_TAG_METRICS))

  return (aggregated_metrics, slices_count)
Code example #26
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY,
        schema: Optional[schema_pb2.Schema] = None,
        random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
    """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used, all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics') or
    plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
    computations = []
    # Add default metric computations
    if eval_shared_models:
        for model_name, eval_shared_model in eval_shared_models.items():
            if not eval_shared_model.include_default_metrics:
                continue
            if eval_shared_model.model_type == constants.TF_KERAS:
                keras_specs = keras_util.metrics_specs_from_keras(
                    model_name, eval_shared_model.model_loader)
                metrics_specs = keras_specs + metrics_specs[:]
                # TODO(mdreves): Add support for calling keras.evaluate().
            elif (eval_shared_model.model_type == constants.TF_ESTIMATOR
                  and eval_constants.EVAL_TAG
                  in eval_shared_model.model_loader.tags):
                # Note that there is the possibility for metric naming collisions here
                # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
                # metric computation performed outside the model). Currently all the
                # overlapping metrics such as AUC that are computed outside the model
                # are all derived metrics so they will override the metrics calculated
                # by the model which is the desired behavior.
                computations.extend(
                    eval_saved_model_util.
                    metric_computations_using_eval_saved_model(
                        model_name, eval_shared_model.model_loader))
    # Add metric computations from specs
    computations_from_specs, derived_computations = (
        _filter_and_separate_computations(
            metric_specs.to_computations(metrics_specs,
                                         eval_config=eval_config,
                                         schema=schema)))
    computations.extend(computations_from_specs)

    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    extracts = (extracts
                | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list of examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    _ = (extracts.pipeline
         | 'IncrementMetricsSpecsCounters' >>
         counter_util.IncrementMetricsSpecsCounters(metrics_specs),
         slices_count
         | 'IncrementSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

    cross_slice_specs = []
    if eval_config.cross_slicing_specs:
        cross_slice_specs = eval_config.cross_slicing_specs

    # TODO(b/151482616): Make bootstrap and jackknife confidence interval
    # implementations more parallel.

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    sliced_metrics_and_plots = (
        slices
        |
        'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            baseline_model_name=baseline_model_name,
            cross_slice_specs=cross_slice_specs,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))

    if eval_config.options.min_slice_size.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilterSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.min_slice_size.value))

    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        |
        'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
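The 'FilterByMetrics'/'FilterByPlots' steps map `_filter_by_key_type` over each (slice key, results dict) pair; that helper is not shown in this snippet. A plausible minimal version, which keeps only the result entries whose key is an instance of the requested type, is sketched below. Note that the real helper likely needs to treat the MetricKey/PlotKey subclass relationship more carefully than a bare isinstance check does.

from typing import Any, Dict, Tuple, Type


def _filter_by_key_type(
    sliced_results: Tuple[Any, Dict[Any, Any]],
    key_type: Type[Any]) -> Tuple[Any, Dict[Any, Any]]:
  """Keeps only the entries of the results dict keyed by key_type (sketch)."""
  slice_key, results = sliced_results
  return slice_key, {k: v for k, v in results.items()
                     if isinstance(k, key_type)}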
Code example #27
0
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config_pb2.EvalConfig,
    metrics_specs: List[config_pb2.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    attributions_key: Text = constants.ATTRIBUTIONS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None,
    tensor_adapter_config: Optional[tensor_adapter.TensorAdapterConfig] = None
) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used, all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    attributions_key: Name to use for attributions key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.
    tensor_adapter_config: Tensor adapter config which specifies how to obtain
      tensors from the Arrow RecordBatch. The model's signature will be invoked
      with those tensors (matched by names). If None, an attempt will be made to
      create an adapter based on the model's input signature otherwise the model
      will be invoked with raw examples (assuming a signature of a single 1-D
      string tensor).

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics'),
    plots_key (e.g. 'plots'), or attributions_key (e.g. 'attributions')
    depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    # Note that there is the possibility for metric naming collisions here
    # (e.g. 'auc' calculated within the model as well as by AUC metric
    # computation performed outside the model). Currently all the overlapping
    # metrics such as AUC that are computed outside the model are all derived
    # metrics so they will override the metrics calculated by the model which is
    # the desired behavior.
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        computations.extend(
            keras_util.metric_computations_using_keras_saved_model(
                model_name, eval_shared_model.model_loader, eval_config,
                tensor_adapter_config))
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  metric_computations = _filter_and_separate_computations(
      metric_specs.to_computations(
          metrics_specs, eval_config=eval_config, schema=schema))
  computations.extend(metric_computations.non_derived_computations)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once per
  #         applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  model_types = _get_model_types_for_logging(eval_shared_models)

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs, model_types),
      slices_count
      |
      'IncrementSliceSpecCounters' >> counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  computations_combine_fn = _ComputationsCombineFn(computations=computations)
  derived_metrics_ptransform = _AddDerivedCrossSliceAndDiffMetrics(
      metric_computations.derived_computations,
      metric_computations.cross_slice_computations, cross_slice_specs,
      baseline_model_name)

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots/attributions).
  #         The dicts will be keyed by MetricKey/PlotKey/AttributionsKey and the
  #         values will be the result of the associated computations. A given
  #         MetricComputation can perform computations for multiple keys, but
  #         the keys should be unique across computations.
  if ci_params.num_bootstrap_samples:
    sliced_metrics_plots_and_attributions = (
        slices | 'PoissonBootstrapConfidenceIntervals' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            hot_key_fanout=_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  elif ci_params.num_jackknife_samples:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'JackknifeConfidenceIntervals' >>
        jackknife.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  else:
    sliced_metrics_plots_and_attributions = (
        slices
        |
        'CombineMetricsPerSlice' >> beam.CombinePerKey(computations_combine_fn)
        .with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT)
        | 'AddDerivedCrossSliceAndDiffMetrics' >> derived_metrics_ptransform)

  sliced_metrics_plots_and_attributions = (
      sliced_metrics_plots_and_attributions
      | 'AddCIDerivedMetrics' >> beam.Map(
          _add_ci_derived_metrics, metric_computations.ci_derived_computations))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_plots_and_attributions = (
        sliced_metrics_plots_and_attributions
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_plots_and_attributions
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_plots_and_attributions
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  sliced_attributions = (
      sliced_metrics_plots_and_attributions
      | 'FilterByAttributions' >> beam.Map(_filter_by_key_type,
                                           metric_types.AttributionsKey))

  # pylint: enable=no-value-for-parameter

  return {
      metrics_key: sliced_metrics,
      plots_key: sliced_plots,
      attributions_key: sliced_attributions
  }
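When neither bootstrap nor jackknife samples are requested above, the per-slice metrics reduce to a single CombinePerKey over the fanned-out slices, with hot-key fanout so that very large slices (such as the overall slice) are pre-combined on several workers. A toy version of just that branch, using `sum` as a stand-in for `_ComputationsCombineFn` and an assumed fanout value:

import apache_beam as beam

_HOT_KEY_FANOUT = 16  # stand-in for _COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([((), 1), ((), 1), ((('country', 'US'),), 1)])
      # Combine per slice key; fanout spreads hot keys such as the overall
      # (empty) slice across intermediate combiners before the final merge.
      | 'CombineMetricsPerSlice' >> beam.CombinePerKey(sum)
        .with_hot_key_fanout(_HOT_KEY_FANOUT)
      | beam.Map(print))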
Code example #28
0
File: DataflowRunnerBQ.py  Project: nikie/GCP
import re

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

pipeline_args = [
    # Placeholder project and runner options (replace with your own values).
    '--project=your-project-id',
    '--runner=DataflowRunner',
    #3. Google Cloud Storage path is required for staging local files
    '--staging_location=gs://your-bucket-name/staging',
    #4. Google Cloud Storage path is required for temporary files
    '--temp_location=gs://your-bucket-name/temp',
    #5. (Optional) Job name to be displayed in the logs
    '--job_name=word-count-bq-job'
]
pipeline_options = PipelineOptions(pipeline_args)
pipeline = beam.Pipeline(options=pipeline_options)

# Assumed BigQuery sink writing (word, count) rows to the table queried below.
sink = beam.io.BigQuerySink(
    'your-project-id:wordcount_dataset.wordcount_table',
    schema='word:STRING,count:INTEGER',
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

# Run Dataflow pipeline
read = pipeline | 'read file' >> beam.io.ReadFromText(
    'gs://dataflow-samples/shakespeare/kinglear.txt')
get_words = read | 'get words' >> beam.FlatMap(
    lambda x: re.findall(r'\w+', x)).with_output_types(str)
count_words = get_words | 'count words' >> beam.combiners.Count.PerElement()
# Keys and values are extracted for illustration only; they are not used below.
word = count_words | 'get keys' >> beam.Keys()
count = count_words | 'get values' >> beam.Values()
result = count_words | 'to dict' >> beam.Map(
    lambda word_count: {'word': word_count[0], 'count': word_count[1]})
save = result | 'save' >> beam.io.Write(sink)
pipeline.run().wait_until_finish()

# Query the 10 least frequent words (count ascending).
query = """
        SELECT word, count
        FROM `wordcount_dataset.wordcount_table`
        ORDER BY count ASC
        LIMIT 10;
        """
Code example #29
0
    def _load_data(self, partitions_using_temp_tables,
                   partitions_direct_to_destination, load_job_name_pcv,
                   schema_mod_job_name_pcv, copy_job_name_pcv, p, step_name):
        """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the actual
         appropriate destination table. This preserves atomicity when only
         some of the load jobs fail but not others: if any load job fails,
         the copy jobs are not triggered.
    """
        # Load data using temp tables
        trigger_loads_outputs = (
            partitions_using_temp_tables
            | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=True,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format,
                    step_name=step_name), load_job_name_pcv, *
                self.schema_side_inputs).with_outputs(
                    TriggerLoadJobs.TEMP_TABLES, main='main'))

        temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        finished_temp_tables_load_jobs_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForTempTableLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(temp_tables_load_job_ids_pc)))

        schema_mod_job_ids_pc = (
            finished_temp_tables_load_jobs_pc
            | beam.ParDo(
                UpdateDestinationSchema(
                    write_disposition=self.write_disposition,
                    test_client=self.test_client,
                    additional_bq_parameters=self.additional_bq_parameters,
                    step_name=step_name), schema_mod_job_name_pcv))

        finished_schema_mod_jobs_pc = (
            p
            | "ImpulseMonitorSchemaModJobs" >> beam.Create([None])
            | "WaitForSchemaModJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(schema_mod_job_ids_pc)))

        destination_copy_job_ids_pc = (
            finished_temp_tables_load_jobs_pc
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                test_client=self.test_client,
                                step_name=step_name), copy_job_name_pcv,
                pvalue.AsIter(finished_schema_mod_jobs_pc)))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            p
            | "RemoveTempTables/Impulse" >> beam.Create([None])
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda _, unused_copy_jobs, deleting_tables: deleting_tables,
                pvalue.AsIter(finished_copy_jobs_pc),
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Keys()
            | "RemoveTempTables/Delete" >> beam.ParDo(
                DeleteTablesFn(self.test_client)))

        # Load data directly to destination table
        destination_load_job_ids_pc = (
            partitions_direct_to_destination
            | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=False,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format,
                    step_name=step_name), load_job_name_pcv, *
                self.schema_side_inputs))

        _ = (p
             | "ImpulseMonitorDestinationLoadJobs" >> beam.Create([None])
             | "WaitForDestinationLoadJobs" >> beam.ParDo(
                 WaitForBQJobs(self.test_client),
                 pvalue.AsList(destination_load_job_ids_pc)))

        destination_load_job_ids_pc = (
            (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
            | beam.Flatten())

        return destination_load_job_ids_pc, destination_copy_job_ids_pc
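`_load_data` uses the same synchronization idiom several times: a single-element "impulse" PCollection feeds a DoFn that receives the job IDs as an AsList side input and only emits once those BigQuery jobs are done, so downstream steps effectively wait on them. A stripped-down sketch of the idiom, with a toy `WaitForJobs` DoFn standing in for `WaitForBQJobs`:

import apache_beam as beam
from apache_beam import pvalue


class WaitForJobs(beam.DoFn):
    """Toy stand-in for WaitForBQJobs: receives job ids via a side input."""

    def process(self, unused_impulse, job_ids):
        # A real implementation would poll the BigQuery jobs API until every
        # job in job_ids reaches the DONE state before emitting anything.
        for job_id in job_ids:
            yield job_id


with beam.Pipeline() as p:
    job_ids = p | 'FakeJobIds' >> beam.Create(['job-1', 'job-2'])
    finished = (
        p
        | 'ImpulseMonitorJobs' >> beam.Create([None])
        | 'WaitForJobs' >> beam.ParDo(WaitForJobs(),
                                      pvalue.AsList(job_ids)))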
Code example #30
0
    def expand(self, pcoll):
        """Computes top-k most frequent values and number of uniques."""
        # Convert input example to tuples of form
        # (slice_key, feature_name, feature_value_list, optional weight)
        # corresponding to each example.
        feature_values_with_weights = (
            pcoll
            | 'TopKUniques_ConvertInputToFeatureValuesWithWeights' >>
            beam.FlatMap(_convert_input_to_feature_values_with_weights,
                         categorical_features=self._categorical_features,
                         weight_feature=self._weight_feature))

        # Lambda to convert from ((slice_key, feature_name, feature_value), count)
        # to ((slice_key, feature_name), (feature_value, count))
        modify_key = (lambda x:
                      ((x[0][0], x[0][1]), FeatureValueCount(x[0][2], x[1])))

        # Key to order values.
        key_fn = lambda x: (x.count, x.feature_value)

        sliced_feature_name_value_count = (
            feature_values_with_weights
            # Flatten (slice_key, feature_name, feature_value_list, optional weight)
            # to (slice_key, feature_name, feature_value)
            | 'TopKUniques_FlattenToSlicedFeatureNameValueTuples' >>
            beam.FlatMap(_flatten_value_list)
            # Compute the frequency of each feature_value per slice. Output is a
            # PCollection of ((slice_key, feature_name, feature_value), count)
            | 'TopKUniques_CountSlicedFeatureNameValueTuple' >>
            beam.combiners.Count().PerElement()
            # Convert from ((slice_key, feature_name, feature_value), count) to
            # ((slice_key, feature_name), (feature_value, count))
            |
            'TopKUniques_ModifyKeyToSlicedFeatureName' >> beam.Map(modify_key))

        result_protos = []
        # Find topk values for each feature.
        topk = (
            sliced_feature_name_value_count
            # Obtain the top-k most frequent feature value for each feature in a
            # slice.
            | 'TopK_GetTopK' >> beam.combiners.Top.PerKey(
                max(self._num_top_values, self._num_rank_histogram_buckets),
                key=key_fn)
            | 'TopK_ConvertToSingleFeatureStats' >> beam.Map(
                _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                categorical_features=self._categorical_features,
                is_weighted_stats=False,
                num_top_values=self._num_top_values,
                frequency_threshold=self._frequency_threshold,
                num_rank_histogram_buckets=self._num_rank_histogram_buckets))

        result_protos.append(topk)

        # If a weight feature is provided, find the weighted topk values for each
        # feature.
        if self._weight_feature is not None:
            weighted_topk = (
                # Flatten (slice_key, feature_name, feature_value_list, weight) to
                # ((slice_key, feature_name, feature_value), weight)
                feature_values_with_weights
                | 'TopKWeighted_FlattenToSlicedFeatureNameValueTuples' >>
                beam.FlatMap(_flatten_weighted_value_list)
                # Sum the weights of each feature_value per slice. Output is a
                # PCollection of
                # ((slice_key, feature_name, feature_value), weighted_count)
                | 'TopKWeighted_CountSlicedFeatureNameValueTuple' >>
                beam.CombinePerKey(sum)
                # Convert from
                # ((slice_key, feature_name, feature_value), weighted_count) to
                # ((slice_key, feature_name), (feature_value, weighted_count))
                | 'TopKWeighted_ModifyKeyToSlicedFeatureName' >>
                beam.Map(modify_key)
                # Obtain the top-k most frequent feature value for each feature in a
                # slice.
                | 'TopKWeighted_GetTopK' >> beam.combiners.Top().PerKey(
                    max(self._num_top_values,
                        self._num_rank_histogram_buckets),
                    key=key_fn)
                | 'TopKWeighted_ConvertToSingleFeatureStats' >> beam.Map(
                    _make_dataset_feature_stats_proto_with_topk_for_single_feature,
                    categorical_features=self._categorical_features,
                    is_weighted_stats=True,
                    num_top_values=self._num_top_values,
                    frequency_threshold=self._weighted_frequency_threshold,
                    num_rank_histogram_buckets=self._num_rank_histogram_buckets
                ))
            result_protos.append(weighted_topk)

        uniques = (
            sliced_feature_name_value_count
            # Drop the values to only have the slice_key and feature_name with
            # each repeated the number of unique values times.
            | 'Uniques_DropValues' >> beam.Keys()
            | 'Uniques_CountPerFeatureName' >>
            beam.combiners.Count().PerElement()
            | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map(
                _make_dataset_feature_stats_proto_with_uniques_for_single_feature,
                categorical_features=self._categorical_features))
        result_protos.append(uniques)

        def _deserialize_sliced_feature_stats_proto(entry):
            feature_stats_proto = statistics_pb2.DatasetFeatureStatistics()
            feature_stats_proto.ParseFromString(entry[1])
            return entry[0], feature_stats_proto

        return (
            result_protos
            | 'FlattenTopKUniquesResults' >> beam.Flatten()
            # TODO(b/121152126): This deserialization stage is a workaround.
            # Remove this once it is no longer needed.
            | 'DeserializeTopKUniquesFeatureStatsProto' >>
            beam.Map(_deserialize_sliced_feature_stats_proto))
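The top-k path above keys the value counts by (slice_key, feature_name) and ranks the (feature_value, count) pairs with beam.combiners.Top.PerKey and a custom key function. A self-contained toy version of just that ranking step, using made-up slice/feature/value data:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([
            (('slice_a', 'color'), ('red', 5)),
            (('slice_a', 'color'), ('blue', 3)),
            (('slice_a', 'color'), ('green', 3)),
        ])
        # Keep the 2 most frequent values per (slice_key, feature_name),
        # ranking by count and breaking ties by the value itself.
        | 'GetTopK' >> beam.combiners.Top.PerKey(
            2, key=lambda value_count: (value_count[1], value_count[0]))
        | beam.Map(print))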