Example #1
  def test_streaming_wordcount(self):
    class WordExtractingDoFn(beam.DoFn):
      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(),
        options=StandardOptions(streaming=True))

    data = (
        p
        | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
            .advance_watermark_to(20)
            .advance_processing_time(1)
            .add_elements(['that', 'is', 'the', 'question'])
        | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

    counts = (
        data
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that referenced PCollections
    # will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # Create a fake limiter that cancels the BCJ once the main job receives the
    # expected number of results.
    class FakeLimiter:
      def __init__(self, p, pcoll):
        self.p = p
        self.pcoll = pcoll

      def is_triggered(self):
        result = ie.current_env().pipeline_result(self.p)
        if result:
          try:
            results = result.get(self.pcoll)
          except ValueError:
            return False
          return len(results) >= 10
        return False

    # This sets the limiters to stop reading when the test receives 10 elements.
    ie.current_env().options.capture_control.set_limiters_for_test(
        [FakeLimiter(p, data)])

    # This tests that the data was correctly cached.
    pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
    expected_data_df = pd.DataFrame([
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('or', 0, [IntervalWindow(0, 10)], pane_info),
        ('not', 0, [IntervalWindow(0, 10)], pane_info),
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
    ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

    data_df = ib.collect(data, include_window_info=True)
    pd.testing.assert_frame_equal(expected_data_df, data_df)

    # This tests that the windowing was passed correctly so that all the data
    # is also aggregated correctly.
    pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
    expected_counts_df = pd.DataFrame([
        ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
    ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

    counts_df = ib.collect(counts, include_window_info=True)

    # GroupByKey provides no ordering guarantee, so we post-process the DF by
    # sorting it before testing for equality.
    sorted_counts_df = (counts_df
                        .sort_values(['event_time', 0], ascending=True)
                        .reset_index(drop=True)) # yapf: disable
    pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
Example #2
    def assertAnalyzeAndTransformResults(self,
                                         input_data,
                                         input_metadata,
                                         preprocessing_fn,
                                         expected_data=None,
                                         expected_metadata=None,
                                         expected_vocab_file_contents=None,
                                         expected_asset_file_contents=None,
                                         test_data=None,
                                         desired_batch_size=None,
                                         beam_pipeline=None,
                                         temp_dir=None):
        """Assert that input data and metadata is transformed as expected.

    This methods asserts transformed data and transformed metadata match
    with expected_data and expected_metadata.

    Args:
      input_data: A sequence of dicts whose values are
          either strings, lists of strings, numeric types or a pair of those.
      input_metadata: DatasetMetadata describing input_data.
      preprocessing_fn: A function taking a dict of tensors and returning
          a dict of tensors.
      expected_data: (optional) A dataset with the same type constraints as
          input_data, but representing the output after transformation.
          If supplied, transformed data is asserted to be equal.
      expected_metadata: (optional) DatasetMetadata describing the transformed
          data. If supplied, transformed metadata is asserted to be equal.
      expected_vocab_file_contents: (optional) A dictionary from vocab filenames
          to their expected content as a list of text lines or a list of tuples
          of frequency and text. Values should be the expected result of calling
          f.readlines() on the given asset files.
      expected_asset_file_contents: deprecated.  Use
          expected_vocab_file_contents.
      test_data: (optional) If this is provided then instead of calling
          AnalyzeAndTransformDataset with input_data, this function will call
          AnalyzeDataset with input_data and TransformDataset with test_data.
          Note that this is the case even if input_data and test_data are equal.
          test_data should also conform to input_metadata.
      desired_batch_size: (optional) A batch size to batch elements by. If not
          provided, a batch size will be computed automatically.
      beam_pipeline: (optional) A Beam Pipeline to use in this test.
      temp_dir: If set, it is used as output directory, else a new unique
          directory is created.
    Raises:
      AssertionError: if the expected data does not match the results of
          transforming input_data according to preprocessing_fn, or
          (if provided) if the expected metadata does not match.
      ValueError: if expected_vocab_file_contents and
          expected_asset_file_contents are both set.
    """
        if (expected_vocab_file_contents is not None
                and expected_asset_file_contents is not None):
            raise ValueError('only one of expected_vocab_file_contents and '
                             'expected_asset_file_contents should be set')
        elif expected_asset_file_contents is not None:
            tf.compat.v1.logging.warn(
                'expected_asset_file_contents is deprecated, use '
                'expected_vocab_file_contents')

        expected_vocab_file_contents = (expected_vocab_file_contents
                                        or expected_asset_file_contents or {})
        del expected_asset_file_contents

        # Note: we don't separately test AnalyzeDataset and TransformDataset as
        # AnalyzeAndTransformDataset currently simply composes these two
        # transforms.  If in future versions of the code, the implementation
        # differs, we should also run AnalyzeDataset and TransformDataset composed.
        temp_dir = temp_dir or tempfile.mkdtemp(prefix=self._testMethodName,
                                                dir=self.get_temp_dir())
        with beam_pipeline or self._makeTestPipeline() as pipeline:
            with beam_impl.Context(temp_dir=temp_dir,
                                   desired_batch_size=desired_batch_size):
                input_data = pipeline | 'CreateInput' >> beam.Create(
                    input_data)
                if test_data is None:
                    (transformed_data, transformed_metadata), transform_fn = (
                        (input_data, input_metadata)
                        |
                        beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))
                else:
                    transform_fn = ((input_data, input_metadata)
                                    |
                                    beam_impl.AnalyzeDataset(preprocessing_fn))
                    test_data = pipeline | 'CreateTest' >> beam.Create(
                        test_data)
                    transformed_data, transformed_metadata = (
                        ((test_data, input_metadata), transform_fn)
                        | beam_impl.TransformDataset())

                # Write transform_fn so we can test its assets
                _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

                if expected_data is not None:
                    transformed_data_coder = tft.coders.ExampleProtoCoder(
                        transformed_metadata.schema)

                    transformed_data_path = os.path.join(
                        temp_dir, 'transformed_data')
                    _ = (transformed_data
                         | beam.Map(transformed_data_coder.encode)
                         | beam.io.tfrecordio.WriteToTFRecord(
                             transformed_data_path, shard_name_template=''))

        # TODO(ebreck) Log transformed_data somewhere.
        if expected_data is not None:
            examples = tf.compat.v1.python_io.tf_record_iterator(
                path=transformed_data_path)
            transformed_data = [
                transformed_data_coder.decode(x) for x in examples
            ]
            self.assertDataCloseOrEqual(expected_data, transformed_data)

        tf_transform_output = tft.TFTransformOutput(temp_dir)
        if expected_metadata:
            # Make a copy with no annotations.
            transformed_schema = schema_pb2.Schema()
            transformed_schema.CopyFrom(
                tf_transform_output.transformed_metadata.schema)
            for feature in transformed_schema.feature:
                feature.ClearField('annotation')
            self.assertEqual(expected_metadata.schema, transformed_schema)

        for filename, file_contents in six.iteritems(
                expected_vocab_file_contents):
            full_filename = tf_transform_output.vocabulary_file_by_name(
                filename)
            self.AssertVocabularyContents(full_filename, file_contents)
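For orientation, a hypothetical test method using this helper might look like the sketch below; the feature spec, preprocessing_fn, and expected values are assumptions for illustration, and `tf`, `tft`, `dataset_metadata`, and `schema_utils` are assumed to be imported in the surrounding module.

    def test_scale_to_0_1(self):
        # Hypothetical example: scale a single float feature into [0, 1].
        def preprocessing_fn(inputs):
            return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

        input_metadata = dataset_metadata.DatasetMetadata(
            schema_utils.schema_from_feature_spec(
                {'x': tf.io.FixedLenFeature([], tf.float32)}))
        input_data = [{'x': 1.0}, {'x': 2.0}, {'x': 3.0}]
        expected_data = [{'x_scaled': 0.0}, {'x_scaled': 0.5}, {'x_scaled': 1.0}]

        self.assertAnalyzeAndTransformResults(
            input_data, input_metadata, preprocessing_fn, expected_data)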
Example #3
def InputsToExtracts(  # pylint: disable=invalid-name
    inputs: beam.pvalue.PCollection):
  """Converts serialized inputs (e.g. examples) to Extracts."""
  return (inputs
          | 'AddInputKey' >> beam.Map(lambda x: {constants.INPUT_KEY: x}))

    _ = train_data | 'TransformAndWriteTraining' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        'features_train')

    _ = eval_data | 'TransformAndWriteEval' >> TransformAndWrite(  # pylint: disable=no-value-for-parameter
        'features_eval')

    # TODO(b/35300113) Remember to eventually also save the statistics.

    # Save files for online and batch prediction.
    prediction_schema = movielens.make_prediction_schema()
    prediction_coder = tft_coders.ExampleProtoCoder(prediction_schema)
    prediction_data = (eval_data
                       |
                       'EncodePrediction' >> beam.Map(prediction_coder.encode))
    _ = (prediction_data
         | 'EncodePredictionAsB64Json' >> beam.Map(_encode_as_b64_json)
         | 'WritePredictDataAsText' >> beam.io.WriteToText(
             os.path.join(args.output_dir, 'features_predict'),
             file_name_suffix='.txt'))
    _ = (prediction_data
         | 'WritePredictDataAsTfRecord' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir, 'features_predict'),
             file_name_suffix='.tfrecord.gz'))


def _encode_as_b64_json(serialized_example):
    import base64  # pylint: disable=g-import-not-at-top
    import json  # pylint: disable=g-import-not-at-top
    return json.dumps(
        {'b64': base64.b64encode(serialized_example).decode('utf-8')})
Example #5
        def test_metrics(self):
            """Run a simple DoFn that increments a counter and verifies state
      caching metrics. Verifies that the expected values are written to a
      temporary file by the FileReporter."""

            counter_name = 'elem_counter'
            state_spec = userstate.BagStateSpec('state', VarIntCoder())

            class DoFn(beam.DoFn):
                def __init__(self):
                    self.counter = Metrics.counter(self.__class__,
                                                   counter_name)
                    logging.info('counter: %s' % self.counter.metric_name)

                def process(self, kv, state=beam.DoFn.StateParam(state_spec)):
                    # Trigger materialization
                    list(state.read())
                    state.add(1)
                    self.counter.inc()

            options = self.create_options()
            # Test only supports parallelism of 1
            options._all_options['parallelism'] = 1
            # Create multiple bundles to test cache metrics
            options._all_options['max_bundle_size'] = 10
            options._all_options['max_bundle_time_millis'] = 95130590130
            experiments = options.view_as(DebugOptions).experiments or []
            experiments.append('state_cache_size=123')
            options.view_as(DebugOptions).experiments = experiments
            with Pipeline(self.get_runner(), options) as p:
                # pylint: disable=expression-not-assigned
                (p
                 | "create" >> beam.Create(list(range(0, 110)))
                 | "mapper" >> beam.Map(lambda x: (x % 10, 'val'))
                 | "stateful" >> beam.ParDo(DoFn()))

            lines_expected = {'counter: 110'}
            if streaming:
                lines_expected.update([
                    # Gauges for the last finished bundle
                    'stateful.beam.metric:statecache:capacity: 123',
                    # These are off by 10 because the first bundle contains all the keys
                    # once. Caching is only initialized after the first bundle. Caching
                    # depends on the cache token which is lazily initialized by the
                    # Runner's StateRequestHandlers.
                    'stateful.beam.metric:statecache:size: 10',
                    'stateful.beam.metric:statecache:get: 10',
                    'stateful.beam.metric:statecache:miss: 0',
                    'stateful.beam.metric:statecache:hit: 10',
                    'stateful.beam.metric:statecache:put: 0',
                    'stateful.beam.metric:statecache:extend: 10',
                    'stateful.beam.metric:statecache:evict: 0',
                    # Counters
                    # (total of get/hit will be off by 10 due to the caching
                    # only getting initialized after the first bundle.
                    # Caching depends on the cache token which is lazily
                    # initialized by the Runner's StateRequestHandlers).
                    'stateful.beam.metric:statecache:get_total: 100',
                    'stateful.beam.metric:statecache:miss_total: 10',
                    'stateful.beam.metric:statecache:hit_total: 90',
                    'stateful.beam.metric:statecache:put_total: 10',
                    'stateful.beam.metric:statecache:extend_total: 100',
                    'stateful.beam.metric:statecache:evict_total: 0',
                ])
            else:
                # Batch has a different processing model. All values for
                # a key are processed at once.
                lines_expected.update([
                    # Gauges
                    'stateful).beam.metric:statecache:capacity: 123',
                    # For the first key, the cache token will not be set yet.
                    # It's lazily initialized after first access in StateRequestHandlers
                    'stateful).beam.metric:statecache:size: 9',
                    # We have 11 here because there are 110 / 10 elements per key
                    'stateful).beam.metric:statecache:get: 11',
                    'stateful).beam.metric:statecache:miss: 1',
                    'stateful).beam.metric:statecache:hit: 10',
                    # State is flushed back once per key
                    'stateful).beam.metric:statecache:put: 1',
                    'stateful).beam.metric:statecache:extend: 1',
                    'stateful).beam.metric:statecache:evict: 0',
                    # Counters
                    'stateful).beam.metric:statecache:get_total: 99',
                    'stateful).beam.metric:statecache:miss_total: 9',
                    'stateful).beam.metric:statecache:hit_total: 90',
                    'stateful).beam.metric:statecache:put_total: 9',
                    'stateful).beam.metric:statecache:extend_total: 9',
                    'stateful).beam.metric:statecache:evict_total: 0',
                ])
            lines_actual = set()
            with open(self.test_metrics_path, 'r') as f:
                line = f.readline()
                while line:
                    for metric_str in lines_expected:
                        if metric_str in line:
                            lines_actual.add(metric_str)
                    line = f.readline()
            self.assertSetEqual(lines_actual, lines_expected)
    def testMultiClassMetrics(self, metric_name, expected_value):
        computations = tf_metric_wrapper.tf_metric_computations(
            [self._tf_metric_by_name(metric_name)], config.EvalConfig())
        histogram = computations[0]
        matrix = computations[1]
        metric = computations[2]

        example1 = {
            'labels': np.array([2]),
            'predictions': np.array([0.1, 0.2, 0.1, 0.25, 0.35]),
            'example_weights': np.array([0.5]),
        }
        example2 = {
            'labels': np.array([1]),
            'predictions': np.array([0.2, 0.3, 0.05, 0.15, 0.3]),
            'example_weights': np.array([0.7]),
        }
        example3 = {
            'labels': np.array([3]),
            'predictions': np.array([0.01, 0.2, 0.09, 0.5, 0.2]),
            'example_weights': np.array([0.9]),
        }
        example4 = {
            'labels': np.array([4]),
            'predictions': np.array([0.3, 0.2, 0.05, 0.4, 0.05]),
            'example_weights': np.array([0.3]),
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                | 'ComputeConfusionMatrix' >> beam.Map(
                    lambda x: (x[0], matrix.result(x[1])))  # pyformat: disable
                | 'ComputeMetric' >> beam.Map(lambda x:
                                              (x[0], metric.result(x[1])))
            )  # pyformat: disable

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    top_k = int(metric_name.split('@')[1])
                    key = metric_types.MetricKey(
                        name=metric_name,
                        sub_key=metric_types.SubKey(top_k=top_k))
                    self.assertDictElementsAlmostEqual(got_metrics,
                                                       {key: expected_value},
                                                       places=5)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
    def testWithDefaultMetricsProvidedByModel(self):
        export_dir = os.path.join(self._getTempDir(), 'export_dir')
        dummy_layer = tf.keras.layers.Input(shape=(1, ))
        model = tf.keras.models.Model([dummy_layer], [dummy_layer])
        model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                      metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
        model.save(export_dir, save_format='tf')
        model_loader = types.ModelLoader(
            tags=[tf.saved_model.SERVING],
            construct_fn=model_util.model_construct_fn(
                eval_saved_model_path=export_dir,
                tags=[tf.saved_model.SERVING]))

        computations = tf_metric_wrapper.tf_metric_computations(
            [tf.keras.metrics.AUC(name='auc')],
            config.EvalConfig(),
            model_loader=model_loader)

        confusion_histogram = computations[0]
        confusion_matrix = computations[1].result
        confusion_metrics = computations[2].result
        non_confusion_metrics = computations[3]

        example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.0]),
            'example_weights': np.array([1.0]),
        }
        example2 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([1.0]),
        }
        example3 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.3]),
            'example_weights': np.array([1.0]),
        }
        example4 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.9]),
            'example_weights': np.array([1.0]),
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            sliced_examples = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x)))

            confusion_result = (
                sliced_examples
                | 'ComputeHistogram' >> beam.CombinePerKey(
                    confusion_histogram.combiner)
                | 'ComputeConfusionMatrix' >>
                beam.Map(lambda x:
                         (x[0], confusion_matrix(x[1])))  # pyformat: disable
                | 'ComputeMetric' >> beam.Map(lambda x:
                                              (x[0], confusion_metrics(x[1])))
            )  # pyformat: disable

            non_confusion_result = (sliced_examples
                                    | 'Combine' >> beam.CombinePerKey(
                                        non_confusion_metrics.combiner))

            # pylint: enable=no-value-for-parameter

            def check_confusion_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    auc_key = metric_types.MetricKey(name='auc')
                    self.assertDictElementsAlmostEqual(got_metrics,
                                                       {auc_key: 0.75},
                                                       places=5)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            def check_non_confusion_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    mse_key = metric_types.MetricKey(name='mse')
                    binary_crossentropy_key = metric_types.MetricKey(
                        name='binary_crossentropy')
                    self.assertDictElementsAlmostEqual(
                        got_metrics, {
                            mse_key: 0.1875,
                            binary_crossentropy_key: 0.0
                        },
                        places=5)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(confusion_result,
                             check_confusion_result,
                             label='confusion')
            util.assert_that(non_confusion_result,
                             check_non_confusion_result,
                             label='non_confusion')
Example #8
def shuffle(pcoll):  # pylint: disable=invalid-name
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
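Example #9 below applies this helper as `'ShuffleData' >> shuffle()`, which only works when the function is registered as a composite transform. A minimal sketch of that registration (an assumption about the surrounding module, not shown in this snippet):

import random

import apache_beam as beam


@beam.ptransform_fn
def shuffle(pcoll):  # pylint: disable=invalid-name
    """Shuffles a PCollection by pairing each element with a random key."""
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))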
Example #9
def preprocess(pipeline, args):
    input_metadata = metadata_io.read_metadata(
        os.path.join(args.analyze_output_dir, RAW_METADATA_DIR))

    schema = json.loads(
        file_io.read_file_to_string(
            os.path.join(args.analyze_output_dir, SCHEMA_FILE)).decode())
    features = json.loads(
        file_io.read_file_to_string(
            os.path.join(args.analyze_output_dir, FEATURES_FILE)).decode())

    column_names = [col['name'] for col in schema]

    exclude_outputs = None
    if not args.target:
        for name, transform in six.iteritems(features):
            if transform['transform'] == TARGET_TRANSFORM:
                target_name = name
                column_names.remove(target_name)
                exclude_outputs = [target_name]
                del input_metadata.schema.column_schemas[target_name]
                break

    if args.csv_file_pattern:
        coder = coders.CsvCoder(column_names,
                                input_metadata.schema,
                                delimiter=',')
        raw_data = (
            pipeline
            | 'ReadCsvData' >> beam.io.ReadFromText(args.csv_file_pattern)
            | 'ParseCsvData' >> beam.Map(coder.decode))
    else:
        columns = ', '.join(column_names)
        query = 'SELECT {columns} FROM `{table}`'.format(
            columns=columns, table=args.bigquery_table)
        raw_data = (
            pipeline
            | 'ReadBiqQueryData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

    # Note that prepare_image_transforms does not make embeddings; it just reads
    # the image files and converts them to byte strings. tft.TransformDataset()
    # will apply the saved model that makes the image embeddings.
    image_columns = image_transform_columns(features)
    raw_data = (raw_data
                | 'PreprocessTransferredLearningTransformations' >> beam.Map(
                    prepare_image_transforms, image_columns))

    if args.shuffle:
        raw_data = raw_data | 'ShuffleData' >> shuffle()

    transform_fn = (pipeline
                    | 'ReadTransformFn' >> tft_beam_io.ReadTransformFn(
                        args.analyze_output_dir))

    (transformed_data,
     transform_metadata) = (((raw_data, input_metadata), transform_fn)
                            | 'ApplyTensorflowPreprocessingGraph' >>
                            tft.TransformDataset(exclude_outputs))

    tfexample_coder = coders.ExampleProtoCoder(transform_metadata.schema)
    _ = (transformed_data
         | 'SerializeExamples' >> beam.Map(tfexample_coder.encode)
         | 'WriteExamples' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir, args.output_filename_prefix),
             file_name_suffix='.tfrecord.gz'))
Example #10
 def expand(self, dataset):
     return (dataset
             | 'DetectAnomaliesInExamples' >> beam.Map(
                 _detect_anomalies_in_example, options=self.options)
             | 'GenerateAnomalyReasonKeys' >> beam.ParDo(
                 _GenerateAnomalyReasonSliceKeys()))
Example #11
File: executor.py Project: zilongqiu/zenml
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """
        Write description regarding this beautiful executor.

        Args:
            input_dict:
            output_dict:
            exec_properties:
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        schema = parse_schema(input_dict=input_dict)

        statistics = parse_statistics(
            split_name=DATA_SPLIT_NAME,
            statistics=input_dict[constants.STATISTICS])

        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        # pass the schema and stats straight to the Step
        args[constants.SCHEMA] = schema
        args[constants.STATISTICS] = statistics

        c = source_utils.load_source_path_class(source)
        split_step: BaseSplit = c(**args)

        # infer the names of the splits from the config
        split_names = split_step.get_split_names()

        # Get output split path
        examples_artifact = artifact_utils.get_single_instance(
            output_dict[constants.OUTPUT_EXAMPLES])
        if SKIP in split_names:
            sanitized_names = [name for name in split_names if name != SKIP]
            examples_artifact.split_names = artifact_utils.encode_split_names(
                sanitized_names)
        else:
            examples_artifact.split_names = artifact_utils.encode_split_names(
                split_names)

        split_uris = []
        for artifact in input_dict[constants.INPUT_EXAMPLES]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))

        with self._make_beam_pipeline() as p:
            # The outer loop will for now only run once
            for split, uri in split_uris:
                input_uri = io_utils.all_files_pattern(uri)

                new_splits = (
                    p
                    | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
                        file_pattern=input_uri)
                    | beam.Map(tf.train.Example.FromString)
                    | 'Split' >> beam.Partition(
                        split_step.partition_fn()[0],
                        split_step.get_num_splits(),
                        **split_step.partition_fn()[1]))

                for split_name, new_split in zip(split_names,
                                                 list(new_splits)):
                    if split_name != SKIP:
                        # WriteSplit function writes to TFRecord again
                        (new_split
                         | 'Serialize.' + split_name >> beam.Map(
                             lambda x: x.SerializeToString())
                         | 'WriteSplit_' + split_name >> WriteSplit(
                             get_split_uri(
                                 output_dict[constants.OUTPUT_EXAMPLES],
                                 split_name)))
Example #12
 def expand(self, pcoll):
     return pcoll | 'TestLabel' >> beam.Map(lambda x: 'Simple(%s)' % x)
Example #13
 def expand(self, pcoll):
     return pcoll | beam.Map(lambda x, s: x + s, self._payload)
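For context, a minimal composite transform wrapping this expand method, plus a usage sketch, might look like the following; the class name and payload value are assumptions for illustration only.

import apache_beam as beam


class AppendPayload(beam.PTransform):
    """Appends a fixed payload string to every element."""

    def __init__(self, payload):
        super().__init__()
        self._payload = payload

    def expand(self, pcoll):
        return pcoll | beam.Map(lambda x, s: x + s, self._payload)


with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['a', 'b'])
         | AppendPayload('-suffix')  # yields 'a-suffix', 'b-suffix'
         | beam.Map(print))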
Example #14
  def test_wordcount(self):
    class WordExtractingDoFn(beam.DoFn):
      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be cached.
    ib.watch(locals())

    result = p.run()
    result.wait_until_finish()

    actual = list(result.get(counts))
    self.assertSetEqual(
        set(actual),
        set([
            ('or', 1),
            ('that', 1),
            ('be', 2),
            ('is', 1),
            ('question', 1),
            ('to', 2),
            ('the', 1),
            ('not', 1),
        ]))

    # Truncate the precision to millis because the window coder uses millis
    # as units then gets upcast to micros.
    end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
    df_counts = ib.collect(counts, include_window_info=True)
    df_expected = pd.DataFrame({
        0: [e[0] for e in actual],
        1: [e[1] for e in actual],
        'event_time': [end_of_window for _ in actual],
        'windows': [[GlobalWindow()] for _ in actual],
        'pane_info': [
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0) for _ in actual
        ]
    },
                               columns=[
                                   0, 1, 'event_time', 'windows', 'pane_info'
                               ])

    pd.testing.assert_frame_equal(df_expected, df_counts)

    actual_reified = result.get(counts, include_window_info=True)
    expected_reified = [
        WindowedValue(
            e,
            Timestamp(micros=end_of_window), [GlobalWindow()],
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)) for e in actual
    ]
    self.assertEqual(actual_reified, expected_reified)
Example #15
  def test_instrument_example_unbounded_pipeline_to_read_cache_not_cached(self):
    """Tests that the instrumenter works when the PCollection is not cached.
    """
    # Create a new interactive environment to make the test idempotent.
    ie.new_env(cache_manager=streaming_cache.StreamingCache(cache_dir=None))

    # Create the pipeline that will be instrumented.
    from apache_beam.options.pipeline_options import StandardOptions
    options = StandardOptions(streaming=True)
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner(), options)
    source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    # pylint: disable=possibly-unused-variable
    pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)

    # Watch but do not cache the PCollections.
    ib.watch(locals())

    # Instrument the original pipeline to create the pipeline the user will see.
    p_copy = beam.Pipeline.from_runner_api(
        p_original.to_runner_api(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)
    instrumenter = instr.build_pipeline_instrument(p_copy)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)

    # Now, build the expected pipeline which replaces the unbounded source with
    # a TestStream.
    source_1_cache_key = self.cache_key_of('source_1', source_1)
    p_expected = beam.Pipeline()
    test_stream = (
        p_expected
        | TestStream(output_tags=[self.cache_key_of('source_1', source_1)]))
    # pylint: disable=expression-not-assigned
    (
        test_stream
        | 'square1' >> beam.Map(lambda x: x * x)
        | 'reify' >> beam.Map(lambda _: _)
        | cache.WriteCache(ie.current_env().cache_manager(), 'unused'))

    # Test that the TestStream is outputting to the correct PCollection.
    class TestStreamVisitor(PipelineVisitor):
      def __init__(self):
        self.output_tags = set()

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        transform = transform_node.transform
        if isinstance(transform, TestStream):
          self.output_tags = transform.output_tags

    v = TestStreamVisitor()
    actual_pipeline.visit(v)
    expected_output_tags = set([source_1_cache_key])
    actual_output_tags = v.output_tags
    self.assertSetEqual(expected_output_tags, actual_output_tags)

    # Test that the pipeline is as expected.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(use_fake_coders=True),
        instrumenter.instrumented_pipeline_proto())
def run_pipeline(mae_input_pattern, mae_golden_dir, results_dir,
                 mae_input_query, mae_golden_table,
                 write_per_note_stats_to_gcs, results_table,
                 per_note_results_table, debug_output_table, types_to_ignore,
                 pipeline_args):
    """Evaluate the input files against the goldens."""
    if ((mae_input_pattern is None) == (mae_input_query is None)
            or (mae_golden_dir is None) == (mae_golden_table is None)
            or (mae_input_query is None) != (mae_golden_table is None)
            or (mae_input_pattern is None) != (mae_golden_dir is None)):
        return [
            'Must set exactly one of: '
            '(--mae_input_pattern AND --mae_golden_dir) '
            'OR (--mae_input_query AND --mae_golden_table).'
        ]

    if write_per_note_stats_to_gcs and not results_dir:
        return [
            'Must set --results_dir when --write_per_note_stats_to_gcs is set.'
        ]

    logging.info('Starting evaluation.')

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    if mae_input_pattern:
        filenames = []
        storage_client = storage.Client()
        for f in gcsutil.find_files(mae_input_pattern, storage_client):
            if posixpath.dirname(
                    f.string()) != posixpath.dirname(mae_input_pattern):
                # Ignore subdirectories.
                continue
            filenames.append(f)

    per_note_results = None
    if mae_input_query and mae_golden_table:
        query_template = (
            'SELECT findings.record_id, findings.xml, golden.xml '
            'FROM ({}) AS findings '
            'LEFT JOIN [{}] AS golden '
            'ON findings.record_id=golden.record_id')
        query = query_template.format(mae_input_query, mae_golden_table)
        per_note_results = (p
                            | beam.io.Read(beam.io.BigQuerySource(query=query))
                            | beam.Map(compare_bq_row, types_to_ignore))
    else:
        per_note_results = (p | beam.Create(filenames) | beam.Map(
            compare, mae_golden_dir, types_to_ignore))
    now = str(_get_utcnow())
    if debug_output_table:
        _ = (per_note_results | beam.FlatMap(
            format_debug_info, now
        ) | 'write_debug_info' >> beam.io.Write(
            beam.io.BigQuerySink(
                debug_output_table,
                schema=(
                    'record_id:STRING,classification:STRING,info_type:STRING,'
                    'text:STRING,context:STRING,start:INTEGER,end:INTEGER,'
                    'timestamp:TIMESTAMP'),
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

    if per_note_results_table:
        _ = (per_note_results | beam.Map(
            format_individual_result_for_bq, now
        ) | 'write_per_note' >> beam.io.Write(
            beam.io.BigQuerySink(
                per_note_results_table,
                schema=('record_id:STRING,' + BASE_SCHEMA),
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))
    aggregate_results = (per_note_results
                         | beam.CombineGlobally(CombineResultsFn()))
    if results_dir:
        _ = (aggregate_results
             | beam.Map(write_aggregate_results_to_gcs, results_dir))
    if results_table:
        _ = (aggregate_results | beam.FlatMap(
            format_aggregate_results_for_bq, now
        ) | 'write_aggregate' >> beam.io.Write(
            beam.io.BigQuerySink(
                results_table,
                schema=('info_type:STRING,' + BASE_SCHEMA),
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

    if write_per_note_stats_to_gcs:
        _ = (per_note_results | beam.Map(get_binary_token_result)
             | beam.io.WriteToText(
                 posixpath.join(results_dir, 'per-note-results')))

    result = p.run().wait_until_finish()

    logging.info('Eval result: %s', result)
    return []
Example #17
  def test_instrument_example_unbounded_pipeline_to_multiple_read_cache(self):
    """Tests that the instrumenter works for multiple unbounded sources.
    """
    # Create a new interactive environment to make the test idempotent.
    ie.new_env(cache_manager=streaming_cache.StreamingCache(cache_dir=None))

    # Create the pipeline that will be instrumented.
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner())
    source_1 = p_original | 'source1' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    source_2 = p_original | 'source2' >> beam.io.ReadFromPubSub(
        subscription='projects/fake-project/subscriptions/fake_sub')
    # pylint: disable=possibly-unused-variable
    pcoll_1 = source_1 | 'square1' >> beam.Map(lambda x: x * x)
    # pylint: disable=possibly-unused-variable
    pcoll_2 = source_2 | 'square2' >> beam.Map(lambda x: x * x)

    # Mock as if cacheable PCollections are cached.
    ib.watch(locals())

    for name, pcoll in locals().items():
      if not isinstance(pcoll, beam.pvalue.PCollection):
        continue
      cache_key = self.cache_key_of(name, pcoll)
      self._mock_write_cache([b''], cache_key)

    # Instrument the original pipeline to create the pipeline the user will see.
    instrumenter = instr.build_pipeline_instrument(p_original)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=None)

    # Now, build the expected pipeline which replaces the unbounded source with
    # a TestStream.
    source_1_cache_key = self.cache_key_of('source_1', source_1)
    source_2_cache_key = self.cache_key_of('source_2', source_2)
    p_expected = beam.Pipeline()
    test_stream = (
        p_expected
        | TestStream(
            output_tags=[
                self.cache_key_of('source_1', source_1),
                self.cache_key_of('source_2', source_2)
            ]))
    # pylint: disable=expression-not-assigned
    test_stream[source_1_cache_key] | 'square1' >> beam.Map(lambda x: x * x)
    # pylint: disable=expression-not-assigned
    test_stream[source_2_cache_key] | 'square2' >> beam.Map(lambda x: x * x)

    # Test that the TestStream is outputting to the correct PCollection.
    class TestStreamVisitor(PipelineVisitor):
      def __init__(self):
        self.output_tags = set()

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        transform = transform_node.transform
        if isinstance(transform, TestStream):
          self.output_tags = transform.output_tags

    v = TestStreamVisitor()
    actual_pipeline.visit(v)
    expected_output_tags = set([source_1_cache_key, source_2_cache_key])
    actual_output_tags = v.output_tags
    self.assertSetEqual(expected_output_tags, actual_output_tags)

    # Test that the pipeline is as expected.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(),
        instrumenter.instrumented_pipeline_proto())
Example #18
 def test_bad_main_input(self):
   @typehints.with_input_types(str, int)
   def repeat(s, times):
     return s * times
   with self.assertRaises(typehints.TypeCheckError):
     [1, 2, 3] | beam.Map(repeat, 3)
    def testBatching(self):
        computation = tf_metric_wrapper.tf_metric_computations(
            [_CustomMetric(),
             tf.keras.metrics.MeanSquaredError(name='mse')],
            config.EvalConfig(),
            batch_size=2)[0]

        example1 = {
            'labels': [0.0],
            'predictions': [0.0],
            'example_weights': [1.0]
        }
        example2 = {
            'labels': [0.0],
            'predictions': [0.5],
            'example_weights': [1.0]
        }
        example3 = {
            'labels': [1.0],
            'predictions': [0.3],
            'example_weights': [1.0]
        }
        example4 = {
            'labels': [1.0],
            'predictions': [0.9],
            'example_weights': [1.0]
        }
        example5 = {
            'labels': [1.0],
            'predictions': [0.5],
            'example_weights': [0.0]
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4, example5])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'Combine' >> beam.CombinePerKey(computation.combiner))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())

                    custom_key = metric_types.MetricKey(name='custom')
                    mse_key = metric_types.MetricKey(name='mse')
                    self.assertDictElementsAlmostEqual(
                        got_metrics, {
                            custom_key: (0.0 + 0.5 + 0.3 + 0.9 + 0.0) /
                            (1.0 + 1.0 + 1.0 + 1.0 + 0.0),
                            mse_key:
                            0.1875,
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Example #20
    def test_multiple_outputs_with_watermark_advancement(self):
        """Tests that the TestStream can independently control output watermarks."""

        # Purposely set the watermark of numbers to 20 then letters to 5 to test
        # that the watermark advancement is per PCollection.
        #
        # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
        # emitted at different times so that they will have different windows. The
        # watermark advancement is checked by checking their windows. If the
        # watermark does not advance, then the windows will be [-inf, -inf). If the
        # windows do not advance separately, then the PCollections will both be
        # windowed into [15, 30).
        letters_elements = [
            TimestampedValue('a', 6),
            TimestampedValue('b', 7),
            TimestampedValue('c', 8),
        ]
        numbers_elements = [
            TimestampedValue('1', 21),
            TimestampedValue('2', 22),
            TimestampedValue('3', 23),
        ]
        test_stream = (
            TestStream()
            .advance_watermark_to(0, tag='letters')
            .advance_watermark_to(0, tag='numbers')
            .advance_watermark_to(20, tag='numbers')
            .advance_watermark_to(5, tag='letters')
            .add_elements(letters_elements, tag='letters')
            .advance_watermark_to(10, tag='letters')
            .add_elements(numbers_elements, tag='numbers')
            .advance_watermark_to(30, tag='numbers'))

        options = StandardOptions(streaming=True)
        p = TestPipeline(is_integration_test=True, options=options)

        main = p | test_stream

        # Use an AfterWatermark trigger with an early firing to test that the
        # watermark is advancing properly and that the element is being emitted in
        # the correct window.
        letters = (
            main['letters']
            | 'letter windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'letter with key' >> beam.Map(lambda x: ('k', x))
            | 'letter gbk' >> beam.GroupByKey())

        numbers = (
            main['numbers']
            | 'number windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'number with key' >> beam.Map(lambda x: ('k', x))
            | 'number gbk' >> beam.GroupByKey())

        # The letters were emitted when the watermark was at 5, thus we expect to
        # see the elements in the [0, 15) window. We used an early trigger to make
        # sure that the ON_TIME empty pane was also emitted with a TestStream.
        # This pane has no data because the early trigger causes the elements to
        # fire before the end of the window and because the accumulation mode
        # discards any data after the trigger has fired.
        expected_letters = {
            window.IntervalWindow(0, 15): [
                ('k', ['a', 'b', 'c']),
                ('k', []),
            ],
        }

        # Same here, except the numbers were emitted at watermark = 20, thus they
        # are in the [15, 30) window.
        expected_numbers = {
            window.IntervalWindow(15, 30): [
                ('k', ['1', '2', '3']),
                ('k', []),
            ],
        }
        assert_that(letters,
                    equal_to_per_window(expected_letters),
                    label='letters assert per window')
        assert_that(numbers,
                    equal_to_per_window(expected_numbers),
                    label='numbers assert per window')

        p.run()
Example #21
 def expand(self, pcoll):
     return (pcoll
             | beam.Map(lambda info: (info[self.field], info['score']))
             | beam.CombinePerKey(sum_ints))
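The `sum_ints` combiner referenced above is not shown in this snippet; presumably it simply sums the per-key integer scores, something like the sketch below (passing the built-in `sum` to `beam.CombinePerKey` would behave the same way).

def sum_ints(values):
    # Sum an iterable of integer scores for a single key.
    return sum(values)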
Example #22
  def expand(self, pcoll):
    p = pcoll.pipeline
    try:
      step_name = self.label
    except AttributeError:
      step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
      BigQueryBatchFileLoads.COUNT += 1

    temp_location = p.options.view_as(GoogleCloudOptions).temp_location
    job_name = (
        p.options.view_as(GoogleCloudOptions).job_name or 'AUTOMATIC_JOB_NAME')

    empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
    singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

    load_job_name_pcv = pvalue.AsSingleton(
        singleton_pc
        | "LoadJobNamePrefix" >> beam.Map(
            lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))

    schema_mod_job_name_pcv = pvalue.AsSingleton(
        singleton_pc
        | "SchemaModJobNamePrefix" >> beam.Map(
            lambda _: _generate_job_name(
                job_name,
                bigquery_tools.BigQueryJobTypes.LOAD,
                'SCHEMA_MOD_STEP')))

    copy_job_name_pcv = pvalue.AsSingleton(
        singleton_pc
        | "CopyJobNamePrefix" >> beam.Map(
            lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))

    file_prefix_pcv = pvalue.AsSingleton(
        singleton_pc
        | "GenerateFilePrefix" >> beam.Map(
            file_prefix_generator(
                self._validate, self._custom_gcs_temp_location, temp_location)))

    destination_data_kv_pc = (
        pcoll
        | "RewindowIntoGlobal" >> self._window_fn()
        | "AppendDestination" >> beam.ParDo(
            bigquery_tools.AppendDestinationsFn(self.destination),
            *self.table_side_inputs))

    if not self.with_auto_sharding:
      all_destination_file_pairs_pc = self._write_files(
          destination_data_kv_pc, file_prefix_pcv)
    else:
      all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
          destination_data_kv_pc, file_prefix_pcv)

    grouped_files_pc = (
        all_destination_file_pairs_pc
        | "GroupFilesByTableDestinations" >> beam.GroupByKey())

    partitions = (
        grouped_files_pc
        | beam.ParDo(
            PartitionFiles(
                self.max_partition_size,
                self.max_files_per_partition)).with_outputs(
                    PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                    PartitionFiles.SINGLE_PARTITION_TAG))

    multiple_partitions_per_destination_pc = partitions[
        PartitionFiles.MULTIPLE_PARTITIONS_TAG]
    single_partition_per_destination_pc = partitions[
        PartitionFiles.SINGLE_PARTITION_TAG]

    # When using dynamic destinations, elements with a single partition as well
    # as those with multiple partitions are loaded into BigQuery using temporary
    # tables to ensure atomicity.
    if self.dynamic_destinations:
      all_partitions = ((
          multiple_partitions_per_destination_pc,
          single_partition_per_destination_pc)
                        | "FlattenPartitions" >> beam.Flatten())
      destination_load_job_ids_pc, destination_copy_job_ids_pc = (
          self._load_data(all_partitions,
                          empty_pc,
                          load_job_name_pcv,
                          schema_mod_job_name_pcv,
                          copy_job_name_pcv,
                          p,
                          step_name))
    else:
      destination_load_job_ids_pc, destination_copy_job_ids_pc = (
          self._load_data(multiple_partitions_per_destination_pc,
                          single_partition_per_destination_pc,
                          load_job_name_pcv,
                          schema_mod_job_name_pcv,
                          copy_job_name_pcv,
                          p,
                          step_name))

    return {
        self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
        self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
        self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
    }
Example #23
def generate_statistics_from_bq(
    query: Text,
    output_path: Text,
    schema: schema_pb2.Schema,
    stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Computes data statistics from a BigQuery query result.
  
    Args:
      query: The BigQuery query.
      output_path: Output location for the data statistics result. The
        statistics are written as a TFRecord file containing a single data
        statistics proto, which can be read with the 'load_statistics' API.
        If you run this function on Google Cloud, you must specify an
        output_path; specifying None may cause an error.
      schema: A Schema protobuf to use for data validation.
      stats_options: `tfdv.StatsOptions` for generating data statistics.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline runner
        (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.
    Returns:
      A DatasetFeatureStatisticsList proto.
    """

    column_specs = _get_column_specs(query)
    if not validate_bq_types(_get_column_specs(query).values()):
        raise ValueError("Unsupported BigQuery data types.")

    batch_size = (stats_options.desired_batch_size
                  if stats_options.desired_batch_size
                  and stats_options.desired_batch_size > 0 else
                  tfdv.constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter

    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    with beam.Pipeline(options=pipeline_options) as p:
        stats = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            #        | 'DecodeData' >>  DecodeBigQuery(column_specs,
            #                                          desired_batch_size=batch_size)
            | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics())

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))
        _ = (stats
             | 'ValidateStatistics' >> beam.Map(tfdv.validate_statistics,
                                                schema=schema)
             | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                 file_path_prefix=anomalies_output_path,
                 shard_name_template='',
                 append_trailing_newlines=False))

    return tfdv.load_statistics(stats_output_path)
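
A hedged usage sketch for generate_statistics_from_bq; the query, bucket paths, schema file, and Dataflow options below are placeholders, and the statistics end up as a TFRecord under output_path as described in the docstring:

from apache_beam.options.pipeline_options import PipelineOptions
import tensorflow_data_validation as tfdv

# Placeholder inputs; adjust to your own project, dataset, and schema.
bq_query = "SELECT * FROM `my-project.my_dataset.my_table`"
schema = tfdv.load_schema_text("gs://my-bucket/schema.pbtxt")
dataflow_options = PipelineOptions(
    runner="DataflowRunner",
    project="my-project",
    region="us-central1",
    temp_location="gs://my-bucket/tmp")

generate_statistics_from_bq(
    query=bq_query,
    output_path="gs://my-bucket/stats",
    schema=schema,
    pipeline_options=dataflow_options)
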
Example #24
  def _load_data(
      self,
      partitions_using_temp_tables,
      partitions_direct_to_destination,
      load_job_name_pcv,
      schema_mod_job_name_pcv,
      copy_job_name_pcv,
      p,
      step_name):
    """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the actual
         appropriate destination table. This ensures atomicity when some of
         the load jobs fail but others succeed: if any load job fails, the
         copy jobs are not triggered.
    """
    # Load data using temp tables
    trigger_loads_outputs = (
        partitions_using_temp_tables
        | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
            TriggerLoadJobs(
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                test_client=self.test_client,
                temporary_tables=True,
                additional_bq_parameters=self.additional_bq_parameters,
                source_format=self._temp_file_format,
                step_name=step_name,
                load_job_project_id=self.load_job_project_id),
            load_job_name_pcv,
            *self.schema_side_inputs).with_outputs(
                TriggerLoadJobs.TEMP_TABLES, main='main'))

    temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
    temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

    finished_temp_tables_load_jobs_pc = (
        p
        | "ImpulseMonitorLoadJobs" >> beam.Create([None])
        | "WaitForTempTableLoadJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            pvalue.AsList(temp_tables_load_job_ids_pc)))

    schema_mod_job_ids_pc = (
        finished_temp_tables_load_jobs_pc
        | beam.ParDo(
            UpdateDestinationSchema(
                write_disposition=self.write_disposition,
                test_client=self.test_client,
                additional_bq_parameters=self.additional_bq_parameters,
                step_name=step_name,
                load_job_project_id=self.load_job_project_id),
            schema_mod_job_name_pcv))

    finished_schema_mod_jobs_pc = (
        p
        | "ImpulseMonitorSchemaModJobs" >> beam.Create([None])
        | "WaitForSchemaModJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            pvalue.AsList(schema_mod_job_ids_pc)))

    destination_copy_job_ids_pc = (
        finished_temp_tables_load_jobs_pc
        | beam.ParDo(
            TriggerCopyJobs(
                create_disposition=self.create_disposition,
                write_disposition=self.write_disposition,
                test_client=self.test_client,
                step_name=step_name,
                load_job_project_id=self.load_job_project_id),
            copy_job_name_pcv,
            pvalue.AsIter(finished_schema_mod_jobs_pc)))

    finished_copy_jobs_pc = (
        p
        | "ImpulseMonitorCopyJobs" >> beam.Create([None])
        | "WaitForCopyJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            pvalue.AsList(destination_copy_job_ids_pc)))

    _ = (
        p
        | "RemoveTempTables/Impulse" >> beam.Create([None])
        | "RemoveTempTables/PassTables" >> beam.FlatMap(
            lambda _,
            unused_copy_jobs,
            deleting_tables: deleting_tables,
            pvalue.AsIter(finished_copy_jobs_pc),
            pvalue.AsIter(temp_tables_pc))
        | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
        | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
        | "RemoveTempTables/GetTableNames" >> beam.Keys()
        | "RemoveTempTables/Delete" >> beam.ParDo(
            DeleteTablesFn(self.test_client)))

    # Load data directly to destination table
    destination_load_job_ids_pc = (
        partitions_direct_to_destination
        | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
            TriggerLoadJobs(
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                test_client=self.test_client,
                temporary_tables=False,
                additional_bq_parameters=self.additional_bq_parameters,
                source_format=self._temp_file_format,
                step_name=step_name,
                load_job_project_id=self.load_job_project_id),
            load_job_name_pcv,
            *self.schema_side_inputs))

    _ = (
        p
        | "ImpulseMonitorDestinationLoadJobs" >> beam.Create([None])
        | "WaitForDestinationLoadJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            pvalue.AsList(destination_load_job_ids_pc)))

    destination_load_job_ids_pc = (
        (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
        | beam.Flatten())

    return destination_load_job_ids_pc, destination_copy_job_ids_pc
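
The Impulse/Wait steps used repeatedly above (a single-element Create whose ParDo takes the job ids as an AsList side input, so it only fires once every job id is known) are a reusable pattern; a generic, hedged sketch of it, not the actual WaitForBQJobs implementation:

import apache_beam as beam
from apache_beam import pvalue

class WaitForAll(beam.DoFn):
  """Runs once, after the side-input PCollection has been fully computed."""
  def process(self, unused_impulse, items):
    # The AsList side input is only materialized once every upstream
    # element is available, so this step acts as a barrier.
    yield len(items)

def wait_for(p, pcoll_to_wait_on):
  return (
      p
      | "Impulse" >> beam.Create([None])
      | "Wait" >> beam.ParDo(WaitForAll(), pvalue.AsList(pcoll_to_wait_on)))
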
Example #25
###### Beam Pipeline (Transforms) ############

print("Building the Beam pipeline ...")
# Build a Beam pipeline
p1 = beam.Pipeline(options=pipeline_options)

attendance_count = (
    p1
    # timestamp_attribute: message attribute to use as the element timestamp.
    # If None, the message publishing time is used as the timestamp.
    | 'read pub_sub' >> beam.io.ReadFromPubSub(subscription=input_subscription)
    | 'to python dict' >> beam.Map(to_python_dict)
    | 'Filter offline events' >> beam.Filter(
        lambda element: element['venue']['mode'] == 'offline')
    | 'get venue' >> beam.Map(get_venue)
    | 'build_tuple' >> beam.Map(build_tuple)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        "totemic-polygon-279515:dataset.meetup",
        schema="geohash:STRING, mode:STRING, lat:FLOAT, lon:FLOAT")
    # Alternative sink for local debugging:
    # | 'write to text' >> beam.io.WriteToText('ou.txt')
)

print("Running the pipeline ...")
# Run the pipeline
result = p1.run()
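
The helpers to_python_dict, get_venue, and build_tuple are not shown in this example; a minimal sketch of what they might look like, assuming the Pub/Sub payload is UTF-8 JSON with a nested 'venue' record whose field names match the BigQuery schema above (all field names here are assumptions):

import json

def to_python_dict(message):
    # ReadFromPubSub yields bytes by default; decode and parse the JSON payload.
    return json.loads(message.decode("utf-8"))

def get_venue(event):
    # Keep only the nested venue record used downstream.
    return event["venue"]

def build_tuple(venue):
    # Shape the row to match the BigQuery schema used in WriteToBigQuery.
    return {
        "geohash": venue.get("geohash"),  # assumed to be present on the message
        "mode": venue.get("mode"),
        "lat": venue.get("lat"),
        "lon": venue.get("lon"),
    }
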
Example #26
    def _get_page_content(self, pipeline, file_paths, dl_manager):
        """Build PCollection of un-split page content."""

        wet_file_paths = pipeline | "create_wet_files" >> beam.Create(
            file_paths["wet_files"])
        if "wet_urls" in file_paths:

            def download_url(url, downloader, pipeline):
                path = downloader.download(url)
                if not pipeline.is_local():
                    path = downloader.ship_files_with_pipeline(path, pipeline)
                return path

            dl_wet_file_paths = (
                pipeline
                | "create_wet_urls" >> beam.Create(file_paths["wet_urls"])
                | beam.Map(
                    download_url, downloader=dl_manager, pipeline=pipeline))
            wet_file_paths = (wet_file_paths,
                              dl_wet_file_paths) | beam.Flatten()

        # Parse WET files and filter by length.
        # Output: url, text
        page_content = wet_file_paths | beam.FlatMap(
            split_wet_file) | beam.Filter(is_valid_length)

        # Optionally filter for RealNews domains.
        # Output: url, text
        if self.config.realnewslike:
            with open(file_paths["realnews_domains"], "r") as f:
                realnews_domains = json.load(f)
            page_content = page_content | beam.Filter(is_realnews_domain,
                                                      realnews_domains)

        # Normalize and deduplicate by URL.
        # Output: url, text
        page_content = (page_content
                        | "normalize_url" >> beam.Map(normalize_url)
                        | "group_url" >> beam.GroupByKey()
                        | beam.Map(dedupe_urls))

        # Optionally filter for WebText-like URLs.
        # Output: url, text
        if self.config.webtextlike:
            webtextlike_urls = (
                pipeline
                | "read_webtextlike_urls" >> beam.io.ReadFromText(
                    os.path.join(file_paths["openwebtext_urls_zip"],
                                 _OPENWEBTEXT_URLS_FILE_PATTERN))
                | "add_dummy_page" >> beam.Map(lambda x: (x, ""))
                | "normal_webtext_url" >> beam.Map(normalize_url))
            page_content = ({
                "text": page_content,
                "webtextlike_urls": webtextlike_urls
            }
                            | "group_webtextlike_urls" >> beam.CoGroupByKey()
                            | beam.FlatMap(filter_by_webtextlike))

        # Optionally clean pages of badwords, boilerplate text, and duplicate
        # spans of sentences.
        # Output: url, text
        if self.config.clean:
            with open(file_paths["badwords"], "r") as f:
                badwords = [l.strip() for l in f]
            page_content = page_content | "clean_pages" >> beam.FlatMap(
                get_clean_page_fn(badwords))
            page_content = remove_duplicate_text(page_content)

        # Optionally filter out non-`language` pages. We do this after cleaning
        # since it may change the predominant language.
        if self.config.lang != "all":
            page_content |= beam.Filter(is_language, language=self.config.lang)

        return page_content
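
The normalize-and-dedupe step above is an instance of a common Beam pattern: key each record by a canonical URL, group, and keep one value per key. A generic, hedged sketch of that pattern (not the actual normalize_url/dedupe_urls used by this dataset):

import apache_beam as beam

def _canonical_url(url):
    # Toy canonicalization, for illustration only.
    return url.lower().rstrip("/")

def dedupe_by_url(pcoll):
    """Keeps a single (url, text) pair per canonical URL."""
    return (
        pcoll
        | "key_by_canonical_url" >> beam.Map(
            lambda kv: (_canonical_url(kv[0]), kv[1]))
        | "group_by_url" >> beam.GroupByKey()
        # Keep the longest page per URL; any deterministic choice works.
        | "pick_one" >> beam.Map(lambda kv: (kv[0], max(kv[1], key=len))))
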
Example #27
File: nexmark_util.py  Project: xbetox/beam
  def expand(self, pcoll):
    return (
        pcoll
        | 'window' >> beam.WindowInto(window.GlobalWindows())
        | "Count" >> beam.combiners.Count.Globally()
        | "Log" >> beam.Map(log_count_info))
Example #28
  def test_able_to_cache_intermediate_unbounded_source_pcollection(self):
    """Tests being able to cache an intermediate source PCollection.

    In the following pipeline, the source doesn't have a reference and so is
    not automatically cached in the watch() command. This tests that this case
    is taken care of.
    """
    # Create a new interactive environment to make the test idempotent.
    ie.new_env(cache_manager=streaming_cache.StreamingCache(cache_dir=None))

    # Create the pipeline that will be instrumented.
    from apache_beam.options.pipeline_options import StandardOptions
    options = StandardOptions(streaming=True)
    p_original = beam.Pipeline(interactive_runner.InteractiveRunner(), options)

    # pylint: disable=possibly-unused-variable
    source_1 = (
        p_original
        | 'source1' >> beam.io.ReadFromPubSub(
            subscription='projects/fake-project/subscriptions/fake_sub')
        | beam.Map(lambda e: e))

    # Watch but do not cache the PCollections.
    ib.watch(locals())

    # Make sure that sources without a user reference are still cached.
    instr.watch_sources(p_original)

    intermediate_source_pcoll = None
    for watching in ie.current_env().watching():
      watching = list(watching)
      for var, watchable in watching:
        if 'synthetic' in var:
          intermediate_source_pcoll = watchable
          break

    # Instrument the original pipeline to create the pipeline the user will see.
    p_copy = beam.Pipeline.from_runner_api(
        p_original.to_runner_api(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)
    instrumenter = instr.build_pipeline_instrument(p_copy)
    actual_pipeline = beam.Pipeline.from_runner_api(
        proto=instrumenter.instrumented_pipeline_proto(),
        runner=interactive_runner.InteractiveRunner(),
        options=options)

    # Now, build the expected pipeline which replaces the unbounded source with
    # a TestStream.
    intermediate_source_pcoll_cache_key = self.cache_key_of(
        'synthetic_var_' + str(id(intermediate_source_pcoll)),
        intermediate_source_pcoll)
    p_expected = beam.Pipeline()

    test_stream = (
        p_expected
        | TestStream(output_tags=[intermediate_source_pcoll_cache_key]))
    # pylint: disable=expression-not-assigned
    (
        test_stream
        | 'square1' >> beam.Map(lambda e: e)
        | 'reify' >> beam.Map(lambda _: _)
        | cache.WriteCache(ie.current_env().cache_manager(), 'unused'))

    # Test that the TestStream is outputting to the correct PCollection.
    class TestStreamVisitor(PipelineVisitor):
      def __init__(self):
        self.output_tags = set()

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        transform = transform_node.transform
        if isinstance(transform, TestStream):
          self.output_tags = transform.output_tags

    v = TestStreamVisitor()
    actual_pipeline.visit(v)
    expected_output_tags = set([intermediate_source_pcoll_cache_key])
    actual_output_tags = v.output_tags
    self.assertSetEqual(expected_output_tags, actual_output_tags)

    # Test that the pipeline is as expected.
    assert_pipeline_proto_equal(
        self,
        p_expected.to_runner_api(use_fake_coders=True),
        instrumenter.instrumented_pipeline_proto())
Example #29
def BatchedInputsToExtracts(  # pylint: disable=invalid-name
    batched_inputs: beam.pvalue.PCollection):
  """Converts Arrow RecordBatch inputs to Extracts."""
  return (batched_inputs
          | 'AddArrowRecordBatchKey' >>
          beam.Map(lambda x: {constants.ARROW_RECORD_BATCH_KEY: x}))


def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')

    opts, pipeline_args = parser.parse_known_args()

    options = PipelineOptions(pipeline_args, save_main_session=True)

    if pipeline_args:
        options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
            'no-shuffle-pipeline-', time.time_ns())
    else:
        options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
            'shuffle-pipeline-', time.time_ns())

    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [{
            "name": "platform",
            "type": "STRING"
        }, {
            "name": "dep_count",
            "type": "INTEGER"
        }]
    }

    input_table = 'bigquery-public-data:libraries_io.dependencies'
    output_table = f"{opts.project}:dataflow_demos.shuffle_demo"

    p = beam.Pipeline(options=options)

    (p | 'ReadFromBQ' >> beam.io.ReadFromBigQuery(table=input_table)
     | 'ExtractPlatform' >> beam.FlatMap(extract_platform)
     | 'CountPerPlatform' >> CountPerPlatform()
     | 'ToDict' >> beam.Map(to_dict)
     | 'WriteToBQ' >> beam.io.WriteToBigQuery(
         output_table,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
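
The extract_platform, CountPerPlatform, and to_dict steps used above are not defined in this snippet; one way they might look, assuming each row from the public libraries_io.dependencies table carries a 'platform' field (names and fields are assumptions):

import apache_beam as beam

def extract_platform(row):
    # Emit the platform name once per dependency row, if present.
    platform = row.get('platform')
    if platform:
        yield platform

class CountPerPlatform(beam.PTransform):
    """Counts dependency rows per platform."""
    def expand(self, pcoll):
        return pcoll | beam.combiners.Count.PerElement()

def to_dict(platform_count):
    # Shape the (platform, count) pair to match table_schema above.
    platform, dep_count = platform_count
    return {'platform': platform, 'dep_count': dep_count}
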