コード例 #1
0
    def testPredictExtractorWithSequentialKerasModel(self):
        """Runs BatchedPredictExtractor against an exported Sequential model.

        A single-layer Keras ``Sequential`` model is fitted for one step,
        saved with ``save_format='tf'`` and then evaluated through the batched
        input and predict extractors. The examples carry the feature under
        the layer name ('test') rather than the model input name
        ('test_input'), plus a feature the model does not consume. The test
        only checks that exactly one batched extract is produced and that it
        contains the batched predictions key.
        """
        # Note that the input will be called 'test_input'
        model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(1,
                                  activation=tf.nn.sigmoid,
                                  input_shape=(2, ),
                                  name='test')
        ])
        # `learning_rate` is the supported argument name; `lr` is only a
        # deprecated alias of it.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        train_features = {'test_input': [[0.0, 0.0], [1.0, 1.0]]}
        labels = [[1], [0]]
        example_weights = [1.0, 0.5]
        dataset = tf.data.Dataset.from_tensor_slices(
            (train_features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)

        export_dir = self._getExportDir()
        model.save(export_dir, save_format='tf')

        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        schema = text_format.Parse(
            """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "test"
              value {
                dense_tensor {
                  column_name: "test"
                  shape { dim { size: 2 } }
                }
              }
            }
          }
        }
        feature {
          name: "test"
          type: FLOAT
        }
        feature {
          name: "non_model_feature"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        input_extractor = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=tensor_adapter_config)

        # Notice that the features are 'test' but the model expects 'test_input'.
        # This tests that the PredictExtractor properly handles this case.
        examples = [
            self._makeExample(
                test=[0.0,
                      0.0], non_model_feature=0),  # should be ignored by model
            self._makeExample(
                test=[1.0,
                      1.0], non_model_feature=1),  # should be ignored by model
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | input_extractor.stage_name >> input_extractor.ptransform
                | predict_extractor.stage_name >> predict_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                # Both examples are batched together (batch_size=2), so a
                # single extract is expected.
                try:
                    self.assertLen(got, 1)
                    # We can't verify the actual predictions, but we can verify the keys.
                    for item in got:
                        self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
コード例 #2
0
def TEST_BeamPipeline(graph_name, output_names, feed_dicts):
    """Execute the Beam Pipeline.

    Partitions the hard-coded graphs below, feeds `feed_dicts` through
    `ExecuteOneGraph` for `graph_name`, and writes the selected outputs to
    text files.

    Args:
        graph_name: the graph to execute
        output_names: the names of the outputs inside the graph to save
        feed_dicts: a list of {graph_name: {placeholder_input: value}}

    Returns:
        None. Save the result into files prefixed with './beam_experiment'.
    """
    op_to_filename = {
        'main': './graphdefs/main_graph.pb',
        'remote_op_a': './graphdefs/graph_a.pb',
        'remote_op_b': './graphdefs/graph_b.pb',
    }
    op_to_outputs = {
        'main': ['Add'],
        'remote_op_b': ['Add_1'],
        'remote_op_a': ['embedding_lookup/Identity'],
    }

    test = GraphPartition(op_to_filename, op_to_outputs)
    test.partition()

    # Define your feed_dict again.
    #
    # These relations are stored inside PyFunc, but we don't have the access.
    # {graph_name: {remote op name: {placeholder name inside subgraph: input name}}}
    op_to_remote_op_name_mapping = {
        'main': {
            'remote_op_a': {
                'ids_a': 'ids1'
            },
            'remote_op_b': {
                'ids_b1': 'ids1',
                'ids_b2': 'ids2'
            },
            'remote_op_a_1': {
                'ids_a': 'FloorMod'
            }
        },
        'remote_op_b': {
            'remote_op_a': {
                'ids_a': 'FloorMod'
            },
            'remote_op_a_1': {
                'ids_a': 'ids_b2'
            }
        },
    }

    class GetOutput(beam.DoFn):
        """Selects the requested outputs of one graph from an element."""

        def process(self, element, graph_name, output_names):
            yield [
                element[graph_name][output_name]
                for output_name in output_names
            ]

    options = beam.options.pipeline_options.PipelineOptions()
    # Leaving the `with` block runs the pipeline and waits for it to finish.
    # Calling p.run() explicitly as well would execute the whole pipeline a
    # second time, so it is intentionally not done here.
    with beam.Pipeline(options=options) as p:

        inputs = p | 'read' >> beam.Create(feed_dicts)

        outputs = inputs | 'Graph' >> ExecuteOneGraph(
            test.op_to_execution_bundles, op_to_remote_op_name_mapping,
            graph_name)

        outputs = outputs | beam.ParDo(GetOutput(), graph_name, output_names)
        _ = outputs | 'output' >> beam.io.WriteToText('./beam_experiment')
コード例 #3
0
# !head -n 20 data/wordcount-00000-of-00001

# !rm -Rf data/wordco*

# !ls -la output

p2 = beam.Pipeline()

# Python literal syntax reminder:
# list  = []
# tuple = ()
# dict  = {}   (a non-empty set literal also uses braces, e.g. {1, 2})

# Create an in-memory PCollection of four strings and write it to text files
# prefixed with 'data/outCreate1'.
lines = (
    p2
    | beam.Create(['Using create transform ',
                   'to generate in memory data ',
                   'This is the 3rd line ',
                   'Thanks'])
    | beam.io.WriteToText('data/outCreate1')
)

# Wait for the run to finish so the output files are complete before any
# following step inspects them.
p2.run().wait_until_finish()

# visualize output
# !head -n 20 data/outCreate1-00000-of-00001
# !cat data/both-00000-of-00001

p3 = beam.Pipeline()

lines3 = (
    p3
コード例 #4
0
ファイル: cached_impl_test.py プロジェクト: zakiali/transform
  def test_single_phase_run_twice(self):
    """Analyzes the same two spans twice, reusing the first run's cache.

    The first `AnalyzeDatasetWithCache` starts from an empty cache and its
    cache output is written to `self._cache_dir`. The second analysis is fed
    that cache output. Both resulting transform functions are applied to the
    second span and must yield the same transformed data, and the second
    analysis is expected to produce no new cache entries.
    """

    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):

      _ = tft.vocabulary(inputs['s'])

      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      }

    input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.FixedLenFeature([], tf.float32),
            'y': tf.FixedLenFeature([], tf.float32),
            's': tf.FixedLenFeature([], tf.string),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'a',
        }, {
            'x': 4,
            'y': -4,
            's': 'a',
        }],
        span_1_key: input_data,
    }
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      with beam.Pipeline() as p:

        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))

        # This is needed due to b/123895600.
        for a, b in six.iteritems(input_data_dict):
          input_data_dict[a] = p | a >> beam.Create(b)

        transform_fn, cache_output = (
            (flat_data, input_data_dict, {}, input_metadata)
            | 'Analyze' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)

        transformed_dataset = (((input_data_dict[span_1_key], input_metadata),
                                transform_fn)
                               | 'Transform' >> beam_impl.TransformDataset())

        transformed_data, unused_transformed_metadata = transformed_dataset

        # The analyzers see all four rows (x: -2, 4, 12, 10; y: 1, -4, 1, 1),
        # and the result is broadcast onto the two rows of the second span.
        expected_transformed_data = [
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
            },
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
            },
        ]
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='first')

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

        for key in input_data_dict:
          self.assertIn(key, cache_output)
          self.assertEqual(6, len(cache_output[key]))

        transform_fn, second_output_cache = (
            (flat_data, input_data_dict, cache_output, input_metadata)
            | 'AnalyzeAgain' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        transformed_dataset = (
            ((input_data_dict[span_1_key], input_metadata), transform_fn)
            | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset
        # This assertion must be attached while the pipeline is still being
        # built: anything applied after the `with beam.Pipeline()` block has
        # exited is never executed.
        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

    self.assertFalse(second_output_cache)
コード例 #5
0
        g_idx_dict['vwidth'] = int(lines[2].strip())
        g_idx_dict['id'] = lines[3].strip()
        f_sfd = open(os.path.join(cur_font_path,
                                  '{}_{:02d}.sfd'.format(font_id, char_id)),
                     mode='rb')
        g_idx_dict['sfd'] = f_sfd.read()
        g_idx_dict['binary_fp'] = lines[4].strip()
        # print(g_idx_dict)
        glyph_list.append(g_idx_dict)
        char_des.close()
        f_sfd.close()

# Persist the collected glyph records with pickle; the commented-out branch
# below shows the matching load from the same path.
with open(glyph_list_path, 'wb') as f:
    pickle.dump(glyph_list, f)
print("Processed all font files")
# else:
#     with open(glyph_list_path, 'rb') as f:
#         glyph_list = pickle.load(f)
#     print("Loaded processed font files")
#
print('Submitting to beam ...')

# Write every glyph record to Parquet files under the configured prefix.
# The pipeline runs when the `with` block exits.
with beam.Pipeline() as p:
    records = p | 'Read' >> beam.Create(glyph_list)
    # NOTE(review): 'sfd' is declared as a string column, while the value is
    # read from a file opened in binary mode earlier in this script --
    # confirm that string (rather than binary) is the intended column type.
    _ = records | 'Write' >> beam.io.WriteToParquet(
        target_beam_parquetio_file_prefix,
        pyarrow.schema([('uni', pyarrow.int64()), ('width', pyarrow.int64()),
                        ('vwidth', pyarrow.int64()), ('sfd', pyarrow.string()),
                        ('id', pyarrow.string()),
                        ('binary_fp', pyarrow.string())]))
コード例 #6
0
  def testSquaredPearsonCorrelationWithWeights(self):
    """Checks the example-weighted squared Pearson correlation on 4 examples.

    Four single-value examples with weights 1..4 are combined under the
    empty slice key, and the resulting metric is compared with the
    hand-computed value 0.06667.
    """
    metric = squared_pearson_correlation.SquaredPearsonCorrelation(
    ).computations(example_weighted=True)[0]

    # One (label, prediction, example_weight) triple per example.
    triples = [
        (1.0, 1.0, 1.0),
        (4.0, 2.0, 2.0),
        (3.0, 3.0, 3.0),
        (3.0, 4.0, 4.0),
    ]
    examples = [{
        'labels': np.array([label]),
        'predictions': np.array([prediction]),
        'example_weights': np.array([weight]),
    } for label, prediction, weight in triples]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      created = pipeline | 'Create' >> beam.Create(examples)
      standardized = created | 'Process' >> beam.Map(
          metric_util.to_standard_metric_inputs)
      sliced = standardized | 'AddSlice' >> beam.Map(lambda x: ((), x))
      result = sliced | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner)
      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          slice_key, metrics = got[0]
          self.assertEqual(slice_key, ())
          # Weighted sums (weight x value), with weights 1, 2, 3, 4:
          #   pred_x_labels = 1x1x1 + 2x2x4 + 3x3x3 + 4x4x3 = 92
          #   labels        = 1x1 + 2x4 + 3x3 + 4x3         = 30
          #   preds         = 1x1 + 2x2 + 3x3 + 4x4         = 30
          #   sq_labels     = 1x1x1 + 2x4x4 + 3x3x3 + 4x3x3 = 96
          #   sq_preds      = 1x1x1 + 2x2x2 + 3x3x3 + 4x4x4 = 100
          #   examples      = 1 + 2 + 3 + 4                 = 10
          #
          # r^2 = (92 - 30 * 30 / 10)^2
          #       / ((100 - 30^2 / 10) * (96 - 30^2 / 10))
          #     = 4 / (10 * 6) = 0.06667
          expected = {metric.keys[0]: 0.06667}
          self.assertDictElementsAlmostEqual(metrics, expected, places=5)

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
コード例 #7
0
  def expand(self, pcoll):
    """Builds the file-based load graph for `pcoll`.

    The input is re-windowed, tagged with its destination, written to files,
    grouped per destination and split by `PartitionFiles` into the
    multiple-partition and single-partition outputs, which are then handed
    to `self._load_data`.

    Returns:
      A dict with the load-job id pairs, the destination/file pairs and the
      copy-job id pairs, keyed by `self.DESTINATION_JOBID_PAIRS`,
      `self.DESTINATION_FILE_PAIRS` and `self.DESTINATION_COPY_JOBID_PAIRS`.
    """
    p = pcoll.pipeline
    temp_location = p.options.view_as(GoogleCloudOptions).temp_location

    empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
    singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

    # Side inputs derived from the one-element PCollection.
    load_job_name_pc = singleton_pc | beam.Map(
        lambda _: _generate_load_job_name())
    load_job_name_pcv = pvalue.AsSingleton(load_job_name_pc)

    make_file_prefix = file_prefix_generator(
        self._validate, self._custom_gcs_temp_location, temp_location)
    file_prefix_pc = singleton_pc | "GenerateFilePrefix" >> beam.Map(
        make_file_prefix)
    file_prefix_pcv = pvalue.AsSingleton(file_prefix_pc)

    rewindowed_pc = pcoll | "RewindowIntoGlobal" >> self._window_fn()
    destination_data_kv_pc = rewindowed_pc | "AppendDestination" >> beam.ParDo(
        bigquery_tools.AppendDestinationsFn(self.destination),
        *self.table_side_inputs)

    all_destination_file_pairs_pc = self._write_files(
        destination_data_kv_pc, file_prefix_pcv)

    grouped_files_pc = (
        all_destination_file_pairs_pc
        | "GroupFilesByTableDestinations" >> beam.GroupByKey())

    partition_fn = PartitionFiles(
        self.max_partition_size, self.max_files_per_partition)
    partitions = grouped_files_pc | beam.ParDo(partition_fn).with_outputs(
        PartitionFiles.MULTIPLE_PARTITIONS_TAG,
        PartitionFiles.SINGLE_PARTITION_TAG)

    multiple_partitions_per_destination_pc = partitions[
        PartitionFiles.MULTIPLE_PARTITIONS_TAG]
    single_partition_per_destination_pc = partitions[
        PartitionFiles.SINGLE_PARTITION_TAG]

    # When using dynamic destinations, elements with both single as well as
    # multiple partitions are loaded into BigQuery using temporary tables to
    # ensure atomicity; in that case all partitions go into the first
    # argument of _load_data and the second one is left empty.
    if self.dynamic_destinations:
      first_load_input_pc = (
          (multiple_partitions_per_destination_pc,
           single_partition_per_destination_pc)
          | "FlattenPartitions" >> beam.Flatten())
      second_load_input_pc = empty_pc
    else:
      first_load_input_pc = multiple_partitions_per_destination_pc
      second_load_input_pc = single_partition_per_destination_pc

    load_results = self._load_data(
        first_load_input_pc, second_load_input_pc, load_job_name_pcv,
        singleton_pc)
    destination_load_job_ids_pc, destination_copy_job_ids_pc = load_results

    return {
        self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
        self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
        self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
    }
コード例 #8
0
    def test_big_query_write_new_types(self):
        """Writes one full row plus one single-column row per field.

        The table is created from an explicit schema covering FLOAT, NUMERIC,
        BYTES, DATE, TIME, DATETIME, TIMESTAMP and GEOGRAPHY columns, and the
        result matcher expects the values back as the Python objects listed
        in `expected_row`.
        """
        table_id = '{}.{}'.format(self.dataset_id, 'python_new_types_table')

        row_data = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        # The full row first, then rows with only one key value pair (all
        # other keys are absent and come back as None).
        input_data = [row_data] + [
            {column: cell} for column, cell in row_data.items()
        ]

        column_types = [
            ("float", "FLOAT"),
            ("numeric", "NUMERIC"),
            ("bytes", "BYTES"),
            ("date", "DATE"),
            ("time", "TIME"),
            ("datetime", "DATETIME"),
            ("timestamp", "TIMESTAMP"),
            ("geo", "GEOGRAPHY"),
        ]
        table_schema = {
            "fields": [{
                "name": column,
                "type": bq_type
            } for column, bq_type in column_types]
        }

        expected_row = (
            0.33,
            Decimal('10'),
            b'\xab\xac',
            datetime.date(3000, 12, 31),
            datetime.time(23, 59, 59),
            datetime.datetime(2018, 12, 31, 12, 44, 31),
            datetime.datetime(
                2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc),
            'POINT(30 10)',
        )

        # Mirror of input_data: the full row, then one row per column holding
        # that column's value and None everywhere else.
        width = len(expected_row)
        expected_data = [expected_row]
        expected_data.extend(
            (None, ) * position + (cell, ) + (None, ) * (width - position - 1)
            for position, cell in enumerate(expected_row))

        query = ('SELECT float, numeric, bytes, date, time, datetime,'
                 'timestamp, geo FROM %s' % table_id)
        matcher = BigqueryFullResultMatcher(
            project=self.project, query=query, data=expected_data)

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(matcher))

        with beam.Pipeline(argv=args) as p:
            rows = p | 'create' >> beam.Create(input_data)
            _ = rows | 'write' >> beam.io.WriteToBigQuery(
                table_id,
                schema=table_schema,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)
コード例 #9
0
    def test_big_query_write_without_schema(self):
        """Appends rows to a pre-created table without passing a schema.

        The table is created up front via `self.create_table`; the write uses
        WRITE_APPEND with JSON temp files, and the matcher expects the bytes,
        date and time columns back as raw bytes, `datetime.date` and
        `datetime.time` values.
        """
        table_name = 'python_no_schema_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        # (raw bytes, date string, time string) for each row to write.
        raw_rows = [
            (b'xyw', '2011-01-01', '23:59:59.999999'),
            (b'abc', '2000-01-01', '00:00:00'),
            (b'\xe4\xbd\xa0\xe5\xa5\xbd', '3000-12-31', '23:59:59'),
            (b'\xab\xac\xad', '2000-01-01', '00:00:00'),
        ]
        # bigquery io expects bytes to be base64 encoded values
        input_data = [{
            'bytes': base64.b64encode(payload),
            'date': date_text,
            'time': time_text
        } for payload, date_text, time_text in raw_rows]

        expected_rows = [
            (b'xyw', datetime.date(2011, 1, 1),
             datetime.time(23, 59, 59, 999999)),
            (b'abc', datetime.date(2000, 1, 1), datetime.time(0, 0, 0)),
            (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
             datetime.time(23, 59, 59)),
            (b'\xab\xac\xad', datetime.date(2000, 1, 1),
             datetime.time(0, 0, 0)),
        ]
        matcher = BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=expected_rows)

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(matcher))

        with beam.Pipeline(argv=args) as p:
            rows = p | 'create' >> beam.Create(input_data)
            _ = rows | 'write' >> beam.io.WriteToBigQuery(
                table_id,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                temp_file_format=FileFormat.JSON)
コード例 #10
0
    def _build_pcollection(self, pipeline, filepaths, language):
        """Build PCollection of examples in the raw (text) form.

        Args:
            pipeline: Beam pipeline the transforms are attached to.
            filepaths: paths of the bz2-compressed WikiMedia XML files.
            language: used as the namespace of the Beam metric counters.

        Returns:
            The PCollection produced by the final `FlatMap`; its elements are
            `(id, {"title": ..., "text": ...})` pairs.
        """
        def _extract_content(filepath):
            """Extracts article content from a single WikiMedia XML file."""
            logging.info("generating examples from = %s", filepath)
            # Both the raw file and the decompressor are context-managed so
            # that each is closed when the generator finishes or is dropped.
            with open(filepath, "rb") as raw_file, \
                    bz2.BZ2File(filename=raw_file) as bz2_file:
                if six.PY3:
                    # Workaround due to:
                    # https://github.com/tensorflow/tensorflow/issues/33563
                    utf_f = codecs.getreader("utf-8")(bz2_file)
                else:
                    utf_f = bz2_file

                # To clear root, to free-up more memory than just `elem.clear()`.
                context = etree.iterparse(utf_f, events=("end", ))
                context = iter(context)
                unused_event, root = next(context)
                for unused_event, elem in context:
                    if not elem.tag.endswith("page"):
                        continue
                    # Everything before the trailing "page" is the XML
                    # namespace prefix of the tag.
                    namespace = elem.tag[:-4]
                    title = elem.find("./{0}title".format(namespace)).text
                    ns = elem.find("./{0}ns".format(namespace)).text
                    id_ = elem.find("./{0}id".format(namespace)).text

                    # Filter pages that are not in the "main" namespace.
                    if ns != "0":
                        root.clear()
                        continue

                    raw_content = elem.find(
                        "./{0}revision/{0}text".format(namespace)).text
                    root.clear()

                    # Filter redirects.
                    if raw_content is None or raw_content.lower().startswith(
                            "#redirect"):
                        beam.metrics.Metrics.counter(
                            language, "filtered-redirects").inc()
                        continue

                    beam.metrics.Metrics.counter(language,
                                                 "extracted-examples").inc()
                    yield (id_, title, raw_content)

        def _clean_content(inputs):
            """Cleans raw wikicode to extract text."""
            id_, title, raw_content = inputs
            try:
                text = _parse_and_clean_wikicode(raw_content)
            except mwparserfromhell.parser.ParserError as e:
                beam.metrics.Metrics.counter(language, "parser-error").inc()
                logging.error("mwparserfromhell ParseError: %s", e)
                return

            # Pages that clean down to nothing are counted and dropped.
            if not text:
                beam.metrics.Metrics.counter(language,
                                             "empty-clean-examples").inc()
                return

            beam.metrics.Metrics.counter(language, "cleaned-examples").inc()

            yield id_, {"title": title, "text": text}

        return (pipeline
                | beam.Create(filepaths)
                | beam.FlatMap(_extract_content)
                | beam.transforms.Reshuffle()
                | beam.FlatMap(_clean_content))
コード例 #11
0
    def test_big_query_write(self):
        """Writes four (number, str) rows and checks them via a full query.

        Two of the strings are non-ASCII. The table is created if needed from
        an explicit INTEGER/STRING schema and must be empty before the write.
        """
        table_id = '{}.{}'.format(self.dataset_id, 'python_write_table')

        # (number, str) pairs; used for both the input dicts and the
        # expected query result.
        rows = [
            (1, 'abc'),
            (2, 'def'),
            (3, u'你好'),
            (4, u'привет'),
        ]
        input_data = [{'number': number, 'str': text} for number, text in rows]

        table_schema = {
            "fields": [
                {"name": "number", "type": "INTEGER"},
                {"name": "str", "type": "STRING"},
            ]
        }

        matcher = BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=list(rows))

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(matcher))

        with beam.Pipeline(argv=args) as p:
            created = p | 'create' >> beam.Create(input_data)
            _ = created | 'write' >> beam.io.WriteToBigQuery(
                table_id,
                schema=table_schema,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)
コード例 #12
0
ファイル: task.py プロジェクト: zhoufek/beam
#   distributed with this work for additional information
#   regarding copyright ownership.  The ASF licenses this file
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

# beam-playground:
#   name: HelloBeam
#   description: Task from katas to create a simple pipeline that takes a hardcoded input element "Hello Beam".
#   multifile: false
#   context_line: 30
#   categories:
#     - Testing
#     - Quickstart

import apache_beam as beam

from log_elements import LogElements

# Create a single hard-coded element and hand it to LogElements.
with beam.Pipeline() as p:
    greeting = p | beam.Create(['Hello Beam'])
    greeting | LogElements()
コード例 #13
0
    def testBatchSizeLimit(self):
        """Checks that four examples produce four extracts with predictions.

        The model is exported by ``simple_batch_size_limited_classifier`` and
        the serialized examples are read with ``batch_size=1``. Only the
        presence of the batched predictions key is verified, not the values.
        """
        # Only the second element of the returned pair (the export dir) is used.
        scratch_dir = self._getExportDir()
        _, model_dir = (
            batch_size_limited_classifier.simple_batch_size_limited_classifier(
                None, scratch_dir))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=model_dir, tags=[tf.saved_model.SERVING])
        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])

        schema_text = """
        feature {
          name: "classes"
          type: BYTES
        }
        feature {
          name: "scores"
          type: FLOAT
        }
        feature {
          name: "labels"
          type: BYTES
        }
        """
        schema = text_format.Parse(schema_text, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        input_extractor = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=adapter_config)

        # Four identical examples.
        examples = [
            self._makeExample(classes='first', scores=0.0, labels='third')
            for _ in range(4)
        ]

        with beam.Pipeline() as pipeline:
            serialized = [example.SerializeToString() for example in examples]
            batched_inputs = (
                pipeline
                | 'Create' >> beam.Create(serialized, reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1))
            input_extracts = (
                batched_inputs
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | input_extractor.stage_name >> input_extractor.ptransform)
            predict_extracts = (
                input_extracts
                | predict_extractor.stage_name >> predict_extractor.ptransform)

            def check_result(extracts):
                # Re-raise assertion failures as BeamAssertException.
                try:
                    self.assertLen(extracts, 4)
                    # The predictions themselves cannot be verified, only the
                    # presence of the key in every extract.
                    for extract in extracts:
                        self.assertIn(constants.BATCHED_PREDICTIONS_KEY,
                                      extract)
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result, label='result')
コード例 #14
0
    def testBatchSizeLimitWithKerasModel(self):
        """Runs a two-input Keras model through the batched predict extractor.

        Both inputs are declared with ``batch_size=1`` and the examples are
        read with ``batch_size=1``. Four examples are fed in and four extracts,
        each carrying the batched predictions key, are expected. The prediction
        values themselves are not checked.
        """
        input1 = tf.keras.layers.Input(shape=(1, ),
                                       batch_size=1,
                                       name='input1')
        input2 = tf.keras.layers.Input(shape=(1, ),
                                       batch_size=1,
                                       name='input2')

        inputs = [input1, input2]
        input_layer = tf.keras.layers.concatenate(inputs)

        def add_1(tensor):
            # The constant has a fixed (1, 2) shape.
            # NOTE(review): presumably this is what restricts the model to
            # batches of size one -- confirm.
            return tf.add_n([tensor, tf.constant(1.0, shape=(1, 2))])

        output_layer = tf.keras.layers.Lambda(add_1)(input_layer)

        model = tf.keras.models.Model(inputs, output_layer)
        # `learning_rate` replaces the deprecated `lr` alias.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        export_dir = self._getExportDir()
        model.save(export_dir, save_format='tf')

        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        schema = text_format.Parse(
            """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "input1"
              value {
                dense_tensor {
                  column_name: "input1"
                  shape { dim { size: 1 } }
                }
              }
            }
            tensor_representation {
              key: "input2"
              value {
                dense_tensor {
                  column_name: "input2"
                  shape { dim { size: 1 } }
                }
              }
            }
          }
        }
        feature {
          name: "input1"
          type: FLOAT
        }
        feature {
          name: "input2"
          type: FLOAT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        input_extractor = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=tensor_adapter_config)

        # Four identical examples.
        examples = [
            self._makeExample(input1=0.0, input2=1.0) for _ in range(4)
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            predict_extracts = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | input_extractor.stage_name >> input_extractor.ptransform
                | predict_extractor.stage_name >> predict_extractor.ptransform)

            # pylint: enable=no-value-for-parameter
            def check_result(got):
                # Re-raise assertion failures as BeamAssertException.
                try:
                    self.assertLen(got, 4)
                    # We can't verify the actual predictions, but we can verify the keys.
                    for item in got:
                        self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result, label='result')
コード例 #15
0
#   Licensed to the Apache Software Foundation (ASF) under one
#   or more contributor license agreements.  See the NOTICE file
#   distributed with this work for additional information
#   regarding copyright ownership.  The ASF licenses this file
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import apache_beam as beam

from log_elements import LogElements

# Create the integers 1..10, apply Top.Largest(2) and log the result.
with beam.Pipeline() as p:
    numbers = p | beam.Create(range(1, 11))
    two_largest = numbers | beam.combiners.Top.Largest(2)
    two_largest | LogElements()
コード例 #16
0
    def test_big_query_write_temp_table_append_schema_update(self):
        """Checks schema update options when appending to an existing table.

        Rows are written with FILE_LOADS, WRITE_APPEND and the
        ``ALLOW_FIELD_ADDITION`` schema update option, then verified with a
        query against the table.

        The test is meant to force multiple load jobs and the use of temporary
        tables: ``max_file_size`` is set to 1 byte here. ``_MAXIMUM_SOURCE_URIS``
        is expected to be 1 as well, but it is not set in this method body
        (NOTE(review): presumably patched by a decorator or fixture -- confirm).
        """
        table_name = 'python_append_schema_update'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        # Two rows, with int64 = 1 and int64 = 2.
        row_numbers = range(1, 3)
        input_data = [{
            "int64": num,
            "bool": True,
            "nested_field": {"fruit": "Apple"},
        } for num in row_numbers]

        fruit_field = {"name": "fruit", "type": "STRING", "mode": "NULLABLE"}
        table_schema = {
            "fields": [
                {"name": "int64", "type": "INT64"},
                {"name": "bool", "type": "BOOL"},
                {
                    "name": "nested_field",
                    "type": "RECORD",
                    "mode": "REPEATED",
                    "fields": [fruit_field],
                },
            ]
        }

        # bytes, date and time are not written by this test, so None is
        # expected for them in every row.
        verification_query = """
            SELECT bytes, date, time, int64, bool, fruit 
            FROM %s,
            UNNEST(nested_field) as nested_field
            ORDER BY int64
            """ % table_id
        expected_rows = [(None, None, None, num, True, "Apple")
                         for num in range(1, 3)]
        result_matcher = BigqueryFullResultMatcher(
            project=self.project,
            query=verification_query,
            data=expected_rows)
        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=result_matcher)

        with beam.Pipeline(argv=args) as p:
            rows = p | 'create' >> beam.Create(input_data)
            # pylint: disable=expression-not-assigned
            rows | 'write' >> beam.io.WriteToBigQuery(
                table_id,
                schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                max_file_size=1,  # bytes
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                additional_bq_parameters={
                    'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION']
                })
コード例 #17
0
  def testSquaredPearsonCorrelationWithoutWeights(self):
    """Checks r^2 == 0.4 for four examples that all have weight 1.0."""
    metric = (
        squared_pearson_correlation.SquaredPearsonCorrelation().computations()
    )[0]

    # (label, prediction) pairs; every example weight is 1.0.
    label_prediction_pairs = [(2.0, 1.0), (1.0, 2.0), (2.0, 3.0), (3.0, 4.0)]
    examples = [{
        'labels': np.array([label]),
        'predictions': np.array([prediction]),
        'example_weights': np.array([1.0]),
    } for label, prediction in label_prediction_pairs]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      standard_inputs = (
          pipeline
          | 'Create' >> beam.Create(examples)
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs))
      result = (
          standard_inputs
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        # Re-raise assertion failures as BeamAssertException.
        try:
          self.assertLen(got, 1)
          slice_key, metrics = got[0]
          self.assertEqual(slice_key, ())
          # Sums over the four examples:
          #   pred * label: 2 + 2 + 6 + 12 = 22
          #   labels:       2 + 1 + 2 + 3  = 8
          #   preds:        1 + 2 + 3 + 4  = 10
          #   labels^2:     4 + 1 + 4 + 9  = 18
          #   preds^2:      1 + 4 + 9 + 16 = 30
          #   examples:     4
          #
          # r^2 = (22 - 8 * 10 / 4)^2 / ((30 - 10^2 / 4) * (18 - 8^2 / 4))
          #     = 4 / (5 * 2) = 0.4
          expected = {metric.keys[0]: 0.4}
          self.assertDictElementsAlmostEqual(metrics, expected, places=5)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
コード例 #18
0
    def run(self, pipeline: pipeline_pb2.Pipeline) -> None:
        """Deploys the given logical pipeline on Beam.

        Every node of the pipeline becomes one ParDo applied to a shared root
        PCollection; the outputs of its upstream nodes are passed to it as
        side inputs.

        Args:
          pipeline: Logical pipeline in IR format.
        """
        # For CLI, while creating or updating pipeline, pipeline_args are
        # extracted and hence we avoid deploying the pipeline.
        if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
            return

        # TODO(b/163003901): Support beam DAG runner args through IR.
        deployment_config = self._extract_deployment_config(pipeline)
        mlmd_connection = metadata.Metadata(
            connection_config=deployment_config.metadata_connection_config)

        runner_labels = {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}
        with telemetry_utils.scoped_labels(runner_labels), \
                beam.Pipeline() as p:
            # Single-element root used to trigger the component DoFns.
            root = p | 'CreateRoot' >> beam.Create([None])

            # Maps a component id to the signal PCollection it produces.
            signal_map = {}
            # The loop relies on the nodes being in topological order (see the
            # assertion below).
            for node in pipeline.nodes:
                # TODO(b/160882349): Support subpipeline
                pipeline_node = node.pipeline_node
                component_id = pipeline_node.node_info.id
                executor_spec = self._extract_executor_spec(
                    deployment_config, component_id)
                custom_driver_spec = self._extract_custom_driver_spec(
                    deployment_config, component_id)

                # Collect the signals of all upstream components.
                upstream_signals = []
                for upstream_id in pipeline_node.upstream_nodes:
                    assert upstream_id in signal_map, (
                        'Components is not in topological order')
                    upstream_signals.append(signal_map[upstream_id])
                logging.info(
                    'Component %s depends on %s.', component_id,
                    [s.producer.full_label for s in upstream_signals])

                # Each signal is an empty PCollection. AsIter ensures component
                # will be triggered after upstream components are finished.
                # LINT.IfChange
                stage_label = 'Run[%s]' % component_id
                node_do_fn = _PipelineNodeAsDoFn(
                    pipeline_node=pipeline_node,
                    mlmd_connection=mlmd_connection,
                    pipeline_info=pipeline.pipeline_info,
                    pipeline_runtime_spec=pipeline.runtime_spec,
                    executor_spec=executor_spec,
                    custom_driver_spec=custom_driver_spec)
                side_inputs = [
                    beam.pvalue.AsIter(s) for s in upstream_signals
                ]
                signal_map[component_id] = (
                    root | stage_label >> beam.ParDo(node_do_fn, *side_inputs))
                # LINT.ThenChange(../beam/beam_dag_runner.py)
                logging.info('Component %s is scheduled.', component_id)
コード例 #19
0
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

# beam-playground:
#   name: CompositeTransform
#   description: Task from katas to implement a composite transform "ExtractAndMultiplyNumbers"
#     that extracts numbers from comma separated line and then multiplies each number by 10.
#   multifile: false
#   pipeline_options:
#   categories:
#     - Flatten

import apache_beam as beam

from log_elements import LogElements


class ExtractAndMultiplyNumbers(beam.PTransform):
    """Splits comma-separated lines into ints and multiplies each by 10."""

    def expand(self, pcoll):
        """Returns the parsed numbers of ``pcoll``, each multiplied by 10."""
        # Every line yields one int per comma-separated token.
        numbers = pcoll | beam.FlatMap(
            lambda line: (int(token) for token in line.split(',')))
        return numbers | beam.Map(lambda num: num * 10)


# Feed two comma-separated lines through the composite transform and log
# the resulting elements.
with beam.Pipeline() as p:
    lines = p | beam.Create(['1,2,3,4,5', '6,7,8,9,10'])
    multiplied = lines | ExtractAndMultiplyNumbers()
    multiplied | LogElements()
コード例 #20
0
  def testMetricsWithoutWeights(self, metric_name, expected_value):
    """Checks a wrapped TF metric on four examples that all have weight 1.0.

    Args:
      metric_name: Name of the metric, resolved with ``_tf_metric_by_name``.
      expected_value: Value the metric is expected to produce.
    """
    # TODO (b/151636380): remove when CL/299961405 is propagated through Kokoro.
    # Without the fix (detected through the private helper attribute) the
    # expected value for this metric is 0.5.
    if (metric_name == 'specificity_at_sensitivity' and
        not hasattr(tf.keras.metrics.SpecificityAtSensitivity,
                    '_find_max_under_constraint')):
      expected_value = 0.5

    computations = tf_metric_wrapper.tf_metric_computations(
        [self._tf_metric_by_name(metric_name)])
    # The first three computations are used as histogram, matrix and metric.
    histogram, matrix, metric = (
        computations[0], computations[1], computations[2])

    # (label, prediction) pairs; every example weight is 1.0.
    label_prediction_pairs = [(0.0, 0.0), (0.0, 0.5), (1.0, 0.3), (1.0, 0.9)]
    examples = [{
        'labels': np.array([label]),
        'predictions': np.array([prediction]),
        'example_weights': np.array([1.0]),
    } for label, prediction in label_prediction_pairs]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      histograms = (
          pipeline
          | 'Create' >> beam.Create(examples)
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner))
      result = (
          histograms
          | 'ComputeConfusionMatrix' >> beam.Map(
              lambda x: (x[0], matrix.result(x[1])))  # pyformat: disable
          | 'ComputeMetric' >> beam.Map(
              lambda x: (x[0], metric.result(x[1]))))  # pyformat: disable

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        # Re-raise assertion failures as BeamAssertException.
        try:
          self.assertLen(got, 1)
          slice_key, metrics = got[0]
          self.assertEqual(slice_key, ())
          expected = {metric_types.MetricKey(name=metric_name): expected_value}
          self.assertDictElementsAlmostEqual(metrics, expected, places=5)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
コード例 #21
0
ファイル: cached_impl_test.py プロジェクト: zakiali/transform
  def test_single_phase_mixed_analyzer_run_once(self):
    """Analyzes two spans with a partial cache, then transforms span-1.

    span-0 comes with pre-populated cache entries, span-1 has none. The
    analysis output is written to the cache dir and the resulting transform
    function is applied to the span-1 data and compared with the expected
    values.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
      # NOTE(review): the hard-coded cache keys below presumably depend on the
      # analyzer names and the order of these calls -- confirm before
      # reordering anything here.

      integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])

      # The bucketize result is discarded and not part of the outputs.
      _ = tft.bucketize(inputs['x'], 2, name='bucketize')

      return {
          'integerized_s':
              integerized_s,
          'x_min':
              tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'x_mean':
              tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
          'y_min':
              tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
          'y_mean':
              tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
      }

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    # This data is used as span-1, and is also the data that gets transformed.
    input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.FixedLenFeature([], tf.float32),
            'y': tf.FixedLenFeature([], tf.float32),
            's': tf.FixedLenFeature([], tf.string),
        }))
    input_data_dict = {
        span_0_key: [{
            'x': -2,
            'y': 1,
            's': 'b',
        }, {
            'x': 4,
            'y': -4,
            's': 'b',
        }],
        span_1_key: input_data,
    }

    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      with beam.Pipeline() as p:

        # All records of both spans, flattened into one PCollection.
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))

        # Cache entries exist only for span-0.
        # NOTE(review): the mean_and_var values look like [count, mean,
        # variance] of the span-0 data, and the 'x-x' / 'y-y' values look like
        # [-min, max] -- confirm the encoding.
        # TODO(b/37788560): Get these names programmatically.
        cache_dict = {
            span_0_key: {
                '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                    p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0]']),
                '__v0__CacheableCombineAccumulate--x-x--':
                    p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
                '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                    p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25]']),
                '__v0__CacheableCombineAccumulate--y-y--':
                    p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
            },
            span_1_key: {},
        }

        transform_fn, cache_output = (
            (flat_data, input_data_dict, cache_dict, input_metadata)
            | 'Analyze' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)

        # Only the span-1 records are transformed.
        transformed_dataset = (((input_data_dict[span_1_key], input_metadata),
                                transform_fn)
                               | 'Transform' >> beam_impl.TransformDataset())

        # Render the analysis graph held in the module-level attribute to a
        # dot file.
        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        transformed_data, unused_transformed_metadata = transformed_dataset

        # Over both spans x is -2, 4, 12, 10 (mean 6.0, min -2.0) and y is
        # 1, -4, 1, 1 (mean -0.25, min -4.0).
        expected_transformed = [
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 0,
            },
            {
                'x_mean': 6.0,
                'x_min': -2.0,
                'y_mean': -0.25,
                'y_min': -4.0,
                'integerized_s': 0,
            },
        ]
        beam_test_util.assert_that(
            transformed_data, beam_test_util.equal_to(expected_transformed))

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
コード例 #22
0
  def testMultiClassMetricsUsingConfusionMatrix(self, metric_name, top_k,
                                                expected_value):
    """Checks a wrapped TF metric with a top_k sub key on weighted examples.

    Args:
      metric_name: Name of the metric, resolved with ``_tf_metric_by_name``.
      top_k: Value used for the ``top_k`` of the metric sub key.
      expected_value: Value the metric is expected to produce.
    """
    computations = tf_metric_wrapper.tf_metric_computations(
        [self._tf_metric_by_name(metric_name)],
        sub_key=metric_types.SubKey(top_k=top_k))
    # The first three computations are used as histogram, matrix and metric.
    histogram, matrix, metric = (
        computations[0], computations[1], computations[2])

    # Weighted confusion counts, one term per example (weight * count):
    #
    # top_k = 2
    #   TP = 0.5*0 + 0.7*1 + 0.9*1 + 0.3*0 = 1.6
    #   FP = 0.5*2 + 0.7*1 + 0.9*1 + 0.3*2 = 3.2
    #   FN = 0.5*1 + 0.7*0 + 0.9*0 + 0.3*1 = 0.8
    #
    # top_k = 3
    #   TP = 0.5*0 + 0.7*1 + 0.9*1 + 0.3*1 = 1.9
    #   FP = 0.5*3 + 0.7*2 + 0.9*2 + 0.3*2 = 5.3
    #   FN = 0.5*1 + 0.7*0 + 0.9*0 + 0.3*0 = 0.5
    #
    # Each row is (label, predictions, example weight).
    example_rows = [
        (2, [0.1, 0.2, 0.1, 0.25, 0.35], 0.5),
        (1, [0.2, 0.3, 0.05, 0.15, 0.3], 0.7),
        (3, [0.01, 0.2, 0.09, 0.5, 0.2], 0.9),
        (1, [0.3, 0.2, 0.05, 0.4, 0.05], 0.3),
    ]
    examples = [{
        'labels': np.array([label]),
        'predictions': np.array(predictions),
        'example_weights': np.array([weight]),
    } for label, predictions, weight in example_rows]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      histograms = (
          pipeline
          | 'Create' >> beam.Create(examples)
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner))
      result = (
          histograms
          | 'ComputeConfusionMatrix' >> beam.Map(
              lambda x: (x[0], matrix.result(x[1])))  # pyformat: disable
          | 'ComputeMetric' >> beam.Map(
              lambda x: (x[0], metric.result(x[1]))))  # pyformat: disable

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        # Re-raise assertion failures as BeamAssertException.
        try:
          self.assertLen(got, 1)
          slice_key, metrics = got[0]
          self.assertEqual(slice_key, ())
          expected_key = metric_types.MetricKey(
              name=metric_name, sub_key=metric_types.SubKey(top_k=top_k))
          self.assertDictElementsAlmostEqual(
              metrics, {expected_key: expected_value}, places=5)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
コード例 #23
0
ファイル: cached_impl_test.py プロジェクト: zakiali/transform
  def test_caching_vocab_for_integer_categorical(self):
    """Checks vocabulary caching for an int64 feature across two spans.

    span-0 comes with a pre-populated vocabulary cache entry, span-1 has none.
    The span-1 records are transformed and compared with the expected
    vocabulary indices.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
      vocab_ids = tft.compute_and_apply_vocabulary(
          inputs['x'], frequency_threshold=2)
      return {'x_vocab': vocab_ids}

    feature_spec = {'x': tf.FixedLenFeature([], tf.int64)}
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    input_data_dict = {
        span_0_key: [{'x': value} for value in (-2, -4, -1, 4)],
        span_1_key: [{'x': value} for value in (-2, -1, 6, 7)],
    }
    # One expected index per span-1 record, in the same order as the inputs.
    expected_transformed_data = [
        {'x_vocab': index} for index in (0, 1, -1, -1)
    ]

    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      with beam.Pipeline() as p:
        # All records of both spans, flattened into one PCollection.
        all_records = list(
            itertools.chain.from_iterable(input_data_dict.values()))
        flat_data = p | 'CreateInputData' >> beam.Create(all_records)

        # TODO(b/37788560): Get these names programmatically.
        span_0_vocab_cache = p | 'CreateB' >> beam.Create(
            [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]'])
        cache_dict = {
            span_0_key: {
                '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--':
                    span_0_vocab_cache,
            },
            span_1_key: {},
        }

        analyze_inputs = (flat_data, input_data_dict, cache_dict,
                          input_metadata)
        transform_fn, cache_output = (
            analyze_inputs
            | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
        _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)

        # Only the span-1 records are transformed.
        span_1_dataset = (input_data_dict[span_1_key], input_metadata)
        transformed_data, _ = (
            (span_1_dataset, transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())

        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='first')
コード例 #24
0
  def testCustomTFMetricWithPadding(self):
    """Checks two custom metrics on examples with labels of differing length."""
    custom_metrics = [
        _CustomMetric(name='custom_label', update_y_pred=False),
        _CustomMetric(name='custom_pred', update_y_pred=True)
    ]
    computation = tf_metric_wrapper.tf_metric_computations(custom_metrics)[0]

    # Expected sums, one line per example (the -1 terms presumably come from
    # padding -- see the test name):
    #
    # label_sum = (1 - 1 - 1 - 1) * 1.0 +
    #             (1 + 2 - 1.0 - 1) * 1.0 +
    #             (1 + 2 + 3 - 1) * 2.0
    #           = 9.0
    #
    # pred_sum = (0.1 + 0.2 + 0.3 + 0.0) * 1.0 +
    #            (0.1 + 0.2 + 0.0 - 1.0) * 1.0 +
    #            (0.1 + 0.2 + 0.3 - 1.0) * 2.0
    #           = -0.9
    #
    # weights_total = (1.0 * 4 + 1.0 * 4 + 2.0 * 4) = 16.0
    #
    # Each row is (labels, predictions, example weight).
    example_rows = [
        ([1], [0.1, 0.2, 0.3, 0.0], 1.0),
        ([1, 2], [0.1, 0.2, 0.0], 1.0),
        ([1, 2, 3], [0.1, 0.2, 0.3], 2.0),
    ]
    examples = [{
        'labels': np.array(labels, dtype=np.int64),
        'predictions': np.array(predictions),
        'example_weights': np.array([weight])
    } for labels, predictions, weight in example_rows]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      sliced_inputs = (
          pipeline
          | 'Create' >> beam.Create(examples)
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x)))
      result = (
          sliced_inputs
          | 'Combine' >> beam.CombinePerKey(computation.combiner))

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        # Re-raise assertion failures as BeamAssertException.
        try:
          self.assertLen(got, 1)
          slice_key, metrics = got[0]
          self.assertEqual(slice_key, ())

          label_key = metric_types.MetricKey(name='custom_label')
          pred_key = metric_types.MetricKey(name='custom_pred')
          expected = {
              label_key: 9.0 / 16.0,
              pred_key: -0.9 / 16.0
          }
          self.assertDictElementsAlmostEqual(metrics, expected)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
コード例 #25
0
def TEST_Subgraph():
    """Runs one partitioned subgraph through Beam and prints both results.

    Partitions the graphs listed in `op_to_filename`, executes the first
    execution bundle of `remote_op_b` inside a Beam pipeline for two feed
    dicts and writes the fetched outputs to `./beam_experiment-*`.  The same
    feeds are then run directly in a TF session on the partitioned graph, and
    both results are printed for manual comparison (nothing is asserted).
    """
    op_to_filename = {
        'main': './graphdefs/main_graph.pb',
        'remote_op_a': './graphdefs/graph_a.pb',
        'remote_op_b': './graphdefs/graph_b.pb',
    }
    op_to_outputs = {
        'main': ['Add'],
        'remote_op_b': ['Add_1'],
        'remote_op_a': ['embedding_lookup/Identity'],
    }

    partition = GraphPartition(op_to_filename, op_to_outputs)
    partition.partition()

    feed_dicts_graph_b = [{
        'remote_op_b': {
            'import/ids_b1:0': 3,
            'import/ids_b2:0': 3
        }
    }, {
        'remote_op_b': {
            'import/ids_b1:0': 10,
            'import/ids_b2:0': 10
        }
    }]

    graph_name = 'remote_op_b'
    output_names = ['import/FloorMod:0']
    bundle = partition.op_to_execution_bundles[graph_name][0]

    class GetOutputs(beam.DoFn):
        """Projects an element down to the list of requested output values."""

        def process(self, element, graph_name, output_names):
            yield [
                element[graph_name][output_name]
                for output_name in output_names
            ]

    options = beam.options.pipeline_options.PipelineOptions()
    # Leaving the `with` block runs the pipeline and waits for completion, so
    # no explicit p.run() is issued here; calling it as well would execute the
    # whole pipeline a second time.
    with beam.Pipeline(options=options) as p:
        inputs = p | 'read' >> beam.Create(feed_dicts_graph_b)

        outputs = inputs | 'Graph' >> beam.ParDo(ExecuteOneSubgraph(), bundle,
                                                 graph_name)

        outputs = outputs | beam.ParDo(GetOutputs(), graph_name, output_names)
        # A single shard is forced so that the file name read back below
        # ('-00000-of-00001') is guaranteed to exist.
        outputs | 'output' >> beam.io.WriteToText('./beam_experiment',
                                                  num_shards=1)

    # Reference run: feed the same inputs straight into the partitioned graph.
    graph = partition.op_to_graph[graph_name]
    result_original_model = []
    for feed_dict in feed_dicts_graph_b:
        with tf.compat.v1.Session(graph=graph) as sess:
            result_original_model.append(
                sess.run(output_names, feed_dict[graph_name]))

    # Read the shard as raw bytes (what `cat` via subprocess used to return)
    # without depending on an external command.
    with open('./beam_experiment-00000-of-00001', 'rb') as shard_file:
        result_beam_pipeline = shard_file.read()

    print('Results from the original model:', result_original_model)
    print('\nResults from the beam pipeline:', result_beam_pipeline)
コード例 #26
0
  def testWithMixedMetrics(self):
    """Checks a mix of AUC, a Keras loss and MSE through the metric wrapper.

    Four unit-weight examples are combined per (empty) slice key.  AUC is
    checked on the histogram -> confusion-matrix -> metric path, while the
    binary cross-entropy loss and MSE are checked on the output of the final
    computation's combiner.
    """
    computations = tf_metric_wrapper.tf_metric_computations([
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.losses.BinaryCrossentropy(name='binary_crossentropy'),
        tf.keras.metrics.MeanSquaredError(name='mse')
    ])

    # The first three computations are chained (combiner, then two `result`
    # callables); the fourth is combined on its own.
    confusion_histogram = computations[0]
    confusion_matrix = computations[1].result
    confusion_metrics = computations[2].result
    non_confusion_metrics = computations[3]

    example1 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.0]),
        'example_weights': np.array([1.0]),
    }
    example2 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.5]),
        'example_weights': np.array([1.0]),
    }
    example3 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.3]),
        'example_weights': np.array([1.0]),
    }
    example4 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.9]),
        'example_weights': np.array([1.0]),
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      sliced_examples = (
          pipeline
          | 'Create' >> beam.Create([example1, example2, example3, example4])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x)))

      confusion_result = (
          sliced_examples
          |
          'ComputeHistogram' >> beam.CombinePerKey(confusion_histogram.combiner)
          | 'ComputeConfusionMatrix' >> beam.Map(
              lambda x: (x[0], confusion_matrix(x[1])))  # pyformat: disable
          | 'ComputeMetric' >> beam.Map(
              lambda x: (x[0], confusion_metrics(x[1]))))  # pyformat: disable

      non_confusion_result = (
          sliced_examples
          | 'Combine' >> beam.CombinePerKey(non_confusion_metrics.combiner))

      # pylint: enable=no-value-for-parameter

      def check_confusion_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          auc_key = metric_types.MetricKey(name='auc')
          # Positives score 0.3 and 0.9, negatives 0.0 and 0.5: three of the
          # four positive/negative pairs are ordered correctly.
          self.assertDictElementsAlmostEqual(
              got_metrics, {auc_key: 0.75}, places=5)

        except AssertionError as err:
          raise util.BeamAssertException(err)

      def check_non_confusion_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          mse_key = metric_types.MetricKey(name='mse')
          binary_crossentropy_key = metric_types.MetricKey(
              name='binary_crossentropy')
          # mse = (0.0 + 0.25 + 0.49 + 0.01) / 4 = 0.1875
          # binary_crossentropy
          #     = (0 - ln(0.5) - ln(0.3) - ln(0.9)) / 4
          #     ~= 0.50062 (the loss is non-zero for these predictions).
          self.assertDictElementsAlmostEqual(
              got_metrics, {
                  mse_key: 0.1875,
                  binary_crossentropy_key: 0.50061995
              },
              places=5)

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(
          confusion_result, check_confusion_result, label='confusion')
      util.assert_that(
          non_confusion_result,
          check_non_confusion_result,
          label='non_confusion')
コード例 #27
0
 def test_assert_that(self):
   """Expects a mismatching `assert_that` to raise when the pipeline runs.

   The pipeline produces ['a', 'b'] while the matcher only accepts ['a'].
   """
   # TODO: figure out a way for runner to parse and raise the
   # underlying exception.
   with self.assertRaises(Exception), self.create_pipeline() as p:
     produced = p | beam.Create(['a', 'b'])
     assert_that(produced, equal_to(['a']))
コード例 #28
0
ファイル: row_coder_test.py プロジェクト: waabiisaabii/beam
 def test_row_coder_in_pipeine(self):
     """Filters `self.PEOPLE` by name in a pipeline and expects `JON_SNOW`.

     Only the element named "Jon Snow" should survive the filter.
     """
     with TestPipeline() as p:
         people = p | beam.Create(self.PEOPLE)
         jon_only = people | beam.Filter(
             lambda person: person.name == "Jon Snow")
         assert_that(jon_only, equal_to([self.JON_SNOW]))
コード例 #29
0
ファイル: impl.py プロジェクト: ourobouros/transform
  def expand(self, tensor_pcoll_mapping):
    """Yields the SavedModel directory with the given tensors made constant.

    Args:
      tensor_pcoll_mapping: A dictionary whose keys identify the tensors to
          replace and whose values are PCollections supplying the
          replacements.  Each supplied element is later unpacked as a
          `(value, is_asset)` pair.  Presumably every PCollection holds one
          element, since the merged pairs are viewed as a dict.

    Returns:
      A PCollection built from the single saved-model directory name.  When
          `tensor_pcoll_mapping` is empty this is the original directory;
          otherwise it is the directory written by the replacement step.
    """
    saved_model_pcoll = (
        self.pipeline
        | 'CreateTransformFn' >> beam.Create([self._saved_model_dir]))

    # Nothing to substitute: hand back the untouched directory.
    if not tensor_pcoll_mapping:
      return saved_model_pcoll

    # Tag every value with its tensor name, then merge the tagged pairs into a
    # dict-shaped side input for the Map at the bottom.
    named_values = [
        pcoll
        | 'AddName[%s]' % name >> beam.Map(lambda x, name=name: (name, x))
        for name, pcoll in six.iteritems(tensor_pcoll_mapping)
    ]
    merged_pairs = named_values | 'MergeTensorValuePairs' >> beam.Flatten()
    tensor_values_view = beam.pvalue.AsDict(merged_pairs)

    def replace_tensors_with_constant_values(saved_model_dir,
                                             tensor_value_mapping):
      """Writes a copy of the SavedModel with tensors swapped for constants.

      Each value is wrapped in `tf.constant()` inside a fresh graph.  The
      output goes to a directory obtained from `_make_unique_temp_dir`, so
      a retry is expected to write to a different location.

      Args:
        saved_model_dir: The SavedModel directory passed on to
          `saved_transform_io.partially_apply_saved_transform`.
        tensor_value_mapping: a dict of tensor names to
          `(value, is_asset)` pairs to use in place of those tensors.

      Returns:
        The directory name containing the updated SavedModel.
      """
      frozen_graph = tf.Graph()
      with frozen_graph.as_default():
        replacements = {}
        for tensor_name, value_and_flag in six.iteritems(
            tensor_value_mapping):
          value, is_asset = value_and_flag
          constant = tf.constant(value)
          if is_asset:
            # Newly frozen constants that hold filenames must also be
            # registered in the ASSET_FILEPATHS collection.
            frozen_graph.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS,
                                           constant)
          replacements[tensor_name] = constant

        with tf.Session(graph=frozen_graph) as session:
          output_dir = _make_unique_temp_dir(self._base_temp_dir)
          applied = saved_transform_io.partially_apply_saved_transform(
              saved_model_dir, {}, replacements)
          input_tensors, output_tensors = applied
          saved_transform_io.write_saved_transform_from_session(
              session, input_tensors, output_tensors, output_dir)
        return output_dir

    replace_step = beam.Map(
        replace_tensors_with_constant_values,
        tensor_value_mapping=tensor_values_view)
    return saved_model_pcoll | 'ReplaceTensorsWithConstantValues' >> replace_step
コード例 #30
0
    def testPredictExtractorWithRegressionModel(self):
        """Runs the batched predict extractor on a fixed-prediction model.

        Three serialized examples are read as a single batch of three, passed
        through the input and predict extractors, and the resulting extract
        is checked for the batched predictions key.
        """
        export_dir, _ = (
            fixed_prediction_estimator_extra_fields
            .simple_fixed_prediction_estimator_extra_fields(
                self._getExportDir(), None))

        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        schema_text = """
        feature {
          name: "prediction"
          type: FLOAT
        }
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "fixed_int"
          type: INT
        }
        feature {
          name: "fixed_float"
          type: FLOAT
        }
        feature {
          name: "fixed_string"
          type: BYTES
        }
        """
        schema = text_format.Parse(schema_text, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())

        input_extractor = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        predict_extractor = (
            batched_predict_extractor_v2.BatchedPredictExtractor(
                eval_config=eval_config,
                eval_shared_model=eval_shared_model,
                tensor_adapter_config=tensor_adapter_config))

        # (prediction, label, fixed_int, fixed_string); fixed_float is always
        # 1.0.
        example_rows = [
            (0.2, 1.0, 1, 'fixed_string1'),
            (0.8, 0.0, 1, 'fixed_string2'),
            (0.5, 0.0, 2, 'fixed_string3'),
        ]
        examples = [
            self._makeExample(prediction=prediction,
                              label=label,
                              fixed_int=fixed_int,
                              fixed_float=1.0,
                              fixed_string=fixed_string)
            for prediction, label, fixed_int, fixed_string in example_rows
        ]

        def check_result(got):
            try:
                self.assertLen(got, 1)
                extracts = got[0]
                self.assertIn(constants.BATCHED_PREDICTIONS_KEY, extracts)
                expected_preds = [0.2, 0.8, 0.5]
                # NOTE(review): assertAlmostEqual is given two sequences
                # here; when they are not exactly equal unittest falls back
                # to `first - second`, which is not defined for lists --
                # confirm an elementwise comparison is not what is wanted.
                self.assertAlmostEqual(
                    extracts[constants.BATCHED_PREDICTIONS_KEY],
                    expected_preds)

            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            serialized_examples = [e.SerializeToString() for e in examples]
            batched = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples,
                                          reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3))
            extracts = (
                batched
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | input_extractor.stage_name >> input_extractor.ptransform)
            result = (
                extracts
                | predict_extractor.stage_name >> predict_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            util.assert_that(result, check_result, label='result')