Example #1
    def _add_metadata(
            self, rows: beam.pvalue.PCollection[Row]
    ) -> beam.pvalue.PCollection[Row]:
        """Add ip metadata to a collection of roundtrip rows.

    Args:
      rows: beam.PCollection[Row]

    Returns:
      PCollection[Row]
      The same rows as above with additional metadata columns added.
    """

        # PCollection[Tuple[DateIpKey,Row]]
        rows_keyed_by_ip_and_date = (
            rows
            | 'key by ips and dates' >> beam.Map(lambda row: (make_date_ip_key(
                row), row)).with_output_types(Tuple[DateIpKey, Row]))

        # PCollection[DateIpKey]
        # pylint: disable=no-value-for-parameter
        ips_and_dates = (rows_keyed_by_ip_and_date
                         | 'get ip and date keys per row' >>
                         beam.Keys().with_output_types(DateIpKey))

        # PCollection[DateIpKey]
        deduped_ips_and_dates = (
            # pylint: disable=no-value-for-parameter
            ips_and_dates
            | 'dedup' >> beam.Distinct().with_output_types(DateIpKey))

        # PCollection[Tuple[date,List[ip]]]
        grouped_ips_by_dates = (
            deduped_ips_and_dates | 'group by date' >>
            beam.GroupByKey().with_output_types(Tuple[str, Iterable[str]]))

        # PCollection[Tuple[DateIpKey,Row]]
        ips_with_metadata = (grouped_ips_by_dates
                             | 'get ip metadata' >> beam.FlatMapTuple(
                                 self._add_ip_metadata).with_output_types(
                                     Tuple[DateIpKey, Row]))

        # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
        grouped_metadata_and_rows = (
            ({
                IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
                ROWS_PCOLLECION_NAME: rows_keyed_by_ip_and_date
            }) | 'group by keys' >> beam.CoGroupByKey())

        # PCollection[Row]
        rows_with_metadata = (
            grouped_metadata_and_rows | 'merge metadata with rows' >>
            beam.FlatMapTuple(merge_metadata_with_rows).with_output_types(Row))

        return rows_with_metadata
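merge_metadata_with_rows is called above (and in several later examples) on the output of beam.CoGroupByKey() via beam.FlatMapTuple, but its body is not part of this listing. Purely as an illustration of the shape such a helper could take - inferred from the call sites here, not the project's actual implementation:

def merge_metadata_with_rows(key, value, field=None):
    # Hypothetical sketch only. `value` is the CoGroupByKey result: a dict of
    # lists keyed by IP_METADATA_PCOLLECTION_NAME and ROWS_PCOLLECION_NAME.
    metadata = {}
    for metadata_row in value[IP_METADATA_PCOLLECTION_NAME]:
        metadata.update(metadata_row)
    for row in value[ROWS_PCOLLECION_NAME]:
        new_row = dict(row)
        # The real helper presumably nests the metadata under `field`
        # (e.g. field='received') when that argument is given.
        new_row.update(metadata)
        yield new_row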
Example #2
def _add_vantage_point_tags(
    rows: beam.pvalue.PCollection[Row],
    ips_with_metadata: beam.pvalue.PCollection[Tuple[DateIpKey, Row]]
) -> beam.pvalue.PCollection[Row]:
    """Add tags for vantage point IPs - resolver name (hostname/control/special) and country

  Args:
      rows: PCollection of measurement rows
      ips_with_metadata: PCollection of dated ips with geo metadata

    Returns:
      PCollection of measurement rows with tag information added to the ip row
  """
    # PCollection[Tuple[DateIpKey,Row]]
    rows_keyed_by_ip_and_date = (
        rows | 'add vp tags: key by ips and dates' >>
        beam.Map(lambda row: (make_date_ip_key(row), row)).with_output_types(
            Tuple[DateIpKey, Row]))

    # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
    grouped_metadata_and_rows = (
        ({
            IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
            ROWS_PCOLLECION_NAME: rows_keyed_by_ip_and_date
        }) | 'add vp tags: group by keys' >> beam.CoGroupByKey())

    # PCollection[Row]
    rows_with_metadata = (
        grouped_metadata_and_rows | 'add vp tags: merge metadata with rows' >>
        beam.FlatMapTuple(merge_metadata_with_rows).with_output_types(Row))

    return rows_with_metadata
Example #3
    def test_flat_map_tuple_wrapper(self):
        def tuple_map_fn(a: str, b: str, c: str) -> typehints.Iterable[str]:
            return [a, b, c]

        th = beam.FlatMapTuple(tuple_map_fn).get_type_hints()
        self.assertEqual(th.input_types, ((str, str, str), {}))
        self.assertEqual(th.output_types, ((str, ), {}))
Example #4
def unflatten_rows(
        rows: beam.pvalue.PCollection[Row]) -> beam.pvalue.PCollection[Row]:
    """Unflatten so that each row contains a array of answer IPs

  Args:
    rows: measurement rows with a single recieved ip

  Returns:
    measurement rows aggregated so they have an array of recieved responses
  """
    # PCollection[Tuple[str,Row]]
    keyed_by_measurement_id = (
        rows | 'key by measurement id' >>
        beam.Map(lambda row: (row['measurement_id'], row)).with_output_types(
            Tuple[str, Row]))

    # PCollection[Tuple[str,Iterable[Row]]]
    grouped_by_measurement_id = (
        keyed_by_measurement_id
        | 'group by measurement id' >> beam.GroupByKey())

    # PCollection[Row]
    unflattened_rows = (
        grouped_by_measurement_id | 'unflatten rows' >> beam.FlatMapTuple(
            lambda k, v: _unflatten_satellite(v)).with_output_types(Row))

    return unflattened_rows
Example #5
    def test_flat_map_tuple_wrapper(self):
        # TODO(BEAM-8662): Also test with a fn that accepts default arguments.
        def tuple_map_fn(a: str, b: str, c: str) -> typehints.Iterable[str]:
            return [a, b, c]

        th = beam.FlatMapTuple(tuple_map_fn).get_type_hints()
        self.assertEqual(th.input_types, ((str, str, str), {}))
        self.assertEqual(th.output_types, ((str, ), {}))
Example #6
    def test_flat_map_tuple_wrapper(self):
        # TODO(https://github.com/apache/beam/issues/19961): Also test with a fn
        # that accepts default arguments.
        def tuple_map_fn(a: str, b: str, c: str) -> typehints.Iterable[str]:
            return [a, b, c]

        th = beam.FlatMapTuple(tuple_map_fn).get_type_hints()
        self.assertEqual(th.input_types, ((str, str, str), {}))
        self.assertEqual(th.output_types, ((str, ), {}))
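For reference, the type-hint behaviour these wrapper tests exercise corresponds to a pipeline like the following minimal sketch (illustrative only; tuple_map_fn and the sample element are made up):

import apache_beam as beam


def tuple_map_fn(a: str, b: str, c: str):
    # FlatMapTuple unpacks each input tuple into a, b, c and flattens the
    # returned iterable into individual output elements.
    return [a, b, c]


with beam.Pipeline() as pipeline:
    _ = (pipeline
         | beam.Create([('x', 'y', 'z')])
         | beam.FlatMapTuple(tuple_map_fn)
         | beam.Map(print))  # prints x, y and z on separate lines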
Example #7
    def expand(self, pcoll):
        return (
            pcoll
            | "Start" >> beam.FlatMap(_start_stage, self.specs_by_target)
            | "CreateTasks" >> beam.FlatMapTuple(_copy_tasks)
            # prevent undesirable fusion
            # https://stackoverflow.com/a/54131856/809705
            | "Reshuffle" >> beam.Reshuffle()
            | "CopyChunks" >> beam.MapTuple(_copy_chunk)
            # prepare inputs for the next stage (if any)
            | "Finish" >> beam.Distinct())
Example #8
def execute_pipeline(pipeline, options):
    messages = (
        pipeline
        | "read messages" >> beam.io.ReadFromPubSub(topic=options.input_topic)
        | "parse to messages" >> beam.ParDo(ParseMessage()))
    sessions = (messages
                | "window" >> beam.WindowInto(beam.window.Sessions(15))
                | "add key" >> beam.Map(lambda element:
                                        (element.user, element.products))
                | "group by user" >> beam.GroupByKey()
                | "first flatten" >> beam.FlatMapTuple(flat_function)
                | "second flatten" >> beam.FlatMapTuple(flat_function)
                ) | "log 1" >> beam.ParDo(Log())

    products = (
        sessions
        | "add new key" >> beam.Map(lambda session: (session[1].id,
                                                     (session[1], session[0])))
        |
        "group by product" >> beam.GroupByKey()) | "log 2" >> beam.ParDo(Log())
Example #9
    def test_flat_map_tuple(self):
        def f(a, b, y=None):
            return a, b, y

        expected = [(1, 2), (3, 4)] | beam.FlatMapTuple(f, y=5)
        actual = [(1, 2), (3, 4)] | threadmap.FlatThreadMapTuple(f, y=5)
        self.assertEqual(expected, actual)

        actual = [(1, 2), (3, 4)] | threadmap.FlatThreadMapTuple(
            f,
            y=5,
            num_threads=None,
        )
        self.assertEqual(expected, actual)
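The comparison above relies on the fact that piping a plain Python list into a transform runs it eagerly and returns a list, which is convenient for quick local checks. A small sketch of that behaviour on its own (independent of the threadmap module under test):

import apache_beam as beam


def f(a, b, y=None):
    return a, b, y


# Applying a transform directly to an in-memory list executes it immediately.
result = [(1, 2), (3, 4)] | beam.FlatMapTuple(f, y=5)
print(result)  # expected: [1, 2, 5, 3, 4, 5]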
Example #10
def add_received_ip_tags(
    rows: beam.pvalue.PCollection[Row],
    ips_with_metadata: beam.pvalue.PCollection[Tuple[DateIpKey, Row]]
) -> beam.pvalue.PCollection[Row]:
    """Add tags for answer ips (field received.ip) - asnum, asname, http, cert

  Args:
      rows: PCollection of measurement rows
      ips_with_metadata: PCollection of dated ips with geo metadata

    Returns:
      PCollection of measurement rows with tag information added to the recieved.ip row
  """
    # PCollection[Tuple[DateIpKey,Row]]
    received_keyed_by_ip_and_date = (
        rows | 'key by received ips and dates' >>
        beam.Map(lambda row:
                 (_make_date_received_ip_key(row), row)).with_output_types(
                     Tuple[DateIpKey, Row]))

    # Iterable[PCollection[Tuple[DateIpKey,Row]]]
    partition_by_domain = (received_keyed_by_ip_and_date
                           | 'partition by domain' >> beam.Partition(
                               _get_domain_partition, NUM_DOMAIN_PARTITIONS))

    collections = []
    for i in range(0, NUM_DOMAIN_PARTITIONS):
        elements = partition_by_domain[i]
        # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
        grouped_received_metadata_and_rows = (({
            IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
            ROWS_PCOLLECION_NAME: elements
        }) | f'group by received ip keys {i}' >> beam.CoGroupByKey())

        # PCollection[Row]
        domain_rows_with_tags = (
            grouped_received_metadata_and_rows | f'tag received ips {i}' >>
            beam.FlatMapTuple(lambda k, v: merge_metadata_with_rows(
                k, v, field='received')).with_output_types(Row))

        collections.append(domain_rows_with_tags)

    # PCollection[Row]
    rows_with_tags = (
        collections
        | 'merge domain collections' >> beam.Flatten().with_output_types(Row))

    return rows_with_tags
Example #11
    def run_beam_pipeline(self, scan_type: str, incremental_load: bool,
                          job_name: str, table_name: str,
                          start_date: Optional[datetime.date],
                          end_date: Optional[datetime.date]) -> None:
        """Run a single apache beam pipeline to load json data into bigquery.

    Args:
      scan_type: one of 'echo', 'discard', 'http', 'https'
      incremental_load: boolean. If true, only load the latest new data, if
        false reload all data.
      job_name: string name for this pipeline job.
      table_name: dataset.table name like 'base.scan_echo'
      start_date: date object, only files after or at this date will be read.
        Mostly only used during development.
      end_date: date object, only files at or before this date will be read.
        Mostly only used during development.

    Raises:
      Exception: if any arguments are invalid or the pipeline fails.
    """
        logging.getLogger().setLevel(logging.INFO)
        pipeline_options = self._get_pipeline_options(scan_type, job_name)
        gcs = GCSFileSystem(pipeline_options)

        new_filenames = self._data_to_load(gcs, scan_type, incremental_load,
                                           table_name, start_date, end_date)
        if not new_filenames:
            logging.info('No new files to load incrementally')
            return

        with beam.Pipeline(options=pipeline_options) as p:
            # PCollection[Tuple[filename,line]]
            lines = _read_scan_text(p, new_filenames)

            # PCollection[Row]
            rows = (
                lines | 'flatten json' >>
                beam.FlatMapTuple(_flatten_measurement).with_output_types(Row))

            # PCollection[Row]
            rows_with_metadata = self._add_metadata(rows)

            self._write_to_bigquery(rows_with_metadata, table_name,
                                    incremental_load)
Example #12
def flatmap_tuple(test=None):
    # [START flatmap_tuple]
    import apache_beam as beam

    def format_plant(icon, plant):
        if icon:
            yield '{}{}'.format(icon, plant)

    with beam.Pipeline() as pipeline:
        plants = (pipeline
                  | 'Gardening plants' >> beam.Create([
                      ('🍓', 'Strawberry'),
                      ('🥕', 'Carrot'),
                      ('🍆', 'Eggplant'),
                      ('🍅', 'Tomato'),
                      ('🥔', 'Potato'),
                      (None, 'Invalid'),
                  ])
                  | 'Format' >> beam.FlatMapTuple(format_plant)
                  | beam.Map(print))
        # [END flatmap_tuple]
        if test:
            test(plants)
Example #13
        def CheckAggregation(inputs_and_expected, aggregation):
            # Split the test stream into a branch of to-be-processed elements, and
            # a branch of expected results.
            inputs, expected = (
                inputs_and_expected
                | beam.FlatMapTuple(lambda tag, value: [
                    beam.pvalue.TaggedOutput(tag, ('key1', value)),
                    beam.pvalue.TaggedOutput(tag, ('key2', value)),
                ]).with_outputs('input', 'expect'))

            # Process the inputs with the given windowing to produce actual outputs.
            outputs = (
                inputs
                | beam.MapTuple(lambda key, value: TimestampedValue(
                    (key, value), value))
                | beam.WindowInto(window_fn,
                                  trigger=trigger_fn,
                                  accumulation_mode=accumulation_mode,
                                  timestamp_combiner=timestamp_combiner)
                | aggregation
                | beam.MapTuple(_windowed_value_info_map_fn)
                # Place outputs back into the global window to allow flattening
                # and share a single state in Check.
                | 'Global' >> beam.WindowInto(
                    beam.transforms.window.GlobalWindows()))
            # Feed both the expected and actual outputs to Check() for comparison.
            tagged_expected = (
                expected
                | beam.MapTuple(lambda key, value: (key, ('expect', value))))
            tagged_outputs = (
                outputs
                | beam.MapTuple(lambda key, value: (key, ('actual', value))))
            # pylint: disable=expression-not-assigned
            ([tagged_expected, tagged_outputs]
             | beam.Flatten()
             | beam.ParDo(Check(self.allow_out_of_order)))
Example #14
def process_satellite_with_tags(
    row_lines: beam.pvalue.PCollection[Tuple[str, str]],
    tag_lines: beam.pvalue.PCollection[Tuple[str, str]]
) -> beam.pvalue.PCollection[Row]:
    """Process Satellite measurements and tags.

  Args:
    row_lines: Tuples of (filename, measurement line)
    tag_lines: Tuples of (filename, tag line)

  Returns:
    PCollection[Row] of rows with tag metadata added
  """
    # PCollection[Row]
    rows = (row_lines | 'flatten json' >> beam.ParDo(
        flatten.FlattenMeasurement()).with_output_types(Row))
    # PCollection[Row]
    tag_rows = (tag_lines | 'tag rows' >>
                beam.FlatMapTuple(_read_satellite_tags).with_output_types(Row))

    # PCollection[Row]
    rows_with_metadata = _add_satellite_tags(rows, tag_rows)

    return rows_with_metadata
Example #15
def transform_data(train_data_file, test_data_file, working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical data
  from strings to int64 indices by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        # Since we are modifying some features and leaving others unchanged, we
        # start by setting `outputs` to a copy of `inputs`.
        outputs = inputs.copy()

        # Scale numeric columns to have range [0, 1].
        for key in NUMERIC_FEATURE_KEYS:
            outputs[key] = tft.scale_to_0_1(inputs[key])

        for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
            # This is a SparseTensor because it is optional. Here we fill in a default
            # value when it is missing.
            sparse = tf.sparse.SparseTensor(inputs[key].indices,
                                            inputs[key].values,
                                            [inputs[key].dense_shape[0], 1])
            dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
            # Reshaping from a batch of vectors of size 1 to a batch of scalars.
            dense = tf.squeeze(dense, axis=1)
            outputs[key] = tft.scale_to_0_1(dense)

        # For all categorical columns except the label column, we generate a
        # vocabulary, and convert the string feature to a one-hot encoding.
        for key in CATEGORICAL_FEATURE_KEYS:
            integerized = tft.compute_and_apply_vocabulary(
                tf.strings.strip(inputs[key]),
                num_oov_buckets=NUM_OOV_BUCKETS,
                vocab_filename=key)
            depth = (tft.experimental.get_vocabulary_size_by_name(key) +
                     NUM_OOV_BUCKETS)
            one_hot_encoded = tf.one_hot(integerized,
                                         depth=tf.cast(depth, tf.int32),
                                         on_value=1.0,
                                         off_value=0.0)
            # This output is now one-hot encoded. If saving transformed data to disk,
            # this can incur significant memory cost.
            outputs[key] = tf.reshape(one_hot_encoded, [-1, depth])

        # For the label column we provide the mapping from string to index.
        table_keys = ['>50K', '<=50K']
        with tf.init_scope():
            initializer = tf.lookup.KeyValueTensorInitializer(
                keys=table_keys,
                values=tf.cast(tf.range(len(table_keys)), tf.int64),
                key_dtype=tf.string,
                value_dtype=tf.int64)
            table = tf.lookup.StaticHashTable(initializer, default_value=-1)
        # Remove trailing periods for test data when the data is read with tf.data.
        label_str = tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', '')
        label_str = tf.strings.strip(label_str)
        data_labels = table.lookup(label_str)
        transformed_label = tf.one_hot(indices=data_labels,
                                       depth=len(table_keys),
                                       on_value=1.0,
                                       off_value=0.0)
        outputs[LABEL_KEY] = tf.reshape(transformed_label,
                                        [-1, len(table_keys)])

        return outputs

    # The "with" block will create a pipeline, and run that pipeline at the exit
    # of the block.
    with beam.Pipeline() as pipeline:
        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Create a TFXIO to read the census data with the schema. To do this we
            # need to list all columns in order since the schema doesn't specify the
            # order of columns in the csv.
            # We first read CSV files and use BeamRecordCsvTFXIO whose .BeamSource()
            # accepts a PCollection[bytes] because we need to patch the records first
            # (see "FixCommasTrainData" below). Otherwise, tfxio.CsvTFXIO can be used
            # to both read the CSV files and parse them to TFT inputs:
            # csv_tfxio = tfxio.CsvTFXIO(...)
            # raw_data = (pipeline | 'ToRecordBatches' >> csv_tfxio.BeamSource())
            csv_tfxio = tfxio.BeamRecordCsvTFXIO(
                physical_format='text',
                column_names=ORDERED_CSV_COLUMNS,
                schema=_SCHEMA)

            # Read in raw data and convert using CSV TFXIO.  Note that we apply
            # some Beam transformations here, which will not be encoded in the TF
            # graph since we don't do them from within tf.Transform's methods
            # (AnalyzeDataset, TransformDataset etc.).  These transformations are just
            # to get data into a format that the CSV TFXIO can read, in particular
            # removing spaces after commas.
            raw_data = (pipeline
                        | 'ReadTrainData' >> beam.io.ReadFromText(
                            train_data_file, coder=beam.coders.BytesCoder())
                        | 'FixCommasTrainData' >>
                        beam.Map(lambda line: line.replace(b', ', b','))
                        | 'DecodeTrainData' >> csv_tfxio.BeamSource())

            # Combine data and schema into a dataset tuple.  Note that we already used
            # the schema to read the CSV data, but we also need it to interpret
            # raw_data.
            raw_dataset = (raw_data, csv_tfxio.TensorAdapterConfig())

            # The TFXIO output format is chosen for improved performance.
            transformed_dataset, transform_fn = (
                raw_dataset | tft_beam.AnalyzeAndTransformDataset(
                    preprocessing_fn, output_record_batches=True))

            # Transformed metadata is not necessary for encoding.
            transformed_data, _ = transformed_dataset

            # Extract transformed RecordBatches, encode and write them to the given
            # directory.
            coder = RecordBatchToExamplesEncoder()
            _ = (transformed_data
                 | 'EncodeTrainData' >>
                 beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            # Now apply transform function to test data.  In this case we remove the
            # trailing period at the end of each line, and also ignore the header line
            # that is present in the test data file.
            raw_test_data = (pipeline
                             | 'ReadTestData' >> beam.io.ReadFromText(
                                 test_data_file,
                                 skip_header_lines=1,
                                 coder=beam.coders.BytesCoder())
                             | 'FixCommasTestData' >>
                             beam.Map(lambda line: line.replace(b', ', b','))
                             | 'RemoveTrailingPeriodsTestData' >>
                             beam.Map(lambda line: line[:-1])
                             | 'DecodeTestData' >> csv_tfxio.BeamSource())

            raw_test_dataset = (raw_test_data, csv_tfxio.TensorAdapterConfig())

            # The TFXIO output format is chosen for improved performance.
            transformed_test_dataset = (
                (raw_test_dataset, transform_fn)
                | tft_beam.TransformDataset(output_record_batches=True))

            # Transformed metadata is not necessary for encoding.
            transformed_test_data, _ = transformed_test_dataset

            # Extract transformed RecordBatches, encode and write them to the given
            # directory.
            _ = (
                transformed_test_data
                | 'EncodeTestData' >>
                beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to working_dir, which can then
            # be read by the tft.TFTransformOutput class.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Example #16
def run(argv=None, save_main_session=True):
    """
    Main entry point; defines and runs the kuill pipeline.
    """
    parser = argparse.ArgumentParser()

    sp = parser.add_subparsers()
    #
    get_characters_parser = sp.add_parser("get_characters")
    get_characters_parser.add_argument("--output", default="characters.csv")
    get_characters_parser.set_defaults(command=get_characters_command)

    get_species_parser = sp.add_parser("get_species")
    get_species_parser.add_argument("--output", default="species.csv")
    get_species_parser.set_defaults(command=get_species_command)

    pipeline_parser = sp.add_parser("pipeline")

    pipeline_parser.add_argument(
        '--characters', type=str,
        required=True,
        help='Path to an input file.')

    pipeline_parser.add_argument(
        '--species', type=str,
        required=True,
        help='Path to an input file.')

    pipeline_parser.add_argument(
        '--output', type=str,
        required=True,
        help='Path to the output file(s).')

    pipeline_parser.set_defaults(command=None)

    args, pipeline_args = parser.parse_known_args(argv)

    if args.command:
        # handle get_data_command
        args.command(args)
        return

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = save_main_session

    def join_data(k, v):
        logging.debug("join_data: %s", (k, v))
        import itertools
        return itertools.product(v['characters'], v['species'])

    def merge_data(row):
        logging.debug("merge_data: %s", row)
        character_data, species_data = row
        character_data.update(species_data)
        del character_data['species_id']
        return character_data

    def by_all_appearances(row):
        logging.debug("by_appearances: %s", row)
        return (row["appearances"], row)

    def appearance_key_for_element(element):
        """
        element is a dictionary of character data
        return the value of the "appearances" key
        """
        return element["appearances"]

    def height_key_for_element(element):
        """
        element is a dictionary of character data
        return the value of the "height" key
        """
        return element["height"]

    def format_csv(data):
        lines = ["{height},{appearances},{name},{species}".format(
            **row) for row in data]
        return "\n".join(lines)

    def resplit_data(data):
        """
        data is a list with one element, a list of rows?
        """
        logging.debug("resplit_data: %s", data)
        for row in data:
            yield row

    with beam.Pipeline(options=options) as p:
        char_inputs = (
            p
            | 'ReadCharInputText' >> beam.io.ReadFromText(
                args.characters, skip_header_lines=1)
        )

        spec_inputs = (
            p
            | 'ReadSpecInputText' >> beam.io.ReadFromText(
                args.species, skip_header_lines=1)
        )

        characters = (
            char_inputs
            | "parse chars" >> beam.ParDo(ParseCharacterFn())
            | "key_char" >> beam.Map(
                lambda c: (c["species_id"], c))
        )

        species = (
            spec_inputs
            | beam.ParDo(ParseSpeciesFn())
            | "key_spec" >> beam.Map(
                lambda s: (s["species_id"], s))
        )

        joined = (
            {"characters": characters,
                "species": species}
            | beam.CoGroupByKey()
            | beam.FlatMapTuple(join_data)
        )

        merged = (
            joined
            | beam.Map(merge_data)
        )

        top = (
            merged
            | "top by appearances" >> beam.combiners.Top.Of(
                10, key=appearance_key_for_element)
            | "re-split for height" >> beam.FlatMap(resplit_data)
            | "top by height" >> beam.combiners.Top.Of(
                10, key=height_key_for_element)
        )

        output = (  # noqa
            top
            | "format_csv" >> beam.Map(format_csv)
            | 'WriteCharacterData' >> beam.io.WriteToText(args.output)
        )
Example #17
    def expand(self, pcoll):
        return pcoll | beam.FlatMapTuple(self._split_chunks)
Example #18
    def expand(self, pcoll):
        return (
            pcoll
            | beam.Create(list(self.pattern.items()))
            | beam.FlatMapTuple(self._open_chunks)
        )
Example #19
    def expand(self,
               lifts: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
        """Takes top k and bottom k x values (sorted by lift) per slice and y value.

    Args:
      lifts: A PCollection of tuples of the form: (
        _SlicedFeatureKey(slice_key, x_path),
        _LiftInfo(x, y, lift, xy_count, x_count, y_count)).

    Returns:
      A PCollection resulting from a group by with the keys of the form
      (slice_key, x_path) and a stream of values of the form
      (y, y_count, [(x, lift, xy_count, x_count)]), in which the stream of values
      has been limited to the top k and bottom k elements per key.
    """
        def move_y_info_to_key(key, value):
            slice_key, x_path = key
            return (_LiftSeriesKey(slice_key=slice_key,
                                   x_path=x_path,
                                   y=value.y,
                                   y_count=value.y_count),
                    _LiftValue(x=value.x,
                               lift=value.lift,
                               xy_count=value.xy_count,
                               x_count=value.x_count))

        # Push y_* into key so that we get per-slice, per-x-path, per-y top and
        # bottom k when calling {Largest,Smallest}PerKey.
        # (_LiftSequenceKey(slice, x_path, y, y_count),
        #      _LiftValue(x, lift, xy_count, x_count))
        lifts = lifts | 'MoveYToKey' >> beam.MapTuple(move_y_info_to_key)

        top_key = operator.attrgetter('lift', 'x')
        if self._top_k_per_y:
            # (_LiftSequenceKey(slice, x_path, y, y_count),
            #      [_LiftValue(x, lift, xy_count, x_count)])
            top_k = (lifts
                     | 'TopK' >> beam.transforms.combiners.Top.PerKey(
                         n=self._top_k_per_y, key=top_key))
        if self._bottom_k_per_y:
            # (_LiftSequenceKey(slice, x_path, y, y_count),
            #      [_LiftValue(x, lift, xy_count, x_count)])
            bottom_k = (lifts
                        | 'BottomK' >> beam.transforms.combiners.Top.PerKey(
                            n=self._bottom_k_per_y, reverse=True, key=top_key))

        if self._top_k_per_y and self._bottom_k_per_y:
            # (_LiftSeriesKey(slice, x_path, y, y_count),
            #      [_LiftValue(x, lift, xy_count, x_count)])
            grouped_lifts = (
                (top_k, bottom_k)
                | 'MergeTopAndBottom' >> beam.Flatten()
                | 'FlattenTopAndBottomLifts' >>
                beam.FlatMapTuple(lambda k, vs: ((k, v) for v in vs))
                | 'ReGroupTopAndBottom' >> beam.GroupByKey())
        elif self._top_k_per_y:
            grouped_lifts = top_k
        elif self._bottom_k_per_y:
            grouped_lifts = bottom_k
        else:
            grouped_lifts = lifts | 'GroupByYs' >> beam.GroupByKey()

        def move_y_info_to_value(key, lift_values):
            return (_SlicedFeatureKey(key.slice_key, key.x_path),
                    _LiftSeries(y=key.y,
                                y_count=key.y_count,
                                lift_values=lift_values))

        # (_SlicedFeatureKey(slice, x_path),
        #      _LiftSeries(y, y_count, [_LiftValue(x, lift, xy_count, x_count)]))
        return (grouped_lifts
                | 'MoveYInfoToValue' >> beam.MapTuple(move_y_info_to_value))
Example #20
#pipeline2.py: Separate subject with grade from a PCollection
import apache_beam as beam


def my_format(sub, marks):
    yield '{}\t{}'.format(sub, marks)


with beam.Pipeline() as pipeline:
    plants = (pipeline
              | 'Subjects' >> beam.Create([
                  ('English', 'A'),
                  ('Maths', 'B+'),
                  ('Science', 'A-'),
                  ('French', 'A'),
                  ('Arts', 'A+'),
              ])
              | 'Format subjects with marks' >> beam.FlatMapTuple(my_format)
              | beam.Map(print))
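Since my_format yields exactly one output per input tuple, the same result could also be produced with beam.MapTuple and a plain return; FlatMapTuple is the better fit when a function may emit zero or many elements per input. A sketch of the MapTuple variant for comparison:

import apache_beam as beam


def my_format(sub, marks):
    return '{}\t{}'.format(sub, marks)


with beam.Pipeline() as pipeline:
    _ = (pipeline
         | 'Subjects' >> beam.Create([
             ('English', 'A'),
             ('Maths', 'B+'),
         ])
         | 'Format subjects with marks' >> beam.MapTuple(my_format)
         | beam.Map(print))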
Example #21
def post_processing_satellite(
        rows: beam.pvalue.PCollection[Row]) -> beam.pvalue.PCollection[Row]:
    """Run post processing on Satellite v1 data (calculate confidence, verify interference).

    Args:
      rows: PCollection of measurement rows

    Returns:
      PCollection of measurement rows with confidence and verify fields
  """
    def _total_tags(key: Tuple[str, str],
                    row: Row) -> Tuple[Tuple[str, str], int]:
        total_tags = 0
        for tag_type in flatten_satellite.SATELLITE_TAGS:
            if tag_type != 'ip':
                type_tags = {
                    ans[tag_type]
                    for ans in row['received'] if ans.get(tag_type)
                }
                total_tags += len(type_tags)
        return (key, total_tags)

    def _flat_rows_controls(key: Any, value: Row) -> Iterator[Tuple[Row, int]]:  # pylint: disable=unused-argument
        num_control_tags = 0
        if len(value['control']) > 0:
            num_control_tags = value['control'][0]
        for row in value['test']:
            yield (row, num_control_tags)

    # Partition rows into test measurements and control measurements
    # 'anomaly' is None for control measurements

    # PCollection[Tuple[Tuple[str, str], Row]], PCollection[Tuple[Tuple[str, str], Row]]
    rows, controls = (rows | 'key by dates and domains' >>
                      beam.Map(lambda row: ((row['date'], row['domain']), row))
                      | 'partition test and control' >> beam.Partition(
                          lambda row, p: int(row[1]['anomaly'] is None), 2))

    # PCollection[Tuple[Tuple[str, str], int]]
    num_ctags = controls | 'calculate # control tags' >> beam.MapTuple(
        _total_tags)

    # PCollection[Row]
    post = ({
        'test': rows,
        'control': num_ctags
    } | 'group rows and # control tags by keys' >> beam.CoGroupByKey()
            | 'flatmap to (row, # control tags)' >>
            beam.FlatMapTuple(_flat_rows_controls)
            | 'calculate confidence' >> beam.MapTuple(_calculate_confidence) |
            'verify interference' >> beam.Map(_verify).with_output_types(Row))

    # PCollection[Row]
    # pylint: disable=no-value-for-parameter
    controls = (controls
                | 'unkey control' >> beam.Values().with_output_types(Row))

    # PCollection[Row]
    post = ((post, controls) | 'flatten test and control' >> beam.Flatten())

    return post
Example #22
    def run(self):
        """Returns a PCollection of audit errors aggregated from all models.

        Returns:
            PCollection. A PCollection of audit errors discovered during the
            audit.

        Raises:
            ValueError. When the `datastoreio` option, which provides the
                PTransforms for performing datastore IO operations, is None.
        """
        existing_models, deleted_models = (
            self.pipeline
            | 'Get all models' >> ndb_io.GetModels(
                datastore_services.query_everything(), self.datastoreio_stub)
            | 'Partition by model.deleted' >> (
                beam.Partition(lambda model, _: int(model.deleted), 2))
        )

        models_of_kind_by_index = (
            existing_models
            # NOTE: Partition returns a statically-sized list of PCollections.
            # Creating partitions is wasteful when there are fewer items than
            # there are partitions, like in our unit tests. In exchange, in
            # production the job will be able to take advantage of the high
            # parallelizability of PCollections, which are designed for enormous
            # datasets and parallel processing.
            #
            # Alternatively, we could have used GroupBy. However, that returns
            # an _iterable_ of items rather than a PCollection, and so it is
            # vulnerable to out-of-memory errors.
            #
            # Since this job is concerned with running audits on EVERY MODEL IN
            # STORAGE, Partition is the clear winner regardless of the overhead
            # we'll see in unit tests.
            | 'Split models into parallelizable PCollections' >> beam.Partition(
                lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
                # NOTE: Partition requires a hard-coded number of slices; it
                # cannot be used with dynamic numbers generated in a pipeline.
                # KIND_BY_INDEX is a constant tuple so that requirement is
                # satisfied in this case.
                len(KIND_BY_INDEX), KIND_BY_INDEX)
        )

        existing_key_count_pcolls = []
        missing_key_error_pcolls = []
        audit_error_pcolls = [
            deleted_models
            | 'Apply ValidateDeletedModel on deleted models' >> (
                beam.ParDo(base_validation.ValidateDeletedModel()))
        ]

        model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
        for kind, models_of_kind in model_groups:
            audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

            if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES:
                existing_key_count_pcolls.append(
                    models_of_kind | GetExistingModelKeyCounts(kind))

            if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR:
                missing_key_error_pcolls.extend(
                    models_of_kind | GetMissingModelKeyErrors(kind))

        existing_key_counts = (
            existing_key_count_pcolls
            | 'Flatten PCollections of existing key counts' >> beam.Flatten()
        )
        missing_key_errors = (
            missing_key_error_pcolls
            | 'Flatten PCollections of missing key errors' >> beam.Flatten()
        )
        audit_error_pcolls.append(
            (existing_key_counts, missing_key_errors)
            | 'Group counts and errors by key' >> beam.CoGroupByKey()
            | 'Filter keys without any errors' >> (
                beam.FlatMapTuple(self._get_model_relationship_errors))
        )

        return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
Example #23
def transform_data(working_dir):
    """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed data
        and metadata to.
  """

    with beam.Pipeline() as pipeline:
        with tft_beam.Context(
                temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
            tfxio_train_data = tfxio.TFExampleRecord(file_pattern=os.path.join(
                working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'),
                                                     schema=SCHEMA)
            train_data = (pipeline |
                          'TFXIORead[Train]' >> tfxio_train_data.BeamSource())

            tfxio_test_data = tfxio.TFExampleRecord(file_pattern=os.path.join(
                working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'),
                                                    schema=SCHEMA)
            test_data = (pipeline
                         | 'TFXIORead[Test]' >> tfxio_test_data.BeamSource())

            def preprocessing_fn(inputs):
                """Preprocess input columns into transformed columns."""
                review = inputs[REVIEW_KEY]

                # Here tf.compat.v1.string_split behaves differently from
                # tf.strings.split.
                review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
                review_indices = tft.compute_and_apply_vocabulary(
                    review_tokens, top_k=VOCAB_SIZE)
                # Add one for the oov bucket created by compute_and_apply_vocabulary.
                review_bow_indices, review_weight = tft.tfidf(
                    review_indices, VOCAB_SIZE + 1)
                return {
                    REVIEW_KEY: review_bow_indices,
                    REVIEW_WEIGHT_KEY: review_weight,
                    LABEL_KEY: inputs[LABEL_KEY]
                }

            # Transformed metadata is not necessary for encoding.
            # The TFXIO output format is chosen for improved performance.
            (transformed_train_data, _), transform_fn = (
                (train_data, tfxio_train_data.TensorAdapterConfig())
                | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                    preprocessing_fn, output_record_batches=True))

            transformed_test_data, _ = (
                ((test_data, tfxio_test_data.TensorAdapterConfig()),
                 transform_fn)
                | 'Transform' >>
                tft_beam.TransformDataset(output_record_batches=True))

            # Extract transformed RecordBatches, encode and write them to the given
            # directory.
            coder = tfxio.RecordBatchToExamplesEncoder()
            _ = (transformed_train_data
                 | 'EncodeTrainData' >>
                 beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                 | 'WriteTrainData' >> beam.io.WriteToTFRecord(
                     os.path.join(working_dir,
                                  TRANSFORMED_TRAIN_DATA_FILEBASE)))

            _ = (
                transformed_test_data
                | 'EncodeTestData' >>
                beam.FlatMapTuple(lambda batch, _: coder.encode(batch))
                | 'WriteTestData' >> beam.io.WriteToTFRecord(
                    os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

            # Will write a SavedModel and metadata to two subdirectories of
            # working_dir, given by tft.TRANSFORM_FN_DIR and
            # tft.TRANSFORMED_METADATA_DIR respectively.
            _ = (transform_fn
                 |
                 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
Example #24
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of results from the skill migration.

        Returns:
            PCollection. A PCollection of results from the skill migration.
        """
        unmigrated_skill_models = (
            self.pipeline
            | 'Get all non-deleted skill models' >>
            (ndb_io.GetModels(skill_models.SkillModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add skill model ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda skill_model: skill_model.id))
        skill_summary_models = (
            self.pipeline
            | 'Get all non-deleted skill summary models' >>
            (ndb_io.GetModels(skill_models.SkillSummaryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add skill summary ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda skill_summary_model: skill_summary_model.id))

        migrated_skill_results = (unmigrated_skill_models
                                  | 'Transform and migrate model' >>
                                  beam.MapTuple(self._migrate_skill))
        migrated_skills = (
            migrated_skill_results
            | 'Filter oks' >>
            beam.Filter(lambda result_item: result_item.is_ok())
            |
            'Unwrap ok' >> beam.Map(lambda result_item: result_item.unwrap()))
        migrated_skill_job_run_results = (
            migrated_skill_results
            | 'Generate results for migration' >>
            (job_result_transforms.ResultsToJobRunResults('SKILL PROCESSED')))

        skill_changes = (unmigrated_skill_models
                         | 'Generate skill changes' >> beam.FlatMapTuple(
                             self._generate_skill_changes))

        skill_objects_list = (
            {
                'skill_model': unmigrated_skill_models,
                'skill_summary_model': skill_summary_models,
                'skill': migrated_skills,
                'skill_changes': skill_changes
            }
            | 'Merge objects' >> beam.CoGroupByKey()
            | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Remove unmigrated skills' >> beam.Filter(
                lambda x: len(x['skill_changes']) > 0 and len(x['skill']) > 0)
            | 'Reorganize the skill objects' >> beam.Map(
                lambda objects: {
                    'skill_model': objects['skill_model'][0],
                    'skill_summary_model': objects['skill_summary_model'][0],
                    'skill': objects['skill'][0],
                    'skill_changes': objects['skill_changes']
                }))

        skill_objects_list_job_run_results = (
            skill_objects_list
            | 'Transform skill objects into job run results' >>
            (job_result_transforms.CountObjectsToJobRunResult('SKILL MIGRATED')
             ))

        cache_deletion_job_run_results = (
            skill_objects_list
            | 'Delete skill from cache' >>
            beam.Map(lambda skill_object: self._delete_skill_from_cache(
                skill_object['skill']))
            | 'Generate results for cache deletion' >>
            (job_result_transforms.ResultsToJobRunResults('CACHE DELETION')))

        skill_models_to_put = (
            skill_objects_list
            | 'Generate skill models to put' >>
            beam.FlatMap(lambda skill_objects: self._update_skill(
                skill_objects['skill_model'],
                skill_objects['skill'],
                skill_objects['skill_changes'],
            )))

        skill_summary_models_to_put = (
            skill_objects_list
            | 'Generate skill summary models to put' >>
            beam.Map(lambda skill_objects: self._update_skill_summary(
                skill_objects['skill'], skill_objects['skill_summary_model'])))

        unused_put_results = (
            (skill_models_to_put, skill_summary_models_to_put)
            | 'Merge models' >> beam.Flatten()
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (
            (cache_deletion_job_run_results, migrated_skill_job_run_results,
             skill_objects_list_job_run_results)
            | beam.Flatten())
Example #25
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of results from the story migration.

        Returns:
            PCollection. A PCollection of results from the story migration.
        """

        unmigrated_story_models = (
            self.pipeline
            | 'Get all non-deleted story models' >> (
                ndb_io.GetModels(story_models.StoryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda story_model: story_model.id)
        )
        story_summary_models = (
            self.pipeline
            | 'Get all non-deleted story summary models' >> (
                ndb_io.GetModels(story_models.StorySummaryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story summary keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda story_summary_model: story_summary_model.id)
        )
        topics = (
            self.pipeline
            | 'Get all non-deleted topic models' >> (
                ndb_io.GetModels(topic_models.TopicModel.get_all()))
            | 'Transform model into domain object' >> beam.Map(
                topic_fetchers.get_topic_from_model)
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add topic keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda topic: topic.id)
        )
        topic_id_to_topic = beam.pvalue.AsDict(topics)

        migrated_story_results = (
            unmigrated_story_models
            | 'Transform and migrate model' >> beam.MapTuple(
                self._migrate_story, topic_id_to_topic=topic_id_to_topic)
        )
        migrated_stories = (
            migrated_story_results
            | 'Filter oks' >> beam.Filter(
                lambda result_item: result_item.is_ok())
            | 'Unwrap ok' >> beam.Map(
                lambda result_item: result_item.unwrap())
        )
        migrated_story_job_run_results = (
            migrated_story_results
            | 'Generate results for migration' >> (
                job_result_transforms.ResultsToJobRunResults('STORY PROCESSED'))
        )

        story_changes = (
            unmigrated_story_models
            | 'Generate story changes' >> beam.FlatMapTuple(
                self._generate_story_changes)
        )

        story_objects_list = (
            {
                'story_model': unmigrated_story_models,
                'story_summary_model': story_summary_models,
                'story': migrated_stories,
                'story_change': story_changes
            }
            | 'Merge objects' >> beam.CoGroupByKey()
            | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Remove unmigrated stories' >> beam.Filter(
                lambda x: len(x['story_change']) > 0 and len(x['story']) > 0)
            | 'Reorganize the story objects' >> beam.Map(lambda objects: {
                    'story_model': objects['story_model'][0],
                    'story_summary_model': objects['story_summary_model'][0],
                    'story': objects['story'][0],
                    'story_change': objects['story_change'][0]
                })
        )

        story_objects_list_job_run_results = (
            story_objects_list
            | 'Transform story objects into job run results' >> (
                job_result_transforms.CountObjectsToJobRunResult(
                    'STORY MIGRATED'))
        )

        cache_deletion_job_run_results = (
            story_objects_list
            | 'Delete story from cache' >> beam.Map(
                lambda story_objects: self._delete_story_from_cache(
                    story_objects['story']))
            | 'Generate results for cache deletion' >> (
                job_result_transforms.ResultsToJobRunResults('CACHE DELETION'))
        )

        story_models_to_put = (
            story_objects_list
            | 'Generate story models to put' >> beam.FlatMap(
                lambda story_objects: self._update_story(
                    story_objects['story_model'],
                    story_objects['story'],
                    story_objects['story_change'],
                ))
        )

        story_summary_models_to_put = (
            story_objects_list
            | 'Generate story summary models to put' >> beam.Map(
                lambda story_objects: self._update_story_summary(
                    story_objects['story'],
                    story_objects['story_summary_model']
                ))
        )

        unused_put_results = (
            (story_models_to_put, story_summary_models_to_put)
            | 'Merge models' >> beam.Flatten()
            | 'Put models into the datastore' >> ndb_io.PutModels()
        )

        return (
            (
                cache_deletion_job_run_results,
                migrated_story_job_run_results,
                story_objects_list_job_run_results
            )
            | beam.Flatten()
        )