def _add_metadata(
    self, rows: beam.pvalue.PCollection[Row]
) -> beam.pvalue.PCollection[Row]:
  """Add IP metadata to a collection of roundtrip rows.

  Args:
    rows: beam.PCollection[Row]

  Returns:
    PCollection[Row]
    The same rows as above with additional metadata columns added.
  """
  # PCollection[Tuple[DateIpKey,Row]]
  rows_keyed_by_ip_and_date = (
      rows
      | 'key by ips and dates' >> beam.Map(
          lambda row: (make_date_ip_key(row), row)).with_output_types(
              Tuple[DateIpKey, Row]))

  # PCollection[DateIpKey]
  # pylint: disable=no-value-for-parameter
  ips_and_dates = (
      rows_keyed_by_ip_and_date
      | 'get ip and date keys per row' >>
      beam.Keys().with_output_types(DateIpKey))

  # PCollection[DateIpKey]
  deduped_ips_and_dates = (
      # pylint: disable=no-value-for-parameter
      ips_and_dates
      | 'dedup' >> beam.Distinct().with_output_types(DateIpKey))

  # PCollection[Tuple[date,List[ip]]]
  grouped_ips_by_dates = (
      deduped_ips_and_dates
      | 'group by date' >> beam.GroupByKey().with_output_types(
          Tuple[str, Iterable[str]]))

  # PCollection[Tuple[DateIpKey,Row]]
  ips_with_metadata = (
      grouped_ips_by_dates
      | 'get ip metadata' >> beam.FlatMapTuple(
          self._add_ip_metadata).with_output_types(Tuple[DateIpKey, Row]))

  # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
  grouped_metadata_and_rows = ({
      IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
      ROWS_PCOLLECION_NAME: rows_keyed_by_ip_and_date
  } | 'group by keys' >> beam.CoGroupByKey())

  # PCollection[Row]
  rows_with_metadata = (
      grouped_metadata_and_rows
      | 'merge metadata with rows' >>
      beam.FlatMapTuple(merge_metadata_with_rows).with_output_types(Row))

  return rows_with_metadata
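# Illustrative only: `merge_metadata_with_rows` is not shown in the snippet
# above. Given the CoGroupByKey output, a minimal sketch of the merge step
# (using the same IP_METADATA_PCOLLECTION_NAME / ROWS_PCOLLECION_NAME keys)
# could look like this; the real helper in the source project may differ.
def merge_metadata_with_rows_sketch(key, value):
  """Yield each grouped row with its matching IP metadata merged in."""
  del key  # unused in this sketch
  metadata_list = value[IP_METADATA_PCOLLECTION_NAME]
  metadata = metadata_list[0] if metadata_list else {}
  for row in value[ROWS_PCOLLECION_NAME]:
    merged = dict(row)
    merged.update(metadata)
    yield merged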
def _add_vantage_point_tags(
    rows: beam.pvalue.PCollection[Row],
    ips_with_metadata: beam.pvalue.PCollection[Tuple[DateIpKey, Row]]
) -> beam.pvalue.PCollection[Row]:
  """Add tags for vantage point IPs - resolver name (hostname/control/special) and country.

  Args:
    rows: PCollection of measurement rows
    ips_with_metadata: PCollection of dated ips with geo metadata

  Returns:
    PCollection of measurement rows with tag information added to the ip row
  """
  # PCollection[Tuple[DateIpKey,Row]]
  rows_keyed_by_ip_and_date = (
      rows
      | 'add vp tags: key by ips and dates' >> beam.Map(
          lambda row: (make_date_ip_key(row), row)).with_output_types(
              Tuple[DateIpKey, Row]))

  # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
  grouped_metadata_and_rows = ({
      IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
      ROWS_PCOLLECION_NAME: rows_keyed_by_ip_and_date
  } | 'add vp tags: group by keys' >> beam.CoGroupByKey())

  # PCollection[Row]
  rows_with_metadata = (
      grouped_metadata_and_rows
      | 'add vp tags: merge metadata with rows' >>
      beam.FlatMapTuple(merge_metadata_with_rows).with_output_types(Row))

  return rows_with_metadata
def unflatten_rows(
    rows: beam.pvalue.PCollection[Row]) -> beam.pvalue.PCollection[Row]:
  """Unflatten so that each row contains an array of answer IPs.

  Args:
    rows: measurement rows with a single received ip

  Returns:
    measurement rows aggregated so they have an array of received responses
  """
  # PCollection[Tuple[str,Row]]
  keyed_by_measurement_id = (
      rows
      | 'key by measurement id' >> beam.Map(
          lambda row: (row['measurement_id'], row)).with_output_types(
              Tuple[str, Row]))

  # PCollection[Tuple[str,Iterable[Row]]]
  grouped_by_measurement_id = (
      keyed_by_measurement_id
      | 'group by measurement id' >> beam.GroupByKey())

  # PCollection[Row]
  unflattened_rows = (
      grouped_by_measurement_id
      | 'unflatten rows' >> beam.FlatMapTuple(
          lambda k, v: _unflatten_satellite(v)).with_output_types(Row))

  return unflattened_rows
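# Illustrative only: `_unflatten_satellite` is not shown above. Assuming each
# flat row carries a single entry under a 'received' key, a sketch of the
# aggregation it performs (collapsing all rows that share a measurement_id
# into one row with a list of received responses) could look like this; the
# real helper in the source project may differ.
def _unflatten_satellite_sketch(flat_rows):
  flat_rows = list(flat_rows)
  if not flat_rows:
    return
  combined = dict(flat_rows[0])
  combined['received'] = [row.get('received') for row in flat_rows]
  yield combined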
def test_flat_map_tuple_wrapper(self):
  # TODO(https://github.com/apache/beam/issues/19961): Also test with a fn
  # that accepts default arguments.
  def tuple_map_fn(a: str, b: str, c: str) -> typehints.Iterable[str]:
    return [a, b, c]

  th = beam.FlatMapTuple(tuple_map_fn).get_type_hints()
  self.assertEqual(th.input_types, ((str, str, str), {}))
  self.assertEqual(th.output_types, ((str, ), {}))
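# The assertions above reflect how Beam represents type hints: the first
# element of each pair holds the positional types (here, the three strings
# produced by unpacking each input tuple) and the second holds the keyword
# types. A quick standalone way to inspect this:
import apache_beam as beam
from apache_beam import typehints


def tuple_map_fn(a: str, b: str, c: str) -> typehints.Iterable[str]:
  return [a, b, c]


print(beam.FlatMapTuple(tuple_map_fn).get_type_hints())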
def expand(self, pcoll):
  return (
      pcoll
      | "Start" >> beam.FlatMap(_start_stage, self.specs_by_target)
      | "CreateTasks" >> beam.FlatMapTuple(_copy_tasks)
      # prevent undesirable fusion
      # https://stackoverflow.com/a/54131856/809705
      | "Reshuffle" >> beam.Reshuffle()
      | "CopyChunks" >> beam.MapTuple(_copy_chunk)
      # prepare inputs for the next stage (if any)
      | "Finish" >> beam.Distinct())
def execute_pipeline(pipeline, options):
  messages = (
      pipeline
      | "read messages" >> beam.io.ReadFromPubSub(topic=options.input_topic)
      | "parse to messages" >> beam.ParDo(ParseMessage()))

  sessions = (
      messages
      | "window" >> beam.WindowInto(beam.window.Sessions(15))
      | "add key" >> beam.Map(lambda element: (element.user, element.products))
      | "group by user" >> beam.GroupByKey()
      | "first flatten" >> beam.FlatMapTuple(flat_function)
      | "second flatten" >> beam.FlatMapTuple(flat_function)
  ) | "log 1" >> beam.ParDo(Log())

  products = (
      sessions
      | "add new key" >> beam.Map(
          lambda session: (session[1].id, (session[1], session[0])))
      | "group by product" >> beam.GroupByKey()
  ) | "log 2" >> beam.ParDo(Log())
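# Illustrative only: `flat_function` and `Log` are not defined in the snippet
# above. Assuming the grouped elements are (user, iterable-of-product-lists)
# pairs, a sketch of those two helpers could look like this.
import logging

import apache_beam as beam


def flat_function(key, values):
  """Fan a (key, iterable) pair back out into one (key, value) per element."""
  for value in values:
    yield key, value


class Log(beam.DoFn):
  """Log each element and pass it through unchanged."""

  def process(self, element):
    logging.info("element: %s", element)
    yield element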
def test_flat_map_tuple(self):
  def f(a, b, y=None):
    return a, b, y

  expected = [(1, 2), (3, 4)] | beam.FlatMapTuple(f, y=5)
  actual = [(1, 2), (3, 4)] | threadmap.FlatThreadMapTuple(f, y=5)
  self.assertEqual(expected, actual)

  actual = [(1, 2), (3, 4)] | threadmap.FlatThreadMapTuple(
      f,
      y=5,
      num_threads=None,
  )
  self.assertEqual(expected, actual)
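# Note on the comparison above: piping a plain Python list into a PTransform,
# e.g. [(1, 2), (3, 4)] | beam.FlatMapTuple(f), runs the transform eagerly on
# an ephemeral in-memory pipeline and materializes the output as a list, which
# is what allows the direct assertEqual against the threadmap variants.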
def add_received_ip_tags(
    rows: beam.pvalue.PCollection[Row],
    ips_with_metadata: beam.pvalue.PCollection[Tuple[DateIpKey, Row]]
) -> beam.pvalue.PCollection[Row]:
  """Add tags for answer ips (field received.ip) - asnum, asname, http, cert.

  Args:
    rows: PCollection of measurement rows
    ips_with_metadata: PCollection of dated ips with geo metadata

  Returns:
    PCollection of measurement rows with tag information added to the
    received.ip row
  """
  # PCollection[Tuple[DateIpKey,Row]]
  received_keyed_by_ip_and_date = (
      rows
      | 'key by received ips and dates' >> beam.Map(
          lambda row: (_make_date_received_ip_key(row), row)
      ).with_output_types(Tuple[DateIpKey, Row]))

  # Iterable[PCollection[Tuple[DateIpKey,Row]]]
  partition_by_domain = (
      received_keyed_by_ip_and_date
      | 'partition by domain' >> beam.Partition(_get_domain_partition,
                                                NUM_DOMAIN_PARTITIONS))

  collections = []
  for i in range(0, NUM_DOMAIN_PARTITIONS):
    elements = partition_by_domain[i]
    # PCollection[Tuple[Tuple[date,ip],Dict[input_name_key,List[Row]]]]
    grouped_received_metadata_and_rows = ({
        IP_METADATA_PCOLLECTION_NAME: ips_with_metadata,
        ROWS_PCOLLECION_NAME: elements
    } | f'group by received ip keys {i}' >> beam.CoGroupByKey())

    # PCollection[Row]
    domain_rows_with_tags = (
        grouped_received_metadata_and_rows
        | f'tag received ips {i}' >> beam.FlatMapTuple(
            lambda k, v: merge_metadata_with_rows(
                k, v, field='received')).with_output_types(Row))

    collections.append(domain_rows_with_tags)

  # PCollection[Row]
  rows_with_tags = (
      collections
      | 'merge domain collections' >> beam.Flatten().with_output_types(Row))

  return rows_with_tags
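# Illustrative only: `_get_domain_partition` and NUM_DOMAIN_PARTITIONS are not
# shown above. A partition function for beam.Partition just has to map each
# element to an integer in [0, num_partitions); a sketch that routes rows for
# the same domain to the same partition could look like this. A stable hash
# (crc32) is used rather than Python's hash(), which is salted per process.
import zlib

NUM_DOMAIN_PARTITIONS = 250  # assumed value, for illustration only


def _get_domain_partition_sketch(keyed_row, num_partitions):
  _, row = keyed_row
  return zlib.crc32(row.get('domain', '').encode('utf-8')) % num_partitions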
def run_beam_pipeline(self, scan_type: str, incremental_load: bool,
                      job_name: str, table_name: str,
                      start_date: Optional[datetime.date],
                      end_date: Optional[datetime.date]) -> None:
  """Run a single Apache Beam pipeline to load JSON data into BigQuery.

  Args:
    scan_type: one of 'echo', 'discard', 'http', 'https'
    incremental_load: boolean. If true, only load the latest new data; if
      false, reload all data.
    job_name: string name for this pipeline job.
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read.
      Mostly only used during development.
    end_date: date object, only files at or before this date will be read.
      Mostly only used during development.

  Raises:
    Exception: if any arguments are invalid or the pipeline fails.
  """
  logging.getLogger().setLevel(logging.INFO)
  pipeline_options = self._get_pipeline_options(scan_type, job_name)
  gcs = GCSFileSystem(pipeline_options)

  new_filenames = self._data_to_load(gcs, scan_type, incremental_load,
                                     table_name, start_date, end_date)
  if not new_filenames:
    logging.info('No new files to load incrementally')
    return

  with beam.Pipeline(options=pipeline_options) as p:
    # PCollection[Tuple[filename,line]]
    lines = _read_scan_text(p, new_filenames)

    # PCollection[Row]
    rows = (
        lines
        | 'flatten json' >>
        beam.FlatMapTuple(_flatten_measurement).with_output_types(Row))

    # PCollection[Row]
    rows_with_metadata = self._add_metadata(rows)

    self._write_to_bigquery(rows_with_metadata, table_name, incremental_load)
def flatmap_tuple(test=None):
  # [START flatmap_tuple]
  import apache_beam as beam

  def format_plant(icon, plant):
    if icon:
      yield '{}{}'.format(icon, plant)

  with beam.Pipeline() as pipeline:
    plants = (
        pipeline
        | 'Gardening plants' >> beam.Create([
            ('🍓', 'Strawberry'),
            ('🥕', 'Carrot'),
            ('🍆', 'Eggplant'),
            ('🍅', 'Tomato'),
            ('🥔', 'Potato'),
            (None, 'Invalid'),
        ])
        | 'Format' >> beam.FlatMapTuple(format_plant)
        | beam.Map(print))
    # [END flatmap_tuple]
    if test:
      test(plants)
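# Expected console output from the snippet above: format_plant only yields
# when an icon is present, so the (None, 'Invalid') pair produces nothing.
#
#   🍓Strawberry
#   🥕Carrot
#   🍆Eggplant
#   🍅Tomato
#   🥔Potato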
def CheckAggregation(inputs_and_expected, aggregation):
  # Split the test stream into a branch of to-be-processed elements, and
  # a branch of expected results.
  inputs, expected = (
      inputs_and_expected
      | beam.FlatMapTuple(
          lambda tag, value: [
              beam.pvalue.TaggedOutput(tag, ('key1', value)),
              beam.pvalue.TaggedOutput(tag, ('key2', value)),
          ]).with_outputs('input', 'expect'))

  # Process the inputs with the given windowing to produce actual outputs.
  outputs = (
      inputs
      | beam.MapTuple(
          lambda key, value: TimestampedValue((key, value), value))
      | beam.WindowInto(
          window_fn,
          trigger=trigger_fn,
          accumulation_mode=accumulation_mode,
          timestamp_combiner=timestamp_combiner)
      | aggregation
      | beam.MapTuple(_windowed_value_info_map_fn)
      # Place outputs back into the global window to allow flattening
      # and share a single state in Check.
      | 'Global' >> beam.WindowInto(
          beam.transforms.window.GlobalWindows()))

  # Feed both the expected and actual outputs to Check() for comparison.
  tagged_expected = (
      expected | beam.MapTuple(lambda key, value: (key, ('expect', value))))
  tagged_outputs = (
      outputs | beam.MapTuple(lambda key, value: (key, ('actual', value))))
  # pylint: disable=expression-not-assigned
  ([tagged_expected, tagged_outputs]
   | beam.Flatten()
   | beam.ParDo(Check(self.allow_out_of_order)))
def process_satellite_with_tags(
    row_lines: beam.pvalue.PCollection[Tuple[str, str]],
    tag_lines: beam.pvalue.PCollection[Tuple[str, str]]
) -> beam.pvalue.PCollection[Row]:
  """Process Satellite measurements and tags.

  Args:
    row_lines: (filename, line) tuples of Satellite measurement rows
    tag_lines: (filename, line) tuples of the various Satellite tag files

  Returns:
    PCollection[Row] of rows with tag metadata added
  """
  # PCollection[Row]
  rows = (
      row_lines
      | 'flatten json' >> beam.ParDo(
          flatten.FlattenMeasurement()).with_output_types(Row))

  # PCollection[Row]
  tag_rows = (
      tag_lines
      | 'tag rows' >>
      beam.FlatMapTuple(_read_satellite_tags).with_output_types(Row))

  # PCollection[Row]
  rows_with_metadata = _add_satellite_tags(rows, tag_rows)

  return rows_with_metadata
def transform_data(train_data_file, test_data_file, working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data using the CSV reader, and transform it using a
  preprocessing pipeline that scales numeric data and converts categorical
  data from strings to int64 value indices, by creating a vocabulary for each
  category.

  Args:
    train_data_file: File containing training data
    test_data_file: File containing test data
    working_dir: Directory to write transformed data and metadata to
  """

  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
      outputs[key] = tft.scale_to_0_1(inputs[key])

    for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
      # This is a SparseTensor because it is optional. Here we fill in a
      # default value when it is missing.
      sparse = tf.sparse.SparseTensor(inputs[key].indices, inputs[key].values,
                                      [inputs[key].dense_shape[0], 1])
      dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
      # Reshaping from a batch of vectors of size 1 to a batch of scalars.
      dense = tf.squeeze(dense, axis=1)
      outputs[key] = tft.scale_to_0_1(dense)

    # For all categorical columns except the label column, we generate a
    # vocabulary, and convert the string feature to a one-hot encoding.
    for key in CATEGORICAL_FEATURE_KEYS:
      integerized = tft.compute_and_apply_vocabulary(
          tf.strings.strip(inputs[key]),
          num_oov_buckets=NUM_OOV_BUCKETS,
          vocab_filename=key)
      depth = (
          tft.experimental.get_vocabulary_size_by_name(key) + NUM_OOV_BUCKETS)
      one_hot_encoded = tf.one_hot(
          integerized,
          depth=tf.cast(depth, tf.int32),
          on_value=1.0,
          off_value=0.0)
      # This output is now one-hot encoded. If saving transformed data to
      # disk, this can incur significant memory cost.
      outputs[key] = tf.reshape(one_hot_encoded, [-1, depth])

    # For the label column we provide the mapping from string to index.
    table_keys = ['>50K', '<=50K']
    with tf.init_scope():
      initializer = tf.lookup.KeyValueTensorInitializer(
          keys=table_keys,
          values=tf.cast(tf.range(len(table_keys)), tf.int64),
          key_dtype=tf.string,
          value_dtype=tf.int64)
      table = tf.lookup.StaticHashTable(initializer, default_value=-1)
    # Remove trailing periods for test data when the data is read with
    # tf.data.
    label_str = tf.strings.regex_replace(inputs[LABEL_KEY], r'\.', '')
    label_str = tf.strings.strip(label_str)
    data_labels = table.lookup(label_str)
    transformed_label = tf.one_hot(
        indices=data_labels,
        depth=len(table_keys),
        on_value=1.0,
        off_value=0.0)
    outputs[LABEL_KEY] = tf.reshape(transformed_label, [-1, len(table_keys)])

    return outputs

  # The "with" block will create a pipeline, and run that pipeline at the exit
  # of the block.
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Create a TFXIO to read the census data with the schema. To do this we
      # need to list all columns in order since the schema doesn't specify the
      # order of columns in the csv.
      # We first read CSV files and use BeamRecordCsvTFXIO whose .BeamSource()
      # accepts a PCollection[bytes] because we need to patch the records
      # first (see "FixCommasTrainData" below). Otherwise, tfxio.CsvTFXIO can
      # be used to both read the CSV files and parse them to TFT inputs:
      # csv_tfxio = tfxio.CsvTFXIO(...)
      # raw_data = (pipeline | 'ToRecordBatches' >> csv_tfxio.BeamSource())
      csv_tfxio = tfxio.BeamRecordCsvTFXIO(
          physical_format='text',
          column_names=ORDERED_CSV_COLUMNS,
          schema=_SCHEMA)

      # Read in raw data and convert using CSV TFXIO. Note that we apply
      # some Beam transformations here, which will not be encoded in the TF
      # graph since we don't do them from within tf.Transform's methods
      # (AnalyzeDataset, TransformDataset etc.). These transformations are
      # just to get data into a format that the CSV TFXIO can read, in
      # particular removing spaces after commas.
      raw_data = (
          pipeline
          | 'ReadTrainData' >> beam.io.ReadFromText(
              train_data_file, coder=beam.coders.BytesCoder())
          | 'FixCommasTrainData' >>
          beam.Map(lambda line: line.replace(b', ', b','))
          | 'DecodeTrainData' >> csv_tfxio.BeamSource())

      # Combine data and schema into a dataset tuple. Note that we already
      # used the schema to read the CSV data, but we also need it to interpret
      # raw_data.
      raw_dataset = (raw_data, csv_tfxio.TensorAdapterConfig())

      # The TFXIO output format is chosen for improved performance.
      transformed_dataset, transform_fn = (
          raw_dataset | tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn, output_record_batches=True))

      # Transformed metadata is not necessary for encoding.
      transformed_data, _ = transformed_dataset

      # Extract transformed RecordBatches, encode and write them to the given
      # directory.
      coder = RecordBatchToExamplesEncoder()
      _ = (
          transformed_data
          | 'EncodeTrainData' >> beam.FlatMapTuple(
              lambda batch, _: coder.encode(batch))
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      # Now apply transform function to test data. In this case we remove the
      # trailing period at the end of each line, and also ignore the header
      # line that is present in the test data file.
      raw_test_data = (
          pipeline
          | 'ReadTestData' >> beam.io.ReadFromText(
              test_data_file,
              skip_header_lines=1,
              coder=beam.coders.BytesCoder())
          | 'FixCommasTestData' >>
          beam.Map(lambda line: line.replace(b', ', b','))
          | 'RemoveTrailingPeriodsTestData' >> beam.Map(lambda line: line[:-1])
          | 'DecodeTestData' >> csv_tfxio.BeamSource())

      raw_test_dataset = (raw_test_data, csv_tfxio.TensorAdapterConfig())

      # The TFXIO output format is chosen for improved performance.
      transformed_test_dataset = (
          (raw_test_dataset, transform_fn)
          | tft_beam.TransformDataset(output_record_batches=True))

      # Transformed metadata is not necessary for encoding.
      transformed_test_data, _ = transformed_test_dataset

      # Extract transformed RecordBatches, encode and write them to the given
      # directory.
      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.FlatMapTuple(
              lambda batch, _: coder.encode(batch))
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to working_dir, which can then
      # be read by the tft.TFTransformOutput class.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def run(argv=None, save_main_session=True):
    """ Main entry point; defines and runs the kuill pipeline. """
    parser = argparse.ArgumentParser()
    sp = parser.add_subparsers()

    get_characters_parser = sp.add_parser("get_characters")
    get_characters_parser.add_argument("--output", default="characters.csv")
    get_characters_parser.set_defaults(command=get_characters_command)

    get_species_parser = sp.add_parser("get_species")
    get_species_parser.add_argument("--output", default="species.csv")
    get_species_parser.set_defaults(command=get_species_command)

    pipeline_parser = sp.add_parser("pipeline")
    pipeline_parser.add_argument(
        '--characters', type=str, required=True, help='Path to an input file.')
    pipeline_parser.add_argument(
        '--species', type=str, required=True, help='Path to an input file.')
    pipeline_parser.add_argument(
        '--output', type=str, required=True, help='Path to the output file(s).')
    pipeline_parser.set_defaults(command=None)

    args, pipeline_args = parser.parse_known_args(argv)

    if args.command:
        # handle get_data_command
        args.command(args)
        return

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = save_main_session

    def join_data(k, v):
        logging.debug("join_data: %s", (k, v))
        import itertools
        return itertools.product(v['characters'], v['species'])

    def merge_data(row):
        logging.debug("merge_data: %s", row)
        character_data, species_data = row
        character_data.update(species_data)
        del character_data['species_id']
        return character_data

    def by_all_appearances(row):
        logging.debug("by_appearances: %s", row)
        return (row["appearances"], row)

    def appearance_key_for_element(element):
        """
        element is a dictionary of character data
        return the value of the "appearances" key
        """
        return element["appearances"]

    def height_key_for_element(element):
        """
        element is a dictionary of character data
        return the value of the "height" key
        """
        return element["height"]

    def format_csv(data):
        lines = [
            "{height},{appearances},{name},{species}".format(**row)
            for row in data
        ]
        return "\n".join(lines)

    def resplit_data(data):
        """
        data is a list with one element, a list of rows?
        """
        logging.debug("resplit_data: %s", data)
        for row in data:
            yield row

    with beam.Pipeline(options=options) as p:
        char_inputs = (
            p | 'ReadCharInputText' >> beam.io.ReadFromText(
                args.characters, skip_header_lines=1))
        spec_inputs = (
            p | 'ReadSpecInputText' >> beam.io.ReadFromText(
                args.species, skip_header_lines=1))

        characters = (
            char_inputs
            | "parse chars" >> beam.ParDo(ParseCharacterFn())
            | "key_char" >> beam.Map(lambda c: (c["species_id"], c)))

        species = (
            spec_inputs
            | beam.ParDo(ParseSpeciesFn())
            | "key_spec" >> beam.Map(lambda s: (s["species_id"], s)))

        joined = (
            {"characters": characters, "species": species}
            | beam.CoGroupByKey()
            | beam.FlatMapTuple(join_data))

        merged = (joined | beam.Map(merge_data))

        top = (
            merged
            | "top by appearances" >> beam.combiners.Top.Of(
                10, key=appearance_key_for_element)
            | "re-split for height" >> beam.FlatMap(resplit_data)
            | "top by height" >> beam.combiners.Top.Of(
                10, key=height_key_for_element))

        output = (  # noqa
            top
            | "format_csv" >> beam.Map(format_csv)
            | 'WriteCharacterData' >> beam.io.WriteToText(args.output))
def expand(self, pcoll):
  return pcoll | beam.FlatMapTuple(self._split_chunks)
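# Illustrative only: `_split_chunks` is not shown above. FlatMapTuple unpacks
# each (key, value) element into positional arguments, so any callable that
# accepts them and yields zero or more results will work. A sketch that splits
# a (key, sequence) pair into fixed-size chunks (chunk_size is an assumed
# parameter, not part of the original transform) might look like:
def _split_chunks_sketch(key, values, chunk_size=100):
  """Yield (key, chunk) pairs with at most chunk_size items per chunk."""
  values = list(values)
  for start in range(0, len(values), chunk_size):
    yield key, values[start:start + chunk_size]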
def expand(self, pcoll):
  return (
      pcoll
      | beam.Create(list(self.pattern.items()))
      | beam.FlatMapTuple(self._open_chunks))
def expand(self, lifts: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Takes top k and bottom k x values (sorted by lift) per slice and y value.

  Args:
    lifts: A PCollection of tuples of the form: (
      _SlicedFeatureKey(slice_key, x_path),
      _LiftInfo(x, y, lift, xy_count, x_count, y_count)).

  Returns:
    A PCollection resulting from a group by with the keys of the form
    (slice_key, x_path) and a stream of values of the form
    (y, y_count, [(x, lift, xy_count, x_count)]), in which the stream of
    values has been limited to the top k and bottom k elements per key.
  """

  def move_y_info_to_key(key, value):
    slice_key, x_path = key
    return (_LiftSeriesKey(
        slice_key=slice_key, x_path=x_path, y=value.y, y_count=value.y_count),
            _LiftValue(
                x=value.x,
                lift=value.lift,
                xy_count=value.xy_count,
                x_count=value.x_count))

  # Push y_* into key so that we get per-slice, per-x-path, per-y top and
  # bottom k when calling {Largest,Smallest}PerKey.
  # (_LiftSequenceKey(slice, x_path, y, y_count),
  #  _LiftValue(x, lift, xy_count, x_count))
  lifts = lifts | 'MoveYToKey' >> beam.MapTuple(move_y_info_to_key)

  top_key = operator.attrgetter('lift', 'x')
  if self._top_k_per_y:
    # (_LiftSequenceKey(slice, x_path, y, y_count),
    #  [_LiftValue(x, lift, xy_count, x_count)])
    top_k = (
        lifts
        | 'TopK' >> beam.transforms.combiners.Top.PerKey(
            n=self._top_k_per_y, key=top_key))
  if self._bottom_k_per_y:
    # (_LiftSequenceKey(slice, x_path, y, y_count),
    #  [_LiftValue(x, lift, xy_count, x_count)])
    bottom_k = (
        lifts
        | 'BottomK' >> beam.transforms.combiners.Top.PerKey(
            n=self._bottom_k_per_y, reverse=True, key=top_key))

  if self._top_k_per_y and self._bottom_k_per_y:
    # (_LiftSeriesKey(slice, x_path, y, y_count),
    #  [_LiftValue(x, lift, xy_count, x_count)])
    grouped_lifts = (
        (top_k, bottom_k)
        | 'MergeTopAndBottom' >> beam.Flatten()
        | 'FlattenTopAndBottomLifts' >>
        beam.FlatMapTuple(lambda k, vs: ((k, v) for v in vs))
        | 'ReGroupTopAndBottom' >> beam.GroupByKey())
  elif self._top_k_per_y:
    grouped_lifts = top_k
  elif self._bottom_k_per_y:
    grouped_lifts = bottom_k
  else:
    grouped_lifts = lifts | 'GroupByYs' >> beam.GroupByKey()

  def move_y_info_to_value(key, lift_values):
    return (_SlicedFeatureKey(key.slice_key, key.x_path),
            _LiftSeries(
                y=key.y, y_count=key.y_count, lift_values=lift_values))

  # (_SlicedFeatureKey(slice, x_path),
  #  _LiftSeries(y, y_count, [_LiftValue(x, lift, xy_count, x_count)]))
  return (
      grouped_lifts
      | 'MoveYInfoToValue' >> beam.MapTuple(move_y_info_to_value))
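# Illustrative only: the 'FlattenTopAndBottomLifts' step above uses a common
# idiom, re-emitting one (key, element) pair per element of a grouped value so
# that the merged top-k and bottom-k results can be regrouped under one key.
# A self-contained sketch of that idiom:
import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | beam.Create([('a', [1, 2]), ('b', [3])])
      | 'FanOut' >> beam.FlatMapTuple(lambda k, vs: ((k, v) for v in vs))
      | beam.Map(print))  # ('a', 1), ('a', 2), ('b', 3), in some order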
# pipeline2.py: Format each subject with its grade from a PCollection
import apache_beam as beam


def my_format(sub, marks):
  yield '{}\t{}'.format(sub, marks)


with beam.Pipeline() as pipeline:
  plants = (
      pipeline
      | 'Subjects' >> beam.Create([
          ('English', 'A'),
          ('Maths', 'B+'),
          ('Science', 'A-'),
          ('French', 'A'),
          ('Arts', 'A+'),
      ])
      | 'Format subjects with marks' >> beam.FlatMapTuple(my_format)
      | beam.Map(print))
def post_processing_satellite(
    rows: beam.pvalue.PCollection[Row]) -> beam.pvalue.PCollection[Row]:
  """Run post processing on Satellite v1 data (calculate confidence, verify interference).

  Args:
    rows: PCollection of measurement rows

  Returns:
    PCollection of measurement rows with confidence and verify fields
  """

  def _total_tags(key: Tuple[str, str],
                  row: Row) -> Tuple[Tuple[str, str], int]:
    total_tags = 0
    for tag_type in flatten_satellite.SATELLITE_TAGS:
      if tag_type != 'ip':
        type_tags = {
            ans[tag_type] for ans in row['received'] if ans.get(tag_type)
        }
        total_tags += len(type_tags)
    return (key, total_tags)

  def _flat_rows_controls(
      key: Any,  # pylint: disable=unused-argument
      value: Row) -> Iterator[Tuple[Row, int]]:
    num_control_tags = 0
    if len(value['control']) > 0:
      num_control_tags = value['control'][0]
    for row in value['test']:
      yield (row, num_control_tags)

  # Partition rows into test measurements and control measurements.
  # 'anomaly' is None for control measurements.
  # PCollection[Tuple[Tuple[str, str], Row]], PCollection[Tuple[Tuple[str, str], Row]]
  rows, controls = (
      rows
      | 'key by dates and domains' >> beam.Map(
          lambda row: ((row['date'], row['domain']), row))
      | 'partition test and control' >> beam.Partition(
          lambda row, p: int(row[1]['anomaly'] is None), 2))

  # PCollection[Tuple[Tuple[str, str], int]]
  num_ctags = (
      controls
      | 'calculate # control tags' >> beam.MapTuple(_total_tags))

  # PCollection[Row]
  post = (
      {
          'test': rows,
          'control': num_ctags
      }
      | 'group rows and # control tags by keys' >> beam.CoGroupByKey()
      | 'flatmap to (row, # control tags)' >>
      beam.FlatMapTuple(_flat_rows_controls)
      | 'calculate confidence' >> beam.MapTuple(_calculate_confidence)
      | 'verify interference' >> beam.Map(_verify).with_output_types(Row))

  # PCollection[Row]
  # pylint: disable=no-value-for-parameter
  controls = (
      controls
      | 'unkey control' >> beam.Values().with_output_types(Row))

  # PCollection[Row]
  post = ((post, controls) | 'flatten test and control' >> beam.Flatten())

  return post
def run(self):
    """Returns a PCollection of audit errors aggregated from all models.

    Returns:
        PCollection. A PCollection of audit errors discovered during the audit.

    Raises:
        ValueError. When the `datastoreio` option, which provides the
            PTransforms for performing datastore IO operations, is None.
    """
    existing_models, deleted_models = (
        self.pipeline
        | 'Get all models' >> ndb_io.GetModels(
            datastore_services.query_everything(), self.datastoreio_stub)
        | 'Partition by model.deleted' >> (
            beam.Partition(lambda model, _: int(model.deleted), 2))
    )

    models_of_kind_by_index = (
        existing_models
        # NOTE: Partition returns a statically-sized list of PCollections.
        # Creating partitions is wasteful when there are fewer items than
        # there are partitions, like in our unit tests. In exchange, in
        # production the job will be able to take advantage of the high
        # parallelizability of PCollections, which are designed for enormous
        # datasets and parallel processing.
        #
        # Alternatively, we could have used GroupBy. However, that returns
        # an _iterable_ of items rather than a PCollection, and so it is
        # vulnerable to out-of-memory errors.
        #
        # Since this job is concerned with running audits on EVERY MODEL IN
        # STORAGE, Partition is the clear winner regardless of the overhead
        # we'll see in unit tests.
        | 'Split models into parallelizable PCollections' >> beam.Partition(
            lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
            # NOTE: Partition requires a hard-coded number of slices; it
            # cannot be used with dynamic numbers generated in a pipeline.
            # KIND_BY_INDEX is a constant tuple so that requirement is
            # satisfied in this case.
            len(KIND_BY_INDEX), KIND_BY_INDEX)
    )

    existing_key_count_pcolls = []
    missing_key_error_pcolls = []
    audit_error_pcolls = [
        deleted_models
        | 'Apply ValidateDeletedModel on deleted models' >> (
            beam.ParDo(base_validation.ValidateDeletedModel()))
    ]

    model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
    for kind, models_of_kind in model_groups:
        audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

        if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES:
            existing_key_count_pcolls.append(
                models_of_kind | GetExistingModelKeyCounts(kind))

        if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR:
            missing_key_error_pcolls.extend(
                models_of_kind | GetMissingModelKeyErrors(kind))

    existing_key_counts = (
        existing_key_count_pcolls
        | 'Flatten PCollections of existing key counts' >> beam.Flatten()
    )
    missing_key_errors = (
        missing_key_error_pcolls
        | 'Flatten PCollections of missing key errors' >> beam.Flatten()
    )
    audit_error_pcolls.append(
        (existing_key_counts, missing_key_errors)
        | 'Group counts and errors by key' >> beam.CoGroupByKey()
        | 'Filter keys without any errors' >> (
            beam.FlatMapTuple(self._get_model_relationship_errors))
    )

    return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
def transform_data(working_dir):
  """Transform the data and write out as a TFRecord of Example protos.

  Read in the data from the positive and negative examples on disk, and
  transform it using a preprocessing pipeline that removes punctuation,
  tokenizes and maps tokens to int64 value indices.

  Args:
    working_dir: Directory to read shuffled data from and write transformed
      data and metadata to.
  """
  with beam.Pipeline() as pipeline:
    with tft_beam.Context(
        temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):
      tfxio_train_data = tfxio.TFExampleRecord(
          file_pattern=os.path.join(
              working_dir, SHUFFLED_TRAIN_DATA_FILEBASE + '*'),
          schema=SCHEMA)
      train_data = (
          pipeline | 'TFXIORead[Train]' >> tfxio_train_data.BeamSource())

      tfxio_test_data = tfxio.TFExampleRecord(
          file_pattern=os.path.join(
              working_dir, SHUFFLED_TEST_DATA_FILEBASE + '*'),
          schema=SCHEMA)
      test_data = (
          pipeline | 'TFXIORead[Test]' >> tfxio_test_data.BeamSource())

      def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns."""
        review = inputs[REVIEW_KEY]

        # Here tf.compat.v1.string_split behaves differently from
        # tf.strings.split.
        review_tokens = tf.compat.v1.string_split(review, DELIMITERS)
        review_indices = tft.compute_and_apply_vocabulary(
            review_tokens, top_k=VOCAB_SIZE)
        # Add one for the oov bucket created by compute_and_apply_vocabulary.
        review_bow_indices, review_weight = tft.tfidf(review_indices,
                                                      VOCAB_SIZE + 1)
        return {
            REVIEW_KEY: review_bow_indices,
            REVIEW_WEIGHT_KEY: review_weight,
            LABEL_KEY: inputs[LABEL_KEY]
        }

      # Transformed metadata is not necessary for encoding.
      # The TFXIO output format is chosen for improved performance.
      (transformed_train_data, _), transform_fn = (
          (train_data, tfxio_train_data.TensorAdapterConfig())
          | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn, output_record_batches=True))

      transformed_test_data, _ = (
          ((test_data, tfxio_test_data.TensorAdapterConfig()), transform_fn)
          | 'Transform' >> tft_beam.TransformDataset(
              output_record_batches=True))

      # Extract transformed RecordBatches, encode and write them to the given
      # directory.
      coder = tfxio.RecordBatchToExamplesEncoder()
      _ = (
          transformed_train_data
          | 'EncodeTrainData' >> beam.FlatMapTuple(
              lambda batch, _: coder.encode(batch))
          | 'WriteTrainData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TRAIN_DATA_FILEBASE)))

      _ = (
          transformed_test_data
          | 'EncodeTestData' >> beam.FlatMapTuple(
              lambda batch, _: coder.encode(batch))
          | 'WriteTestData' >> beam.io.WriteToTFRecord(
              os.path.join(working_dir, TRANSFORMED_TEST_DATA_FILEBASE)))

      # Will write a SavedModel and metadata to two subdirectories of
      # working_dir, given by tft.TRANSFORM_FN_DIR and
      # tft.TRANSFORMED_METADATA_DIR respectively.
      _ = (
          transform_fn
          | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of results from the skill migration.

    Returns:
        PCollection. A PCollection of results from the skill migration.
    """
    unmigrated_skill_models = (
        self.pipeline
        | 'Get all non-deleted skill models' >> (
            ndb_io.GetModels(skill_models.SkillModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add skill model ID' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
            lambda skill_model: skill_model.id))
    skill_summary_models = (
        self.pipeline
        | 'Get all non-deleted skill summary models' >> (
            ndb_io.GetModels(skill_models.SkillSummaryModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add skill summary ID' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
            lambda skill_summary_model: skill_summary_model.id))

    migrated_skill_results = (
        unmigrated_skill_models
        | 'Transform and migrate model' >> beam.MapTuple(self._migrate_skill))
    migrated_skills = (
        migrated_skill_results
        | 'Filter oks' >> beam.Filter(
            lambda result_item: result_item.is_ok())
        | 'Unwrap ok' >> beam.Map(
            lambda result_item: result_item.unwrap()))
    migrated_skill_job_run_results = (
        migrated_skill_results
        | 'Generate results for migration' >> (
            job_result_transforms.ResultsToJobRunResults('SKILL PROCESSED')))

    skill_changes = (
        unmigrated_skill_models
        | 'Generate skill changes' >> beam.FlatMapTuple(
            self._generate_skill_changes))

    skill_objects_list = (
        {
            'skill_model': unmigrated_skill_models,
            'skill_summary_model': skill_summary_models,
            'skill': migrated_skills,
            'skill_changes': skill_changes
        }
        | 'Merge objects' >> beam.CoGroupByKey()
        | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Remove unmigrated skills' >> beam.Filter(
            lambda x: len(x['skill_changes']) > 0 and len(x['skill']) > 0)
        | 'Reorganize the skill objects' >> beam.Map(lambda objects: {
            'skill_model': objects['skill_model'][0],
            'skill_summary_model': objects['skill_summary_model'][0],
            'skill': objects['skill'][0],
            'skill_changes': objects['skill_changes']
        }))

    skill_objects_list_job_run_results = (
        skill_objects_list
        | 'Transform skill objects into job run results' >> (
            job_result_transforms.CountObjectsToJobRunResult(
                'SKILL MIGRATED')))

    cache_deletion_job_run_results = (
        skill_objects_list
        | 'Delete skill from cache' >> beam.Map(
            lambda skill_object: self._delete_skill_from_cache(
                skill_object['skill']))
        | 'Generate results for cache deletion' >> (
            job_result_transforms.ResultsToJobRunResults('CACHE DELETION')))

    skill_models_to_put = (
        skill_objects_list
        | 'Generate skill models to put' >> beam.FlatMap(
            lambda skill_objects: self._update_skill(
                skill_objects['skill_model'],
                skill_objects['skill'],
                skill_objects['skill_changes'],
            )))

    skill_summary_models_to_put = (
        skill_objects_list
        | 'Generate skill summary models to put' >> beam.Map(
            lambda skill_objects: self._update_skill_summary(
                skill_objects['skill'],
                skill_objects['skill_summary_model'])))

    unused_put_results = (
        (skill_models_to_put, skill_summary_models_to_put)
        | 'Merge models' >> beam.Flatten()
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        (
            cache_deletion_job_run_results,
            migrated_skill_job_run_results,
            skill_objects_list_job_run_results
        )
        | beam.Flatten())
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of results from the story migration.

    Returns:
        PCollection. A PCollection of results from the story migration.
    """
    unmigrated_story_models = (
        self.pipeline
        | 'Get all non-deleted story models' >> (
            ndb_io.GetModels(story_models.StoryModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add story keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
            lambda story_model: story_model.id)
    )
    story_summary_models = (
        self.pipeline
        | 'Get all non-deleted story summary models' >> (
            ndb_io.GetModels(story_models.StorySummaryModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add story summary keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
            lambda story_summary_model: story_summary_model.id)
    )
    topics = (
        self.pipeline
        | 'Get all non-deleted topic models' >> (
            ndb_io.GetModels(topic_models.TopicModel.get_all()))
        | 'Transform model into domain object' >> beam.Map(
            topic_fetchers.get_topic_from_model)
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add topic keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
            lambda topic: topic.id)
    )
    topic_id_to_topic = beam.pvalue.AsDict(topics)

    migrated_story_results = (
        unmigrated_story_models
        | 'Transform and migrate model' >> beam.MapTuple(
            self._migrate_story, topic_id_to_topic=topic_id_to_topic)
    )
    migrated_stories = (
        migrated_story_results
        | 'Filter oks' >> beam.Filter(
            lambda result_item: result_item.is_ok())
        | 'Unwrap ok' >> beam.Map(
            lambda result_item: result_item.unwrap())
    )
    migrated_story_job_run_results = (
        migrated_story_results
        | 'Generate results for migration' >> (
            job_result_transforms.ResultsToJobRunResults('STORY PROCESSED'))
    )

    story_changes = (
        unmigrated_story_models
        | 'Generate story changes' >> beam.FlatMapTuple(
            self._generate_story_changes)
    )

    story_objects_list = (
        {
            'story_model': unmigrated_story_models,
            'story_summary_model': story_summary_models,
            'story': migrated_stories,
            'story_change': story_changes
        }
        | 'Merge objects' >> beam.CoGroupByKey()
        | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Remove unmigrated stories' >> beam.Filter(
            lambda x: len(x['story_change']) > 0 and len(x['story']) > 0)
        | 'Reorganize the story objects' >> beam.Map(lambda objects: {
            'story_model': objects['story_model'][0],
            'story_summary_model': objects['story_summary_model'][0],
            'story': objects['story'][0],
            'story_change': objects['story_change'][0]
        })
    )

    story_objects_list_job_run_results = (
        story_objects_list
        | 'Transform story objects into job run results' >> (
            job_result_transforms.CountObjectsToJobRunResult(
                'STORY MIGRATED'))
    )

    cache_deletion_job_run_results = (
        story_objects_list
        | 'Delete story from cache' >> beam.Map(
            lambda story_objects: self._delete_story_from_cache(
                story_objects['story']))
        | 'Generate results for cache deletion' >> (
            job_result_transforms.ResultsToJobRunResults('CACHE DELETION'))
    )

    story_models_to_put = (
        story_objects_list
        | 'Generate story models to put' >> beam.FlatMap(
            lambda story_objects: self._update_story(
                story_objects['story_model'],
                story_objects['story'],
                story_objects['story_change'],
            ))
    )
    story_summary_models_to_put = (
        story_objects_list
        | 'Generate story summary models to put' >> beam.Map(
            lambda story_objects: self._update_story_summary(
                story_objects['story'],
                story_objects['story_summary_model']))
    )

    unused_put_results = (
        (story_models_to_put, story_summary_models_to_put)
        | 'Merge models' >> beam.Flatten()
        | 'Put models into the datastore' >> ndb_io.PutModels()
    )

    return (
        (
            cache_deletion_job_run_results,
            migrated_story_job_run_results,
            story_objects_list_job_run_results
        )
        | beam.Flatten()
    )