def expand(
    self, results: beam.PCollection[job_run_result.JobRunResult]
) -> beam.pvalue.PDone:
    """Writes the given job results to the NDB datastore.

    This overrides expand from the parent class.

    Args:
        results: PCollection. The job run results to be written; may
            contain just one result.

    Returns:
        PCollection. An empty PCollection.
    """
    return (
        results
        # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
        # different signature than the one it's defined with.
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        # GroupIntoBatches() requires (key, value) pairs as input, so we
        # give everything None keys and then immediately discard them.
        | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
        | beam.Values()  # pylint: disable=no-value-for-parameter
        | beam.FlatMap(job_run_result.JobRunResult.accumulate)
        | beam.Map(
            self.create_beam_job_run_result_model,
            results.pipeline.options.namespace)
        | ndb_io.PutModels())
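For reference, a minimal self-contained sketch of the same None-key batching pattern, using only stock Beam transforms (the pipeline, batch size, and element values below are hypothetical and not part of the job above):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create(['result1', 'result2', 'result3', 'result4', 'result5'])
        # GroupIntoBatches() only accepts (key, value) pairs, so attach a
        # throwaway None key to every element first.
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        | beam.GroupIntoBatches(2)
        # Drop the None keys again, leaving one list per batch.
        | beam.Values()
        | beam.Map(print))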
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    collection_pairs = (
        self.pipeline
        | 'Get collection models' >> ndb_io.GetModels(
            collection_models.CollectionRightsModel.get_all())
        | 'Flatten owner_ids and format' >> beam.FlatMap(
            self._extract_user_and_collection_ids))

    user_pairs = (
        self.pipeline
        | 'Get all user settings models' >> ndb_io.GetModels(
            user_models.UserSettingsModel.get_all())
        | 'Extract id and email' >> beam.Map(
            lambda user_setting: (user_setting.id, user_setting.email)))

    collection_ids_to_email_mapping = (
        (collection_pairs, user_pairs)
        | 'Group by user_id' >> beam.CoGroupByKey()
        | 'Drop user id' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Filter out results without any collection' >> beam.Filter(
            lambda collection_ids_and_email: len(
                collection_ids_and_email[0]) > 0))

    return (
        collection_ids_to_email_mapping
        | 'Get final result' >> beam.MapTuple(
            lambda collection, email: job_run_result.JobRunResult.as_stdout(
                'collection_ids: %s, email: %s' % (collection, email))))
def expand(
    self, pcollections: Tuple[beam.pvalue.PCollection, beam.pvalue.PCollection]
) -> Tuple[beam.pvalue.PCollection, beam.pvalue.PCollection]:
    training_examples, serving_examples = pcollections

    keyed_training_examples = (
        training_examples
        | "ExtractTrainingIdentifiers" >> beam.ParDo(
            _ExtractIdentifiers(self._identifier_features,
                                self._float_round_ndigits)))
    keyed_serving_examples = (
        serving_examples
        | "ExtractServingIdentifiers" >> beam.ParDo(
            _ExtractIdentifiers(self._identifier_features,
                                self._float_round_ndigits)))

    results = (
        {
            "training": keyed_training_examples,
            "serving": keyed_serving_examples
        }
        | "JoinExamples" >> beam.CoGroupByKey()
        | "ComputeSkew" >> beam.ParDo(
            _ComputeSkew(self._features_to_ignore, self._float_round_ndigits,
                         self._allow_duplicate_identifiers)).with_outputs(
                             "skew_results", "skew_pairs"))

    skew_results = (
        results.skew_results
        | "MergeSkewResultsPerFeature" >>  # pytype: disable=attribute-error
        beam.CombinePerKey(_merge_feature_skew_results)
        | "DropKeys" >> beam.Values())

    skew_pairs = (
        results.skew_pairs
        | "SampleSkewPairs" >>  # pytype: disable=attribute-error
        beam.combiners.Sample.FixedSizeGlobally(self._sample_size)
        # Sampling results in a pcollection with a single element consisting
        # of a list of the samples. Convert this to a pcollection of samples.
        | "Flatten" >> beam.FlatMap(lambda x: x))

    return skew_results, skew_pairs
def write_from_pcollection(self, pcoll_examples):
    import apache_beam as beam

    # Create some metadata that will be used in .finalize().
    num_examples = (
        pcoll_examples
        | "Add metadata key" >> beam.Map(lambda v: ("num_examples", v))
        | "Count" >> beam.CombinePerKey(
            beam.transforms.combiners.CountCombineFn()))

    def save_metadata(metadata_items):
        with open(self._path + ".json", "w") as metadata_file:
            json.dump(metadata_items, metadata_file)

    # Save metadata.
    _ = (
        (num_examples, )
        | "Merge pcollections" >> beam.Flatten()
        | "Create Dict" >> beam.transforms.combiners.ToDict()
        | "Save metadata" >> beam.ParDo(save_metadata))

    # Save dataset.
    return (
        pcoll_examples
        | "Get values" >> beam.Values()
        | "Save to parquet" >> beam.io.parquetio.WriteToParquet(
            self._path, self._schema, num_shards=1, shard_name_template=""))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
    generating ExplorationOpportunitySummaryModel.

    Returns:
        PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
        generating ExplorationOpportunitySummaryModel.
    """
    topics = (
        self.pipeline
        | 'Get all non-deleted topic models' >> (ndb_io.GetModels(
            topic_models.TopicModel.get_all(include_deleted=False)))
        | 'Get topic from model' >> beam.Map(
            topic_fetchers.get_topic_from_model))

    story_ids_to_story = (
        self.pipeline
        | 'Get all non-deleted story models' >> ndb_io.GetModels(
            story_models.StoryModel.get_all(include_deleted=False))
        | 'Get story from model' >> beam.Map(
            story_fetchers.get_story_from_model)
        | 'Combine stories and ids' >> beam.Map(
            lambda story: (story.id, story)))

    exp_ids_to_exp = (
        self.pipeline
        | 'Get all non-deleted exp models' >> ndb_io.GetModels(
            exp_models.ExplorationModel.get_all(include_deleted=False))
        | 'Get exploration from model' >> beam.Map(
            exp_fetchers.get_exploration_from_model)
        | 'Combine exploration and ids' >> beam.Map(
            lambda exp: (exp.id, exp)))

    stories_dict = beam.pvalue.AsDict(story_ids_to_story)
    exps_dict = beam.pvalue.AsDict(exp_ids_to_exp)

    opportunities_results = (
        topics
        | beam.Map(
            self._generate_opportunities_related_to_topic,
            stories_dict=stories_dict,
            exps_dict=exps_dict))

    unused_put_result = (
        opportunities_results
        | 'Filter the results with SUCCESS status' >> beam.Filter(
            lambda result: result.is_ok())
        | 'Fetch the models to be put' >> beam.FlatMap(
            lambda result: result.unwrap())
        | 'Add ID as a key' >> beam.WithKeys(lambda model: model.id)  # pylint: disable=no-value-for-parameter
        | 'Allow only one item per key' >> (
            beam.combiners.Sample.FixedSizePerKey(1))
        | 'Remove the IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Flatten the list of lists of models' >> beam.FlatMap(lambda x: x)
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        opportunities_results
        | 'Count the output' >> (
            job_result_transforms.ResultsToJobRunResults()))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    user_settings_models = (
        self.pipeline
        | 'Get all UserSettingsModels' >> (
            ndb_io.GetModels(user_models.UserSettingsModel.get_all())))

    old_user_stats_models = (
        self.pipeline
        | 'Get all UserStatsModels' >> (
            ndb_io.GetModels(user_models.UserStatsModel.get_all())))

    # Creates UserStatsModels for users that do not have one yet.
    new_user_stats_models = (
        (user_settings_models, old_user_stats_models)
        | 'Merge models' >> beam.Flatten()
        # Returns a PCollection of
        # (model.id, (user_settings_models, user_stats_models)) or
        # (model.id, (user_settings_models,)).
        | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
        # Discards model.id from the PCollection.
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        # Only keep groupings that indicate that
        # the UserStatsModel is missing.
        | 'Filter pairs of models' >> beam.Filter(
            lambda models: (
                len(list(models)) == 1 and
                isinstance(list(models)[0], user_models.UserSettingsModel)))
        # Choosing the first element.
        | 'Transform tuples into models' >> beam.Map(
            lambda models: list(models)[0])
        # Creates the missing UserStatsModels.
        | 'Create new user stat models' >> beam.ParDo(CreateUserStatsModel()))

    unused_put_result = (
        (new_user_stats_models, old_user_stats_models)
        | 'Merge new and old models together' >> beam.Flatten()
        | 'Update the dashboard stats' >> beam.ParDo(
            UpdateWeeklyCreatorStats())
        | 'Put models into the datastore' >> ndb_io.PutModels())

    new_user_stats_job_result = (
        new_user_stats_models
        | 'Count all new models' >> beam.combiners.Count.Globally()
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x)))

    old_user_stats_job_result = (
        old_user_stats_models
        | 'Count all old models' >> beam.combiners.Count.Globally()
        | 'Only create result for old models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for old models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x)))

    return (
        (new_user_stats_job_result, old_user_stats_job_result)
        | 'Merge new and old results together' >> beam.Flatten())
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Generates the translation contribution stats.

    Returns:
        PCollection. A PCollection of 'SUCCESS x' results, where x is
        the number of generated stats.
    """
    suggestions_grouped_by_target = (
        self.pipeline
        | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
            suggestion_models.GeneralSuggestionModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Filter translate suggestions' >> beam.Filter(
            lambda m: (
                m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
        | 'Transform to suggestion domain object' >> beam.Map(
            suggestion_services.get_suggestion_from_model)
        | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))

    exp_opportunities = (
        self.pipeline
        | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
            opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                include_deleted=False))
        # We need to window the models so that CoGroupByKey below
        # works properly.
        | 'Transform to opportunity domain object' >> beam.Map(
            opportunity_services
            .get_exploration_opportunity_summary_from_model)
        | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

    new_user_stats_models = (
        {
            'suggestion': suggestions_grouped_by_target,
            'opportunity': exp_opportunities
        }
        | 'Merge models' >> beam.CoGroupByKey()
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Generate stats' >> beam.ParDo(
            lambda x: self._generate_stats(
                x['suggestion'][0] if len(x['suggestion']) else [],
                list(x['opportunity'][0])[0]
                if len(x['opportunity']) else None))
        | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
        | 'Generate models from stats' >> beam.MapTuple(
            self._generate_translation_contribution_model))

    unused_put_result = (
        new_user_stats_models
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        new_user_stats_models
        | 'Count all new models' >> (
            beam.combiners.Count.Globally().without_defaults())
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(stdout='SUCCESS %s' % x)))
def find_duplicates_fn(inputs: _PCollection) -> _DoOutputsTuple:
    return (
        inputs
        | 'WithSymbolAndDataTypeAsKey' >> beam.Map(
            lambda d: ((d.symbol, d.data_type), d))
        | 'GroupByKey' >> beam.GroupByKey()
        | 'Values' >> beam.Values()
        | 'FindDuplicates' >> beam.FlatMap(find_duplicates).with_outputs(
            'duplicates_with_different_values',
            main='safe_to_delete_duplicate_ids'))
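Presumably find_duplicates yields beam.pvalue.TaggedOutput values for the secondary output; a stripped-down, hypothetical sketch of the with_outputs() pattern (with made-up element routing) looks roughly like this:

import apache_beam as beam


def split_even_odd(n):
    # Route even numbers to the 'evens' output and everything else to main.
    if n % 2 == 0:
        yield beam.pvalue.TaggedOutput('evens', n)
    else:
        yield n


with beam.Pipeline() as pipeline:
    results = (
        pipeline
        | beam.Create([1, 2, 3, 4])
        | beam.FlatMap(split_even_odd).with_outputs('evens', main='odds'))
    _ = results.evens | 'PrintEvens' >> beam.Map(print)
    _ = results.odds | 'PrintOdds' >> beam.Map(print)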
def produce_calc_fn(inputs: PCollection, calc_producer: calc.CalcProducer,
                    inputs_shape: calc.CalcInputs) -> PCollection:
    return (
        inputs
        | 'FilterByInputsShape' >> beam.Filter(
            filter_by_input_shapes, inputs_shape)
        | 'WithSymbolAsKey' >> beam.Map(lambda d: (d.symbol, d))
        | 'GroupByKey' >> beam.GroupByKey()
        | 'Values' >> beam.Values()
        | 'PerformCalc' >> beam.FlatMap(
            perform_calc, inputs_shape, calc_producer))
def expand(self, p):
    avg_score = (
        p
        | beam.Values()
        | beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).as_singleton_view())
    return (
        p
        | 'compute_spammers' >> beam.ParDo(
            FilterUser(self.score_weight), avg_score=avg_score))
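A minimal sketch of the same singleton-view side-input idea, with made-up scores (FilterUser and score_weight from the snippet above are omitted):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    scores = pipeline | beam.Create([('user1', 2.0), ('user2', 10.0)])
    # Reduce all scores to a single mean and expose it as a side input view.
    avg_score = (
        scores
        | beam.Values()
        | beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).as_singleton_view())
    # Each element sees the mean via the keyword side input.
    _ = (
        scores
        | beam.Map(lambda kv, avg: (kv[0], kv[1] > avg), avg=avg_score)
        | beam.Map(print))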
def expand(self, blog_model_pcoll):
    return (
        blog_model_pcoll
        | 'Discard models with empty property value' >> (
            beam.Filter(lambda model: self.get_property_value(model) != ''))
        | 'Generate (%s, model) key value pairs' % self._property_name >> (
            beam.WithKeys(self.get_property_value))  # pylint: disable=no-value-for-parameter
        | 'Group pairs by their %s' % self._property_name >> (
            beam.GroupByKey())
        | 'Discard %s key' % self._property_name >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Discard models with unique %s' % self._property_name >> (
            beam.Filter(lambda models: len(models) > 1)))
def write_proto_outputs(output_file, name, data, proto_message):
    """Write protos to a container."""
    if output_file.endswith((".txtpb.gz", ".txtpb")):
        _ = (
            data
            | "DropKey_%s" % name >> beam.Values()
            | "ToTextProto_%s" % name >> beam.Map(
                _proto_to_text,
                proto_message=proto_message,
            )
            | "WriteTextExamples_%s" % name >> beam.io.WriteToText(output_file))
        return
    elif output_file.endswith(".tfrecord"):
        _ = (
            data
            | "DropKey_%s" % name >> beam.Values()
            | "WriteTFRecordsExamples_%s" % name >> beam.io.WriteToTFRecord(
                file_path_prefix=output_file,
                shard_name_template="",
                coder=beam.coders.ProtoCoder(proto_message)))
        return
    raise ValueError(f"Unsupported output format: {output_file}")
def test_custormized_counters_in_combine_fn(self):
    p = TestPipeline()
    input = (
        p
        | beam.Create([('key1', 'a'), ('key1', 'ab'), ('key1', 'abc'),
                       ('key2', 'uvxy'), ('key2', 'uvxyz')]))

    # The result of concatenating all values regardless of key.
    global_concat = (
        input
        | beam.Values()
        | beam.CombineGlobally(SortedConcatWithCounters()))

    # The (key, concatenated_string) pairs for all keys.
    concat_per_key = (input | beam.CombinePerKey(SortedConcatWithCounters()))

    # Verify the concatenated strings are correct.
    expected_concat_per_key = [('key1', 'aaabbc'), ('key2', 'uuvvxxyyz')]
    assert_that(
        global_concat, equal_to(['aaabbcuuvvxxyyz']), label='global concat')
    assert_that(
        concat_per_key,
        equal_to(expected_concat_per_key),
        label='concat per key')

    result = p.run()
    result.wait_until_finish()

    # Verify the values of metrics are correct.
    word_counter_filter = MetricsFilter().with_name('word_counter')
    query_result = result.metrics().query(word_counter_filter)
    if query_result['counters']:
        word_counter = query_result['counters'][0]
        self.assertEqual(word_counter.result, 5)

    word_lengths_filter = MetricsFilter().with_name('word_lengths')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['counters']:
        word_lengths = query_result['counters'][0]
        self.assertEqual(word_lengths.result, 15)

    word_len_dist_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_len_dist_filter)
    if query_result['distributions']:
        word_len_dist = query_result['distributions'][0]
        self.assertEqual(word_len_dist.result.mean, 3)

    last_word_len_filter = MetricsFilter().with_name('last_word_len')
    query_result = result.metrics().query(last_word_len_filter)
    if query_result['gauges']:
        last_word_len = query_result['gauges'][0]
        self.assertIn(last_word_len.result.value, [1, 2, 3, 4, 5])
def downsample(pcoll, n, random_seed):
    """Deterministic PCollection downsampling using sha256."""
    return (
        pcoll
        | "HashKey" >> beam.Map(
            lambda x: (str(hash_key(x[0], random_seed)), x)).with_output_types(
                Tuple[str, Tuple[str, Dict]])
        | "Downsample" >> beam.combiners.Top.Of(
            n, key=lambda x: x[0]).with_output_types(
                List[Tuple[str, Tuple[str, Dict]]])
        | "Unpack" >> beam.FlatMap(lambda x: x)
        | "DropHashKey" >> beam.Values())
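The Top.Of-then-flatten idiom above can be seen in isolation in this small, hypothetical sketch (fixed string keys stand in for the sha256 hash keys):

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create([('k3', 'c'), ('k1', 'a'), ('k2', 'b')])
        # Keep the two elements with the largest keys; the result is a
        # single list inside a one-element PCollection.
        | beam.combiners.Top.Of(2, key=lambda kv: kv[0])
        # Flatten that list back into individual (key, value) elements.
        | beam.FlatMap(lambda kvs: kvs)
        # Drop the ordering keys.
        | beam.Values()
        | beam.Map(print))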
def test_values(self):
    expected = [
        'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
        'Saturday'
    ]
    inputs = [(0, 'Sunday'), (1, 'Monday'), (2, 'Tuesday'), (3, 'Wednesday'),
              (4, 'Thursday'), (5, 'Friday'), (6, 'Saturday')]
    with TestPipeline() as p:
        actual = (p | beam.Create(inputs) | beam.Values())
        assert_that(actual, equal_to(expected))
def _run_sampling(self, example_uris: Mapping[Text, Text], to_key_fn: Text,
                  output_artifact: Artifact, samples_per_key: int) -> None:
    """Runs stratified sampling on given example data.

    Args:
        example_uris: Mapping of example split name to example uri.
        to_key_fn: Function to convert an example to a key.
        output_artifact: Output artifact.
        samples_per_key: Number of examples to keep per value of the key.

    Returns:
        None
    """
    d = {}
    exec(to_key_fn, globals(), d)  # how ugly is that?
    to_key = d['to_key']

    def to_keyed_value(m):
        return to_key(m), m

    with self._make_beam_pipeline() as pipeline:
        for split_name, example_uri in example_uris.items():
            data_list = [
                (pipeline
                 | 'ReadData[{}]'.format(split_name) >>
                 beam.io.ReadFromTFRecord(
                     file_pattern=io_utils.all_files_pattern(example_uri)))
            ]
            dest_path = os.path.join(
                artifact_utils.get_split_uri([output_artifact], split_name),
                _STRATIFIED_EXAMPLES_FILE_PREFIX)
            _ = (
                [data for data in data_list]
                | 'FlattenExamples ({})'.format(split_name) >>
                beam.Flatten(pipeline=pipeline)
                | 'ParseExamples ({})'.format(split_name) >> beam.Map(
                    tf.train.Example.FromString)
                | 'Key ({})'.format(split_name) >> beam.Map(to_keyed_value)
                | 'Sample per key ({})'.format(split_name) >>
                beam.combiners.Sample.FixedSizePerKey(samples_per_key)
                | 'Values ({})'.format(split_name) >> beam.Values()
                | 'Flatten lists ({})'.format(split_name) >> beam.FlatMap(
                    lambda elements: elements)
                | 'WriteStratifiedSamples ({})'.format(split_name) >>
                beam.io.WriteToTFRecord(
                    dest_path,
                    file_name_suffix='.gz',
                    coder=beam.coders.ProtoCoder(tf.train.Example)))
            logging.info('Sampling result written to %s.', dest_path)
def expand(self, results):
    """Writes the given job results to the NDB datastore."""
    return (
        results
        # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
        # different signature than the one it's defined with.
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        # GroupIntoBatches() requires (key, value) pairs as input, so we
        # give everything None keys and then immediately discard them.
        | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
        | beam.Values()
        | beam.FlatMap(job_run_result.JobRunResult.accumulate)
        | beam.Map(self.create_beam_job_run_result_model)
        | ndb_io.PutModels(self.datastoreio_stub))
def expand(self, pcoll):
    """Estimates the user defined statistic."""
    return (
        pcoll
        | 'AssignExampleToPartition' >> beam.Map(
            _assign_to_partition, num_partitions=self._num_partitions)
        | 'GroupPartitionsIntoList' >> beam.CombinePerKey(
            beam.combiners.SampleCombineFn(self._max_examples_per_partition))
        | 'RemovePartitionKey' >> beam.Values()
        | 'BatchExamples' >> beam.Map(batch_util.merge_single_batch)
        | 'ComputeStatsFn' >> beam.Map(self._stats_fn.compute)
        | 'ComputeMetaStats' >> beam.CombineGlobally(
            PartitionedStatisticsAnalyzer(
                min_partitions_stat_presence=(
                    self._min_partitions_stat_presence))))
def expand(self, uri_to_content):
    # Compute the total number of documents, and prepare a singleton
    # PCollection to use as side input.
    total_documents = (
        uri_to_content
        | 'GetUris 1' >> beam.Keys()
        | 'GetUniqueUris' >> beam.RemoveDuplicates()
        | 'CountUris' >> beam.combiners.Count.Globally())

    # Create a collection of pairs mapping a URI to each of the words
    # in the document associated with that URI.
    def split_into_words(uri_line):
        (uri, line) = uri_line
        return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

    uri_to_words = (
        uri_to_content
        | 'SplitWords' >> beam.FlatMap(split_into_words))

    # Compute a mapping from each word to the total number of documents
    # in which it appears.
    word_to_doc_count = (
        uri_to_words
        | 'GetUniqueWordsPerDoc' >> beam.RemoveDuplicates()
        | 'GetWords' >> beam.Values()
        | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

    # Compute a mapping from each URI to the total number of words in the
    # document associated with that URI.
    uri_to_word_total = (
        uri_to_words
        | 'GetUris 2' >> beam.Keys()
        | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

    # Count, for each (URI, word) pair, the number of occurrences of that
    # word in the document associated with the URI.
    uri_and_word_to_count = (
        uri_to_words
        | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

    # Adjust the above collection, a mapping from (URI, word) pairs to
    # counts, into an isomorphic mapping from URI to (word, count) pairs,
    # to prepare for a join by the URI key.
    uri_to_word_and_count = (
        uri_and_word_to_count
        | 'ShiftKeys' >> beam.Map(
            lambda pair: (pair[0][0], (pair[0][1], pair[1]))))
def _WriteExamples(pcoll: beam.pvalue.PCollection,
                   transformed_example_path: Text) -> beam.pvalue.PDone:
    """Writes transformed examples compressed in gzip format.

    Args:
        pcoll: PCollection of serialized transformed examples.
        transformed_example_path: path to write to.

    Returns:
        beam.pvalue.PDone.
    """
    return (
        pcoll
        | 'Values' >> beam.Values()
        | 'Write' >> beam.io.WriteToTFRecord(
            transformed_example_path, file_name_suffix='.gz'))
def create_test_pipeline(
    self, entry_stats: StatsType
) -> beam.PCollection[Dict[str, Union[int, Set[datetime.date]]]]:
    """Creates a testing pipeline with some entry stats.

    Args:
        entry_stats: StatsType. The stats with which to start the pipeline.

    Returns:
        PCollection. The testing pipeline to be executed.
    """
    return (
        self.pipeline
        | beam.Create(entry_stats)
        | beam.CombineValues(cron_jobs.CombineStats())
        | beam.Values()  # pylint: disable=no-value-for-parameter
        | beam.Map(lambda stats: stats.to_dict()))
def _MutualInformationTransformMerge(  # pylint: disable=invalid-name
        pcol, use_adjusted_mutual_info, min_diff_from_avg):
    """Computes mutual information for each key using the given accumulators."""
    feature_accumulator_pcol = (
        pcol
        | 'VocabCountPerLabelPerTokenMerge' >> beam.CombinePerKey(
            _WeightedMeanCombineFn()))

    global_accumulator = (
        feature_accumulator_pcol
        | 'DropKeys' >> beam.Values()
        | 'VocabCountPerLabelGlobally' >> beam.CombineGlobally(
            _WeightedMeanCombineFn()))

    return (
        feature_accumulator_pcol
        | 'CalculateMutualInformationPerToken' >> beam.Map(
            _calculate_mutual_information_for_binary_feature,
            beam.pvalue.AsSingleton(global_accumulator),
            use_adjusted_mutual_info=use_adjusted_mutual_info,
            min_diff_from_avg=min_diff_from_avg))
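A minimal, hypothetical sketch of the beam.pvalue.AsSingleton pattern used above, with a per-key PCollection normalized by a globally combined total:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    per_key = pipeline | beam.Create([('a', 1), ('b', 3)])
    # Combine all values into one global total.
    total = per_key | beam.Values() | beam.CombineGlobally(sum)
    # Pass the total to every element as a singleton side input.
    _ = (
        per_key
        | beam.Map(
            lambda kv, grand_total: (kv[0], kv[1] / grand_total),
            beam.pvalue.AsSingleton(total))
        | beam.Map(print))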
def expand(self, pcoll):
    field_names = []
    for field_descriptor in dataset_pb2.BondTopologySummary.DESCRIPTOR.fields:
        if field_descriptor.name.startswith('count_'):
            field_names.append(field_descriptor.name)

    return (
        pcoll
        | 'CombineByBTID' >> beam.CombinePerKey(
            merge_bond_topology_summaries, field_names=field_names)
        | 'DropBTID' >> beam.Values()
        | 'Reshuffle' >> beam.Reshuffle()
        | 'CSVFormat' >> beam.Map(
            csv_format_bond_topology_summary, field_names=field_names)
        | 'WriteCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_bt_summary',
            header='bt_id,' + ','.join(field_names),
            num_shards=1,
            file_name_suffix='.csv'))
def values(test=None):
    # [START values]
    import apache_beam as beam

    with beam.Pipeline() as pipeline:
        plants = (
            pipeline
            | 'Garden plants' >> beam.Create([
                ('🍓', 'Strawberry'),
                ('🥕', 'Carrot'),
                ('🍆', 'Eggplant'),
                ('🍅', 'Tomato'),
                ('🥔', 'Potato'),
            ])
            | 'Values' >> beam.Values()
            | beam.Map(print))
        # [END values]
        if test:
            test(plants)
def _WriteExamples(pcollection, unused_file_format, transformed_example_path):
    """Writes transformed examples compressed in gzip format.

    Args:
        pcollection: PCollection of transformed examples.
        unused_file_format: file format, unused.
        transformed_example_path: path to write to.

    Returns:
        beam.pvalue.PDone.
    """
    return (
        pcollection
        | 'DropNoneKeys' >> beam.Values()
        | 'Write' >> beam.io.WriteToTFRecord(
            transformed_example_path,
            file_name_suffix='.gz',
            coder=beam.coders.ProtoCoder(example_pb2.Example)))
def _pipeline(root):
    interactions = (
        create_data.read_interactions(root, inputs, name="input")
        | "DropKey" >> beam.Values()
        | "ToRetrievalExample" >> beam.FlatMap(_to_retrieval_interaction_fn)
        | "Reshuffle" >> beam.transforms.util.Reshuffle())
    # We expect ~37,568,664 interactions, so by taking 1 / 5000 for the test
    # split we get a reasonable test set size of ~7513.
    beam_utils.split_by_table_id_and_write(
        interactions,
        output_dir,
        train_suffix="@*",
        test_suffix="@*",
        num_splits=5000,
    )
def write_from_pcollection(self, pcoll_examples):
    """Add the final steps of the beam pipeline: write to parquet files."""
    import apache_beam as beam

    from .utils.beam_utils import WriteToParquet

    def inc_num_examples(example):
        beam.metrics.Metrics.counter(self._namespace, "num_examples").inc()

    # Count examples.
    _ = pcoll_examples | "Count N. Examples" >> beam.Map(inc_num_examples)

    # Save dataset.
    return (
        pcoll_examples
        | "Get values" >> beam.Values()
        | "Save to parquet" >> WriteToParquet(
            self._parquet_path,
            self._schema,
            num_shards=1,
            shard_name_template=""))
def test_MeanCombineFn_combine(self):
    with TestPipeline() as p:
        input = (
            p
            | beam.Create([('a', 1), ('a', 1), ('a', 4), ('b', 1),
                           ('b', 13)]))

        # The mean of all values regardless of key.
        global_mean = (
            input
            | beam.Values()
            | beam.CombineGlobally(combine.MeanCombineFn()))

        # The (key, mean) pairs for all keys.
        mean_per_key = (input | beam.CombinePerKey(combine.MeanCombineFn()))

        expected_mean_per_key = [('a', 2), ('b', 7)]
        assert_that(global_mean, equal_to([4]), label='global mean')
        assert_that(
            mean_per_key,
            equal_to(expected_mean_per_key),
            label='mean per key')
def test_custormized_counters_in_combine_fn_empty(self):
    p = TestPipeline()
    input = p | beam.Create([])

    # The result of concatenating all values regardless of key.
    global_concat = (
        input
        | beam.Values()
        | beam.CombineGlobally(SortedConcatWithCounters()))

    # The (key, concatenated_string) pairs for all keys.
    concat_per_key = (input | beam.CombinePerKey(SortedConcatWithCounters()))

    # Verify the concatenated strings are correct.
    assert_that(global_concat, equal_to(['']), label='global concat')
    assert_that(concat_per_key, equal_to([]), label='concat per key')

    result = p.run()
    result.wait_until_finish()

    # Verify the values of metrics are correct.
    word_counter_filter = MetricsFilter().with_name('word_counter')
    query_result = result.metrics().query(word_counter_filter)
    if query_result['counters']:
        word_counter = query_result['counters'][0]
        self.assertEqual(word_counter.result, 0)

    word_lengths_filter = MetricsFilter().with_name('word_lengths')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['counters']:
        word_lengths = query_result['counters'][0]
        self.assertEqual(word_lengths.result, 0)

    word_len_dist_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_len_dist_filter)
    if query_result['distributions']:
        word_len_dist = query_result['distributions'][0]
        self.assertEqual(word_len_dist.result.count, 0)

    last_word_len_filter = MetricsFilter().with_name('last_word_len')
    query_result = result.metrics().query(last_word_len_filter)
    # No element has ever been recorded.
    self.assertFalse(query_result['gauges'])
def get_enriched_events(
        salesevent: beam.pvalue.PCollection,
        sideinput_collections: Dict[str, beam.pvalue.PCollection]
) -> beam.pvalue.PCollection:
    """Gets enriched events by:
      a) Calling a transform that combines the primary event with the
         corresponding side input values.
      b) Grouping events by a dummy key to combine all events in a window
         into one shard.
      c) Discarding the dummy key.

    Args:
        salesevent: Event representing a sales transaction.
        sideinput_collections: Set of side input collections.
    """
    # yapf: disable
    return (
        salesevent
        | "Enrich event" >> beam.Map(
            transforms.enrich_event,
            AsDict(sideinput_collections["bonuspoints"]),
            AsDict(sideinput_collections["discountpct"]),
            AsDict(sideinput_collections["category"]))
        | "Group events by dummy Key" >> beam.GroupByKey()
        | "Discard dummy Key" >> beam.Values()
    )