def expand(self, news_to_score):
  top_news = (news_to_score
              | 'ItemMean' >> beam.combiners.Mean.PerKey()
              | 'KVSwap' >> beam.KvSwap()
              | 'TopItem' >> beam.combiners.Top.Of(self.n)
              | 'FormatItemList' >> beam.Map(self._list_format))
  return top_news

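# The following is a minimal, self-contained sketch (not part of the original
# transform) showing what the Mean.PerKey -> KvSwap -> Top.Of chain above
# computes; the item names, scores, and the _top_items_demo name are
# illustrative assumptions.
import apache_beam as beam


def _top_items_demo():
  with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('item_a', 4.0), ('item_a', 2.0), ('item_b', 5.0)])
         | beam.combiners.Mean.PerKey()  # ('item_a', 3.0), ('item_b', 5.0)
         | beam.KvSwap()                 # (3.0, 'item_a'), (5.0, 'item_b')
         | beam.combiners.Top.Of(1)      # [(5.0, 'item_b')]
         | beam.Map(print))
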
def expand(self, pcoll):
  top_k = self._spec.top_k
  frequency_threshold = self._spec.frequency_threshold
  assert top_k is None or top_k >= 0
  assert frequency_threshold is None or frequency_threshold >= 0

  # Creates a PCollection of (count, element) pairs, then iterates over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  counts = (
      pcoll
      | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
      | 'CountWithinList' >>
      # Specification of with_output_types allows for combiner optimizations.
      beam.FlatMap(lambda lst: six.iteritems(Counter(lst))
                  ).with_output_types(KV[common.PRIMITIVE_TYPE, int])
      | 'CountGlobally' >> beam.CombinePerKey(sum)
      | 'SwapElementsAndCounts' >> beam.KvSwap())

  # Filtration is cheaper than TopK computation and the two commute, so do
  # filtration first.
  if frequency_threshold is not None:
    counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
               beam.Filter(lambda kv: kv[0] >= frequency_threshold))

  if top_k is not None:
    counts = (counts
              | 'Top(%s)' % top_k >>
              beam.transforms.combiners.Top.Largest(top_k)
              | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

  # Performance optimization to obviate reading from finely sharded files
  # via AsIter. By forcing all data into a single group we end up reading
  # from a single file.
  #
  @beam.ptransform_fn
  def Reshard(pcoll):  # pylint: disable=invalid-name
    return (pcoll
            | 'PairWithNone' >> beam.Map(lambda x: (None, x))
            | 'GroupByNone' >> beam.GroupByKey()
            | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))

  counts |= 'ReshardToOneGroup' >> Reshard()  # pylint: disable=no-value-for-parameter

  # Using AsIter instead of AsList below in order to reduce max memory
  # usage (due to AsList caching).
  def order_by_decreasing_counts(_, counts_iter):  # pylint: disable=invalid-name
    counts = list(counts_iter)
    counts.sort(reverse=True)  # Largest first.
    return [element for _, element in counts]

  # pylint: disable=no-value-for-parameter
  output = (pcoll.pipeline
            | 'Prepare' >> beam.Create([None])
            | 'OrderByDecreasingCounts' >> beam.Map(
                order_by_decreasing_counts,
                counts_iter=beam.pvalue.AsIter(counts))
            | 'WrapAsNDArray' >> WrapAsNDArray(self._spec.dtype))
  return [output]

def expand(self, inputs):

  def _encode_values(k, v):
    return (k,
            tf.compat.as_str_any(','.join(map(tf.compat.as_str_any, v))))

  pcoll, = inputs
  return (pcoll
          | 'EncodeValues' >> beam.MapTuple(_encode_values)
          | 'SwapKeysAndValues' >> beam.KvSwap())

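# beam.KvSwap() is shorthand for mapping (k, v) -> (v, k). A hedged,
# self-contained sketch of the transform above on toy data (the key name,
# values, and the _encode_values_demo name are illustrative assumptions):
import apache_beam as beam
import tensorflow as tf


def _encode_values_demo():
  def _encode_values(k, v):
    return (k, tf.compat.as_str_any(','.join(map(tf.compat.as_str_any, v))))

  with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('color', ['red', 'green'])])
         | beam.MapTuple(_encode_values)  # ('color', 'red,green')
         | beam.KvSwap()                  # ('red,green', 'color')
         | beam.Map(print))
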
def test_kv_swap(self):
  expected = [('Friday', 5), ('Monday', 1), ('Saturday', 6), ('Sunday', 0),
              ('Thursday', 4), ('Tuesday', 2), ('Wednesday', 3)]
  inputs = [(0, 'Sunday'), (1, 'Monday'), (2, 'Tuesday'), (3, 'Wednesday'),
            (4, 'Thursday'), (5, 'Friday'), (6, 'Saturday')]
  with TestPipeline() as p:
    actual = (p
              | beam.Create(inputs)
              | beam.KvSwap())
    assert_that(actual, equal_to(expected))

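# Note: assert_that/equal_to from apache_beam.testing.util compare
# PCollection contents without regard to order, which is why `expected` can
# be listed alphabetically while `inputs` is in weekday order.
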
def expand(self, inputs):
  if (self._vocab_ordering_type ==
      _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    combine_transform = _MutualInformationTransformMerge(  # pylint: disable=no-value-for-parameter
        self._use_adjusted_mutual_info, self._min_diff_from_avg)
  else:
    combine_transform = beam.CombinePerKey(sum)

  pcoll, = inputs

  raw_counts = (
      pcoll
      | 'CountPerToken' >> combine_transform
      | 'SwapTokensAndCounts' >> beam.KvSwap())

  return raw_counts

def kvswap(test=None):
  # [START kvswap]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    plants = (
        pipeline
        | 'Garden plants' >> beam.Create([
            ('🍓', 'Strawberry'),
            ('🥕', 'Carrot'),
            ('🍆', 'Eggplant'),
            ('🍅', 'Tomato'),
            ('🥔', 'Potato'),
        ])
        | 'Key-Value swap' >> beam.KvSwap()
        | beam.Map(print))
    # [END kvswap]
    if test:
      test(plants)

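# Running the snippet above prints the swapped pairs (output order is not
# guaranteed), e.g.:
#
#   ('Strawberry', '🍓')
#   ('Carrot', '🥕')
#   ('Eggplant', '🍆')
#   ('Tomato', '🍅')
#   ('Potato', '🥔')
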
def expand(self, inputs):
  pcoll, = inputs
  if self._top_k is not None and self._top_k < 0:
    raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                     '{}.'.format(self._top_k))
  if self._frequency_threshold is not None and self._frequency_threshold < 0:
    raise ValueError(
        'frequency_threshold for VocabularyImpl should be >= 0 or None, '
        'got {}.'.format(self._frequency_threshold))

  # Create a PCollection of (count, element) pairs, then iterate over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  def is_problematic_string(kv):
    string, _ = kv  # Ignore counts.
    return string and '\n' not in string and '\r' not in string

  if (self._vocab_ordering_type ==
      tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = (
        _flatten_positive_label_weights_total_weights_and_counts)

    # count_and_means is a pcollection that contains a
    # _CountAndWeightsMeansAccumulator where:
    #   `weighted_mean` is the weighted mean of positive labels
    #       for all features.
    #   `count` is the count for all features.
    #   `weights_mean` is the mean of the weights for all features.
    count_and_means = (
        pcoll
        | 'SumBatchCountAndWeightsMeans' >> beam.Map(_count_and_means)
        | 'ComputeCountAndWeightsMeansGlobally' >> beam.CombineGlobally(
            CountAndWeightsMeansCombineFn()))

    # CountAndWeightsMeansCombineFn returns a tuple of the form
    # (feature, _CountAndWeightsMeansAccumulator) where:
    #   `feature` is a single string, which is the word in the vocabulary
    #       whose mutual information with the label is being computed.
    #   `weighted_mean` is the weighted mean of y positive given x.
    #   `count` is the count of weights for a feature.
    #   `weights_mean` is the mean of the weights for a feature.
    combine_transform = (
        'ComputeCountAndWeightsMeansPerUniqueWord' >> beam.CombinePerKey(
            CountAndWeightsMeansCombineFn())
        | 'CalculateMutualInformationPerUniqueWord' >> beam.Map(
            _calculate_mutual_information,
            global_accumulator=beam.pvalue.AsSingleton(count_and_means)))
  elif (self._vocab_ordering_type ==
        tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  raw_counts = (
      pcoll
      | 'FlattenStringsAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerString' >> combine_transform
      | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
      | 'SwapStringsAndCounts' >> beam.KvSwap())

  counts = (
      raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
          _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
              self._frequency_threshold, self._top_k)))

  return counts | 'WriteVocabFile' >> (
      _WriteVocabFile(  # pylint: disable=no-value-for-parameter
          self._base_temp_dir, self._vocab_filename, self._store_frequency))

def expand(self, pcoll):
  top_k = self._spec.top_k
  frequency_threshold = self._spec.frequency_threshold
  assert top_k is None or top_k >= 0
  assert frequency_threshold is None or frequency_threshold >= 0

  # Creates a PCollection of (count, element) pairs, then iterates over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  counts = (
      pcoll
      | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
      | 'CountWithinList' >>
      # Specification of with_output_types allows for combiner optimizations.
      (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))).
       with_output_types(KV[common.PRIMITIVE_TYPE, int]))
      | 'CountGlobally' >> beam.CombinePerKey(sum))

  counts = (counts
            | 'FilterProblematicStrings' >> beam.Filter(
                lambda kv: kv[0] and '\n' not in kv[0] and '\r' not in kv[0])
            | 'SwapElementsAndCounts' >> beam.KvSwap())

  # Filter is cheaper than TopK computation and the two commute, so
  # filter first.
  if frequency_threshold is not None:
    counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
               beam.Filter(lambda kv: kv[0] >= frequency_threshold))

  if top_k is not None:
    counts = (counts
              | 'Top(%s)' % top_k >>
              beam.transforms.combiners.Top.Largest(top_k)
              | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

  # Performance optimization to obviate reading from finely sharded files
  # via AsIter. By breaking fusion, we allow sharded files' sizes to be
  # automatically computed (when possible), so we end up reading from fewer
  # and larger files.
  counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter

  # Using AsIter instead of AsList below in order to reduce max memory
  # usage (due to AsList caching).
  def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
    """Sort the vocabulary by frequency count."""
    del ignored
    counts = list(counts_iter)
    if not counts:
      counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
    counts.sort(reverse=True)  # Largest first.

    # Log vocabulary size to metrics. Note we can call
    # beam.metrics.Metrics.distribution here because this function only gets
    # called once, so there is no need to amortize the cost of calling the
    # constructor by putting it in a DoFn initializer.
    vocab_size_distribution = beam.metrics.Metrics.distribution(
        common.METRICS_NAMESPACE, 'vocabulary_size')
    vocab_size_distribution.update(len(counts))

    if store_frequency:
      # Returns ['count1 element1', ...]
      return ['{} {}'.format(count, element) for count, element in counts]
    else:
      return [element for _, element in counts]

  vocabulary_file = os.path.join(self._temp_assets_dir,
                                 self._spec.vocab_filename)
  vocab_is_written = (
      pcoll.pipeline
      | 'Prepare' >> beam.Create([None])
      | 'OrderByDecreasingCounts' >> beam.FlatMap(
          order_by_decreasing_counts,
          counts_iter=beam.pvalue.AsIter(counts),
          store_frequency=self._spec.store_frequency)
      | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                             shard_name_template=''))

  # Return the vocabulary path.
  wait_for_vocabulary_transform = (
      pcoll.pipeline
      | 'CreatePath' >> beam.Create([np.array(vocabulary_file)])
      # Ensure that the analysis returns only after the file is written.
      | 'WaitForVocabularyFile' >> beam.Map(
          lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
  return (wait_for_vocabulary_transform,)

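# A condensed, hedged sketch (toy data; not from the original code) of the
# "single seed element + AsIter side input" idiom used above: a one-element
# PCollection drives a FlatMap whose side input is the counts PCollection,
# so the sort happens exactly once rather than per element.
import apache_beam as beam


def _sorted_vocab_demo():
  with beam.Pipeline() as p:
    counts = p | 'Counts' >> beam.Create([(2, 'b'), (5, 'a'), (1, 'c')])
    _ = (p
         | 'Seed' >> beam.Create([None])
         | 'Sort' >> beam.FlatMap(
             lambda seed, it: [e for _, e in sorted(it, reverse=True)],
             it=beam.pvalue.AsIter(counts))
         | beam.Map(print))  # prints 'a', 'b', 'c' (descending count)
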
def expand(self, inputs):
  pcoll, = inputs
  if self._top_k is not None and self._top_k < 0:
    raise ValueError(
        'top_k for VocabularyImpl should be >= 0 or None, got '
        '{}.'.format(self._top_k))
  if self._frequency_threshold is not None and self._frequency_threshold < 0:
    raise ValueError(
        'frequency_threshold for VocabularyImpl should be >= 0 or None, '
        'got {}.'.format(self._frequency_threshold))
  if self._coverage_top_k is not None and self._coverage_top_k < 0:
    raise ValueError(
        'coverage_top_k for VocabularyImpl should be >= 0 or '
        'None, got {}.'.format(self._coverage_top_k))
  if (self._coverage_frequency_threshold is not None and
      self._coverage_frequency_threshold < 0):
    raise ValueError(
        'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
        'None, got {}.'.format(self._coverage_frequency_threshold))

  # Create a PCollection of (count, element) pairs, then iterate over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  def is_problematic_string(kv):
    string, _ = kv  # Ignore counts.
    return string and b'\n' not in string and b'\r' not in string

  if (self._vocab_ordering_type ==
      tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = _flatten_to_key_and_means_accumulator_list
    combine_transform = _MutualInformationTransform(  # pylint: disable=no-value-for-parameter
        self._use_adjusted_mutual_info, self._min_diff_from_avg)
  elif (self._vocab_ordering_type ==
        tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  raw_counts = (
      pcoll
      | 'FlattenStringsAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerString' >> combine_transform
      | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
      | 'SwapStringsAndCounts' >> beam.KvSwap())

  counts = (
      raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
          _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
              self._frequency_threshold, self._top_k, None)))

  if self._key_fn:
    coverage_counts = (
        raw_counts | 'ApplyCoverageFrequencyThresholdAndTopK' >> (
            _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                self._coverage_frequency_threshold, self._coverage_top_k,
                self._key_fn)))

    counts = ((counts, coverage_counts)
              | 'MergeStandardAndCoverageArms' >> beam.Flatten()
              | 'RemoveDuplicates' >> beam.RemoveDuplicates())

  return counts | 'WriteVocabFile' >> (
      _WriteVocabFile(  # pylint: disable=no-value-for-parameter
          self._base_temp_dir, self._vocab_filename, self._store_frequency))

def expand(self, pcoll):
  top_k = self._spec.top_k
  frequency_threshold = self._spec.frequency_threshold
  assert top_k is None or top_k >= 0
  assert frequency_threshold is None or frequency_threshold >= 0

  # Creates a PCollection of (count, element) pairs, then iterates over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  counts = (
      pcoll
      | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
      | 'CountWithinList' >>
      # Specification of with_output_types allows for combiner optimizations.
      (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))).
       with_output_types(KV[common.PRIMITIVE_TYPE, int]))
      | 'CountGlobally' >> beam.CombinePerKey(sum))

  counts = (counts
            | 'FilterEmptyStrings' >> beam.Filter(
                lambda kv: kv[0] and '\n' not in kv[0] and '\r' not in kv[0])
            | 'SwapElementsAndCounts' >> beam.KvSwap())

  # Filter is cheaper than TopK computation and the two commute, so
  # filter first.
  if frequency_threshold is not None:
    counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
               beam.Filter(lambda kv: kv[0] >= frequency_threshold))

  if top_k is not None:
    counts = (counts
              | 'Top(%s)' % top_k >>
              beam.transforms.combiners.Top.Largest(top_k)
              | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

  # Performance optimization to obviate reading from finely sharded files
  # via AsIter. By forcing all data into a single group we end up reading
  # from a single file.
  #
  @beam.ptransform_fn
  def Reshard(pcoll):  # pylint: disable=invalid-name
    return (pcoll
            | 'PairWithNone' >> beam.Map(lambda x: (None, x))
            | 'GroupByNone' >> beam.GroupByKey()
            | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))

  counts |= 'ReshardToOneGroup' >> Reshard()  # pylint: disable=no-value-for-parameter

  # Using AsIter instead of AsList below in order to reduce max memory
  # usage (due to AsList caching).
  def order_by_decreasing_counts(_, counts_iter):  # pylint: disable=invalid-name
    counts = list(counts_iter)
    if not counts:
      counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
    counts.sort(reverse=True)  # Largest first.
    return [element for _, element in counts]

  vocabulary_file = os.path.join(self._temp_assets_dir,
                                 self._spec.vocab_filename)
  vocab_is_written = (
      pcoll.pipeline
      | 'Prepare' >> beam.Create([None])
      | 'OrderByDecreasingCounts' >> beam.FlatMap(
          order_by_decreasing_counts,
          counts_iter=beam.pvalue.AsIter(counts))
      | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                             shard_name_template=''))

  # Return the vocabulary path.
  wait_for_vocabulary_transform = (
      pcoll.pipeline
      | 'CreatePath' >> beam.Create([vocabulary_file])
      # Ensure that the analysis returns only after the file is written.
      | 'WaitForVocabularyFile' >> beam.Map(
          lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
  return [wait_for_vocabulary_transform]

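# Note: the Reshard helper above deliberately funnels every element through a
# single key so that the subsequent AsIter side input reads from one file; it
# is not interchangeable with beam.transforms.Reshuffle() (used in the
# variant of this transform shown earlier), which breaks fusion while
# preserving parallelism.
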
def expand(self, pcoll):

  def combine_by_batch(fn):
    """Reduces a PCollection of batches according to the given function."""
    return (pcoll
            | 'FlattenValue'  # Flatten N-d values into 1-d.
            >> beam.Map(lambda x: np.array(x).ravel())
            | 'CombineWithinBatch' >> beam.Map(fn)
            | 'CombineGlobally' >> beam.CombineGlobally(fn).without_defaults())

  analysis_result = None
  if self._analyzer_name == api.CanonicalAnalyzers.MIN:
    assert not self._args_dict
    analysis_result = combine_by_batch(min)
  elif self._analyzer_name == api.CanonicalAnalyzers.MAX:
    assert not self._args_dict
    analysis_result = combine_by_batch(max)
  elif self._analyzer_name == api.CanonicalAnalyzers.SUM:
    assert not self._args_dict
    analysis_result = combine_by_batch(sum)
  elif self._analyzer_name == api.CanonicalAnalyzers.UNIQUES:
    top_k = self._args_dict['top_k']
    assert top_k is None or top_k >= 0
    frequency_threshold = self._args_dict['frequency_threshold']
    assert frequency_threshold is None or frequency_threshold >= 0

    # Creates a PCollection of (count, element) pairs, then iterates over
    # this to create a single element PCollection containing this list of
    # pairs in sorted order by decreasing counts (and by values for equal
    # counts).
    def to_iterable(instance_value):
      if isinstance(instance_value, (six.string_types, float, int, long)):
        return [instance_value]
      else:
        # Value is a list or ndarray and so is already an iterable.
        return instance_value

    counts = (pcoll
              | 'Unbatch' >> beam.FlatMap(lambda batch: batch)
              | 'ExtractElements' >> beam.FlatMap(to_iterable)
              | 'CountPerElement'
              >> beam.transforms.combiners.Count.PerElement()
              | 'SwapElementsAndCounts' >> beam.KvSwap())

    if top_k is not None:
      counts = (counts
                | 'Top_%s' % top_k
                >> beam.transforms.combiners.Top.Largest(top_k)
                | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

    if frequency_threshold is not None:
      counts |= ('FilterByFrequencyThreshold_%s' % frequency_threshold >>
                 beam.Filter(lambda kv: kv[0] >= frequency_threshold))

    # Using AsIter instead of AsList below in order to reduce max memory
    # usage (due to AsList caching).
    def order_by_decreasing_counts(_, counts_iter):
      counts = list(counts_iter)
      counts.sort(reverse=True)  # Largest first.
      return [element for _, element in counts]

    analysis_result = (pcoll.pipeline
                       | 'Prepare' >> beam.Create([None])
                       | 'OrderByDecreasingCounts' >> beam.Map(
                           order_by_decreasing_counts,
                           counts_iter=beam.pvalue.AsIter(counts)))
  else:
    raise NotImplementedError(self._analyzer_name)

  # Note we pass in dtype as string and shape as a tuple, to avoid pickling
  # issues (b/35133536).
  return (analysis_result
          | 'ConstantTensorValue' >> beam.Map(
              impl_helper.ConstantTensorValue,
              dtype=self._tensor.dtype.name,
              shape=tuple(dim.value for dim in self._tensor.get_shape())))

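# Note: `long` in to_iterable above exists only in Python 2. Since this
# module already uses six, a Python-2/3-portable check would be:
#
#   isinstance(instance_value,
#              six.string_types + (float,) + six.integer_types)
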
def expand(self, pcoll):
  top_k = self._spec.top_k
  frequency_threshold = self._spec.frequency_threshold
  assert top_k is None or top_k >= 0
  assert frequency_threshold is None or frequency_threshold >= 0

  # Create a PCollection of (count, element) pairs, then iterate over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  def flatten_value_and_weights_to_list_of_tuples(batch_values):
    """Converts a batch of vocabulary and weights to a list of KV tuples."""
    batch_value, weights = batch_values
    batch_value = batch_value.tolist()
    weights = weights.tolist()
    if len(batch_value) != len(weights):
      raise ValueError(
          'Values and weights contained different number of values ({} vs {})'
          .format(len(batch_value), len(weights)))
    return zip(batch_value, weights)

  def flatten_value_to_list(batch_values):
    """Converts an N-D dense or sparse batch to a 1-D list."""
    batch_value, = batch_values
    return batch_value.tolist()

  if self._spec.has_weights:
    flatten_map_fn = flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  else:
    flatten_map_fn = flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  def is_problematic_string(kv):
    string, _ = kv  # Ignore counts.
    return string and '\n' not in string and '\r' not in string

  counts = (
      pcoll
      | 'FlattenStringsAndMaybeWeights' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerString' >> combine_transform
      | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
      | 'SwapStringsAndCounts' >> beam.KvSwap())

  # Filter is cheaper than TopK computation and the two commute, so
  # filter first.
  if frequency_threshold is not None:
    counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
               beam.Filter(lambda kv: kv[0] >= frequency_threshold))

  if top_k is None:
    # Performance optimization to obviate reading from finely sharded files
    # via AsIter in the OrderElements step below. By breaking fusion, we
    # allow sharded files' sizes to be automatically computed (when
    # possible), so we end up reading from fewer and larger files. This is
    # not needed when top_k is provided since that already induces a
    # single-sharded output (due to the CombineGlobally).
    counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter
  else:
    counts = (
        counts
        | 'Top(%s)' % top_k
        # Using without_defaults() below since it obviates unnecessary
        # materializations. This is worth doing because:
        # a) Some vocabs could be really large and although they do fit in
        #    memory they might go over per-record materialization limits
        #    (TopCombineFn produces a single record containing the entire
        #    vocabulary as a list).
        # b) More fusion leads to increased performance in general.
        >> beam.CombineGlobally(
            beam.combiners.TopCombineFn(top_k)).without_defaults()
        | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

  vocabulary_file = os.path.join(self._temp_assets_dir,
                                 self._spec.vocab_filename)
  vocab_is_written = (
      pcoll.pipeline
      | 'Prepare' >> beam.Create([None])
      | 'OrderElements' >> beam.ParDo(
          _OrderElementsFn(self._spec.store_frequency),
          # Using AsIter instead of AsList at the callsite below in order to
          # reduce max memory usage.
          counts_iter=beam.pvalue.AsIter(counts))
      | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                             shard_name_template=''))

  # Return the vocabulary path.
  wait_for_vocabulary_transform = (
      pcoll.pipeline
      | 'CreatePath' >> beam.Create([np.array(vocabulary_file)])
      # Ensure that the analysis returns only after the file is written.
      | 'WaitForVocabularyFile' >> beam.Map(
          lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
  return (wait_for_vocabulary_transform,)

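# _OrderElementsFn is not defined in this excerpt. The following is a
# hypothetical sketch of a DoFn with the shape its callsite above implies
# (single seed element, counts via an AsIter side input, optional
# "count element" formatting); the real implementation may differ.
import apache_beam as beam


class _OrderElementsFnSketch(beam.DoFn):  # hypothetical name
  """Sorts (count, element) pairs by decreasing count and formats lines."""

  def __init__(self, store_frequency):
    self._store_frequency = store_frequency

  def process(self, element, counts_iter):
    del element  # The single `None` seed element is unused.
    counts = list(counts_iter)
    counts.sort(reverse=True)  # Largest count first.
    for count, entry in counts:
      if self._store_frequency:
        yield '{} {}'.format(count, entry)
      else:
        yield entry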