Code example #1
 def expand(self, news_to_score):
     top_news = (news_to_score
                 | 'ItemMean' >> beam.combiners.Mean.PerKey()
                 | 'KVSwap' >> beam.KvSwap()
                 | 'TopItem' >> beam.combiners.Top.Of(self.n)
                 | 'FormatItemList' >> beam.Map(self._list_format))
     return top_news
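For readers without the surrounding class, here is a minimal, self-contained sketch of the same pattern (Mean.PerKey, then KvSwap so the score becomes the key, then Top.Of). The input data and the literal 2 standing in for self.n are illustrative only, not taken from the original project:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (pipeline
         | 'ToyScores' >> beam.Create([
             ('article-a', 4.0), ('article-a', 2.0),
             ('article-b', 5.0), ('article-c', 1.0),
         ])
         # Mean score per item, e.g. ('article-a', 3.0).
         | 'ItemMean' >> beam.combiners.Mean.PerKey()
         # Swap to (score, item) so Top.Of ranks by score.
         | 'KVSwap' >> beam.KvSwap()
         # A single-element PCollection holding the two best (score, item) pairs.
         | 'TopItem' >> beam.combiners.Top.Of(2)
         | beam.Map(print))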
Code example #2
    def expand(self, pcoll):
        top_k = self._spec.top_k
        frequency_threshold = self._spec.frequency_threshold
        assert top_k is None or top_k >= 0
        assert frequency_threshold is None or frequency_threshold >= 0

        # Creates a PCollection of (count, element) pairs, then iterates over
        # this to create a single element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).
        counts = (
            pcoll
            | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
            | 'CountWithinList' >>
            # Specification of with_output_types allows for combiner optimizations.
            beam.FlatMap(lambda lst: six.iteritems(Counter(lst))
                         ).with_output_types(KV[common.PRIMITIVE_TYPE, int])
            | 'CountGlobally' >> beam.CombinePerKey(sum)
            | 'SwapElementsAndCounts' >> beam.KvSwap())

        # Filtration is cheaper than TopK computation and the two commute, so do
        # filtration first.
        if frequency_threshold is not None:
            counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold
                       >> beam.Filter(lambda kv: kv[0] >= frequency_threshold))

        if top_k is not None:
            counts = (counts
                      | 'Top(%s)' % top_k >>
                      beam.transforms.combiners.Top.Largest(top_k)
                      | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

        # Performance optimization to obviate reading from finely sharded files
        # via AsIter. By forcing all data into a single group we end up reading
        # from a single file.
        #
        @beam.ptransform_fn
        def Reshard(pcoll):  # pylint: disable=invalid-name
            return (pcoll
                    | 'PairWithNone' >> beam.Map(lambda x: (None, x))
                    | 'GroupByNone' >> beam.GroupByKey()
                    | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))

        counts |= 'ReshardToOneGroup' >> Reshard()  # pylint: disable=no-value-for-parameter

        # Using AsIter instead of AsList below in order to reduce max memory
        # usage (due to AsList caching).
        def order_by_decreasing_counts(_, counts_iter):  # pylint: disable=invalid-name
            counts = list(counts_iter)
            counts.sort(reverse=True)  # Largest first.
            return [element for _, element in counts]

        # pylint: disable=no-value-for-parameter
        output = (pcoll.pipeline
                  | 'Prepare' >> beam.Create([None])
                  | 'OrderByDecreasingCounts' >> beam.Map(
                      order_by_decreasing_counts,
                      counts_iter=beam.pvalue.AsIter(counts))
                  | 'WrapAsNDArray' >> WrapAsNDArray(self._spec.dtype))
        return [output]
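The filter-then-Top tail of this pipeline is easier to see on toy data. A minimal sketch of just that part, with hypothetical (count, element) pairs and a frequency threshold of 2:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (pipeline
         | beam.Create([(5, 'a'), (1, 'b'), (3, 'c'), (2, 'd')])
         # Dropping rare elements first is cheaper than Top-K and does not
         # change which elements the subsequent Top-K keeps.
         | 'FilterByFrequencyThreshold(2)' >> beam.Filter(lambda kv: kv[0] >= 2)
         # Keep the two largest (count, element) pairs as a single list...
         | 'Top(2)' >> beam.transforms.combiners.Top.Largest(2)
         # ...then flatten back to individual pairs: (5, 'a') and (3, 'c').
         | 'FlattenList' >> beam.FlatMap(lambda lst: lst)
         | beam.Map(print))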
Code example #3
    def expand(self, inputs):
        def _encode_values(k, v):
            return (k,
                    tf.compat.as_str_any(','.join(map(tf.compat.as_str_any,
                                                      v))))

        pcoll, = inputs
        return (pcoll
                | 'EncodeValues' >> beam.MapTuple(_encode_values)
                | 'SwapKeysAndValues' >> beam.KvSwap())
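A small, hypothetical end-to-end illustration of what this composite produces, using a plain str() join in place of tf.compat.as_str_any for brevity:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (pipeline
         | beam.Create([('color', ['red', 'green', 'blue'])])
         # ('color', ['red', 'green', 'blue']) -> ('color', 'red,green,blue')
         | 'EncodeValues' >> beam.MapTuple(lambda k, v: (k, ','.join(map(str, v))))
         # KvSwap then yields ('red,green,blue', 'color').
         | 'SwapKeysAndValues' >> beam.KvSwap()
         | beam.Map(print))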
Code example #4
    def test_kv_swap(self):
        expected = [('Friday', 5), ('Monday', 1), ('Saturday', 6),
                    ('Sunday', 0), ('Thursday', 4), ('Tuesday', 2),
                    ('Wednesday', 3)]

        inputs = [(0, 'Sunday'), (1, 'Monday'), (2, 'Tuesday'),
                  (3, 'Wednesday'), (4, 'Thursday'), (5, 'Friday'),
                  (6, 'Saturday')]

        with TestPipeline() as p:
            actual = (p | beam.Create(inputs) | beam.KvSwap())

            assert_that(actual, equal_to(expected))
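The test above omits its imports; assuming it relies on Beam's standard testing helpers, they would be:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

TestPipeline runs the pipeline when the with block exits, at which point assert_that checks that the swapped pairs match the expected list.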
Code example #5
    def expand(self, inputs):
        if (self._vocab_ordering_type ==
                _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
            combine_transform = _MutualInformationTransformMerge(  # pylint: disable=no-value-for-parameter
                self._use_adjusted_mutual_info, self._min_diff_from_avg)
        else:
            combine_transform = beam.CombinePerKey(sum)

        pcoll, = inputs

        raw_counts = (pcoll
                      | 'CountPerToken' >> combine_transform
                      | 'SwapTokensAndCounts' >> beam.KvSwap())

        return raw_counts
Code example #6
def kvswap(test=None):
    # [START kvswap]
    import apache_beam as beam

    with beam.Pipeline() as pipeline:
        plants = (pipeline
                  | 'Garden plants' >> beam.Create([
                      ('🍓', 'Strawberry'),
                      ('🥕', 'Carrot'),
                      ('🍆', 'Eggplant'),
                      ('🍅', 'Tomato'),
                      ('🥔', 'Potato'),
                  ])
                  | 'Key-Value swap' >> beam.KvSwap()
                  | beam.Map(print))
        # [END kvswap]
        if test:
            test(plants)
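When run, this snippet simply prints each swapped pair, for example ('Strawberry', '🍓') and ('Carrot', '🥕'); the optional test callback receives the swapped PCollection so the example can also be verified in tests.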
Code example #7
  def expand(self, inputs):
    pcoll, = inputs
    if self._top_k is not None and self._top_k < 0:
      raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                       '{}.'.format(self._top_k))
    if self._frequency_threshold is not None and self._frequency_threshold < 0:
      raise ValueError(
          'frequency_threshold for VocabularyImpl should be >= 0 or None, '
          'got {}.'.format(self._frequency_threshold))

    # Create a PCollection of (count, element) pairs, then iterate over
    # this to create a single element PCollection containing this list of
    # pairs in sorted order by decreasing counts (and by values for equal
    # counts).

    def is_problematic_string(kv):
      string, _ = kv  # Ignore counts.
      return string and '\n' not in string and '\r' not in string

    if (self._vocab_ordering_type ==
        tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
      flatten_map_fn = (
          _flatten_positive_label_weights_total_weights_and_counts)

      # count_and_means is a pcollection that contains a
      # _CountAndWeightsMeansAccumulator where:
      #   `weighted_mean` is the weighted mean of positive labels
      #       for all features.
      #   `count` is the count for all features.
      #   `weights_mean` is the mean of the weights for all features.
      count_and_means = (
          pcoll
          | 'SumBatchCountAndWeightsMeans' >> beam.Map(_count_and_means)
          | 'ComputeCountAndWeightsMeansGlobally' >> beam.CombineGlobally(
              CountAndWeightsMeansCombineFn()))

      # CountAndWeightsMeansCombineFn returns a tuple of the form:
      # (feature,_CountAndWeightsMeansAccumulator) where:
      #   `feature` is a single string, which is the word in the vocabulary
      #       whose mutual information with the label is being computed.
      #   `weighted_mean` is the weighted mean of y positive given x.
      #   `count` is the count of weights for a feature.
      #   `weights_mean` is the mean of the weights for a feature.
      combine_transform = (
          'ComputeCountAndWeightsMeansPerUniqueWord' >> beam.CombinePerKey(
              CountAndWeightsMeansCombineFn())
          | 'CalculateMutualInformationPerUniqueWord' >> beam.Map(
              _calculate_mutual_information,
              global_accumulator=beam.pvalue.AsSingleton(count_and_means)))
    elif (self._vocab_ordering_type ==
          tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
      flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
      combine_transform = beam.CombinePerKey(sum)
    else:
      flatten_map_fn = _flatten_value_to_list
      combine_transform = beam.combiners.Count.PerElement()

    raw_counts = (
        pcoll
        | 'FlattenStringsAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
        | 'CountPerString' >> combine_transform
        | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
        | 'SwapStringsAndCounts' >> beam.KvSwap())

    counts = (
        raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
            _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                self._frequency_threshold,
                self._top_k
                )))

    return counts | 'WriteVocabFile' >> (
        _WriteVocabFile(  # pylint: disable=no-value-for-parameter
            self._base_temp_dir, self._vocab_filename, self._store_frequency))
Code example #8
    def expand(self, pcoll):
        top_k = self._spec.top_k
        frequency_threshold = self._spec.frequency_threshold
        assert top_k is None or top_k >= 0
        assert frequency_threshold is None or frequency_threshold >= 0

        # Creates a PCollection of (count, element) pairs, then iterates over
        # this to create a single element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).
        counts = (
            pcoll
            | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
            | 'CountWithinList' >>
            # Specification of with_output_types allows for combiner optimizations.
            (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))).
             with_output_types(KV[common.PRIMITIVE_TYPE, int]))
            | 'CountGlobally' >> beam.CombinePerKey(sum))

        counts = (counts
                  | 'FilterProblematicStrings' >> beam.Filter(lambda kv: kv[
                      0] and '\n' not in kv[0] and '\r' not in kv[0])
                  | 'SwapElementsAndCounts' >> beam.KvSwap())

        # Filter is cheaper than TopK computation and the two commute, so
        # filter first.
        if frequency_threshold is not None:
            counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold
                       >> beam.Filter(lambda kv: kv[0] >= frequency_threshold))

        if top_k is not None:
            counts = (counts
                      | 'Top(%s)' % top_k >>
                      beam.transforms.combiners.Top.Largest(top_k)
                      | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

        # Performance optimization to obviate reading from finely sharded files
        # via AsIter. By breaking fusion, we allow sharded files' sizes to be
        # automatically computed (when possible), so we end up reading from fewer
        # and larger files.
        counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter

        # Using AsIter instead of AsList below in order to reduce max memory
        # usage (due to AsList caching).
        def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
            """Sort the vocabulary by frequency count."""
            del ignored
            counts = list(counts_iter)
            if not counts:
                counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
            counts.sort(reverse=True)  # Largest first.

            # Log vocabulary size to metrics.  Note we can call
            # beam.metrics.Metrics.distribution here because this function only gets
            # called once, so there is no need to amortize the cost of calling the
            # constructor by putting in a DoFn initializer.
            vocab_size_distribution = beam.metrics.Metrics.distribution(
                common.METRICS_NAMESPACE, 'vocabulary_size')
            vocab_size_distribution.update(len(counts))

            if store_frequency:
                # Returns ['count1 element1', ... ]
                return [
                    '{} {}'.format(count, element) for count, element in counts
                ]
            else:
                return [element for _, element in counts]

        vocabulary_file = os.path.join(self._temp_assets_dir,
                                       self._spec.vocab_filename)
        vocab_is_written = (pcoll.pipeline
                            | 'Prepare' >> beam.Create([None])
                            | 'OrderByDecreasingCounts' >> beam.FlatMap(
                                order_by_decreasing_counts,
                                counts_iter=beam.pvalue.AsIter(counts),
                                store_frequency=self._spec.store_frequency)
                            | 'WriteToFile' >> beam.io.WriteToText(
                                vocabulary_file, shard_name_template=''))
        # Return the vocabulary path.
        wait_for_vocabulary_transform = (
            pcoll.pipeline
            | 'CreatePath' >> beam.Create([np.array(vocabulary_file)])
            # Ensure that the analysis returns only after the file is written.
            | 'WaitForVocabularyFile' >> beam.Map(
                lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
        return (wait_for_vocabulary_transform, )
Code example #9
    def expand(self, inputs):
        pcoll, = inputs
        if self._top_k is not None and self._top_k < 0:
            raise ValueError(
                'top_k for VocabularyImpl should be >= 0 or None, got '
                '{}.'.format(self._top_k))
        if self._frequency_threshold is not None and self._frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold for VocabularyImpl should be >= 0 or None, '
                'got {}.'.format(self._frequency_threshold))
        if self._coverage_top_k is not None and self._coverage_top_k < 0:
            raise ValueError(
                'coverage_top_k for VocabularyImpl should be >= 0 or '
                'None, got {}.'.format(self._coverage_top_k))
        if (self._coverage_frequency_threshold is not None
                and self._coverage_frequency_threshold < 0):
            raise ValueError(
                'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
                'None, got {}.'.format(self._coverage_frequency_threshold))

        # Create a PCollection of (count, element) pairs, then iterate over
        # this to create a single element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).

        def is_problematic_string(kv):
            string, _ = kv  # Ignore counts.
            return string and b'\n' not in string and b'\r' not in string

        if (self._vocab_ordering_type ==
                tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
            flatten_map_fn = _flatten_to_key_and_means_accumulator_list
            combine_transform = _MutualInformationTransform(  # pylint: disable=no-value-for-parameter
                self._use_adjusted_mutual_info, self._min_diff_from_avg)
        elif (self._vocab_ordering_type ==
              tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
            flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
            combine_transform = beam.CombinePerKey(sum)
        else:
            flatten_map_fn = _flatten_value_to_list
            combine_transform = beam.combiners.Count.PerElement()

        raw_counts = (
            pcoll
            | 'FlattenStringsAndMaybeWeightsLabels' >>
            beam.FlatMap(flatten_map_fn)
            | 'CountPerString' >> combine_transform
            | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
            | 'SwapStringsAndCounts' >> beam.KvSwap())

        counts = (
            raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
                _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                    self._frequency_threshold, self._top_k, None)))

        if self._key_fn:
            coverage_counts = (
                raw_counts | 'ApplyCoverageFrequencyThresholdAndTopK' >> (
                    _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                        self._coverage_frequency_threshold,
                        self._coverage_top_k, self._key_fn)))

            counts = ((counts, coverage_counts)
                      | 'MergeStandardAndCoverageArms' >> beam.Flatten()
                      | 'RemoveDuplicates' >> beam.RemoveDuplicates())

        return counts | 'WriteVocabFile' >> (
            _WriteVocabFile(  # pylint: disable=no-value-for-parameter
                self._base_temp_dir, self._vocab_filename,
                self._store_frequency))
Code example #10
File: analyzer_impls.py, Project: cclauss/transform
    def expand(self, pcoll):
        top_k = self._spec.top_k
        frequency_threshold = self._spec.frequency_threshold
        assert top_k is None or top_k >= 0
        assert frequency_threshold is None or frequency_threshold >= 0

        # Creates a PCollection of (count, element) pairs, then iterates over
        # this to create a single element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).
        counts = (
            pcoll
            | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
            | 'CountWithinList' >>
            # Specification of with_output_types allows for combiner optimizations.
            (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))).
             with_output_types(KV[common.PRIMITIVE_TYPE, int]))
            | 'CountGlobally' >> beam.CombinePerKey(sum))

        counts = (counts
                  | 'FilterEmptyStrings' >> beam.Filter(lambda kv: kv[
                      0] and '\n' not in kv[0] and '\r' not in kv[0])
                  | 'SwapElementsAndCounts' >> beam.KvSwap())

        # Filter is cheaper than TopK computation and the two commute, so
        # filter first.
        if frequency_threshold is not None:
            counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold
                       >> beam.Filter(lambda kv: kv[0] >= frequency_threshold))

        if top_k is not None:
            counts = (counts
                      | 'Top(%s)' % top_k >>
                      beam.transforms.combiners.Top.Largest(top_k)
                      | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

        # Performance optimization to obviate reading from finely sharded files
        # via AsIter. By forcing all data into a single group we end up reading
        # from a single file.
        #
        @beam.ptransform_fn
        def Reshard(pcoll):  # pylint: disable=invalid-name
            return (pcoll
                    | 'PairWithNone' >> beam.Map(lambda x: (None, x))
                    | 'GroupByNone' >> beam.GroupByKey()
                    | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))

        counts |= 'ReshardToOneGroup' >> Reshard()  # pylint: disable=no-value-for-parameter

        # Using AsIter instead of AsList below in order to reduce max memory
        # usage (due to AsList caching).
        def order_by_decreasing_counts(_, counts_iter):  # pylint: disable=invalid-name
            counts = list(counts_iter)
            if not counts:
                counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
            counts.sort(reverse=True)  # Largest first.
            return [element for _, element in counts]

        vocabulary_file = os.path.join(self._temp_assets_dir,
                                       self._spec.vocab_filename)
        vocab_is_written = (pcoll.pipeline
                            | 'Prepare' >> beam.Create([None])
                            | 'OrderByDecreasingCounts' >> beam.FlatMap(
                                order_by_decreasing_counts,
                                counts_iter=beam.pvalue.AsIter(counts))
                            | 'WriteToFile' >> beam.io.WriteToText(
                                vocabulary_file, shard_name_template=''))
        # Return the vocabulary path.
        wait_for_vocabulary_transform = (
            pcoll.pipeline
            | 'CreatePath' >> beam.Create([vocabulary_file])
            # Ensure that the analysis returns only after the file is written.
            | 'WaitForVocabularyFile' >> beam.Map(
                lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
        return [wait_for_vocabulary_transform]
Code example #11
File: impl.py, Project: isunspot/transform
        def expand(self, pcoll):
            def combine_by_batch(fn):
                """Reduces a PCollection of batches according to the given function."""
                return (pcoll
                        | 'FlattenValue'  # Flatten N-d values into 1-d.
                        >> beam.Map(lambda x: np.array(x).ravel())
                        | 'CombineWithinBatch' >> beam.Map(fn)
                        | 'CombineGlobally' >>
                        beam.CombineGlobally(fn).without_defaults())

            analysis_result = None
            if self._analyzer_name == api.CanonicalAnalyzers.MIN:
                assert not self._args_dict
                analysis_result = combine_by_batch(min)

            elif self._analyzer_name == api.CanonicalAnalyzers.MAX:
                assert not self._args_dict
                analysis_result = combine_by_batch(max)

            elif self._analyzer_name == api.CanonicalAnalyzers.SUM:
                assert not self._args_dict
                analysis_result = combine_by_batch(sum)

            elif self._analyzer_name == api.CanonicalAnalyzers.UNIQUES:
                top_k = self._args_dict['top_k']
                assert top_k is None or top_k >= 0

                frequency_threshold = self._args_dict['frequency_threshold']
                assert frequency_threshold is None or frequency_threshold >= 0

                # Creates a PCollection of (count, element) pairs, then iterates over
                # this to create a single element PCollection containing this list of
                # pairs in sorted order by decreasing counts (and by values for equal
                # counts).

                def to_iterable(instance_value):
                    if isinstance(instance_value,
                                  (six.string_types, six.integer_types, float)):
                        return [instance_value]
                    else:
                        # Value is a list or ndarray and so is already an iterable.
                        return instance_value

                counts = (pcoll
                          | 'Unbatch' >> beam.FlatMap(lambda batch: batch)
                          | 'ExtractElements' >> beam.FlatMap(to_iterable)
                          | 'CountPerElement' >>
                          beam.transforms.combiners.Count.PerElement()
                          | 'SwapElementsAndCounts' >> beam.KvSwap())

                if top_k is not None:
                    counts = (counts
                              | 'Top_%s' % top_k >>
                              beam.transforms.combiners.Top.Largest(top_k)
                              | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

                if frequency_threshold is not None:
                    counts |= (
                        'FilterByFrequencyThreshold_%s' % frequency_threshold
                        >>
                        beam.Filter(lambda kv: kv[0] >= frequency_threshold))

                # Using AsIter instead of AsList below in order to reduce max memory
                # usage (due to AsList caching).
                def order_by_decreasing_counts(_, counts_iter):
                    counts = list(counts_iter)
                    counts.sort(reverse=True)  # Largest first.
                    return [element for _, element in counts]

                analysis_result = (pcoll.pipeline
                                   | 'Prepare' >> beam.Create([None])
                                   | 'OrderByDecreasingCounts' >> beam.Map(
                                       order_by_decreasing_counts,
                                       counts_iter=beam.pvalue.AsIter(counts)))
            else:
                raise NotImplementedError(self._analyzer_name)

            # Note we pass in dtype as string and shape as a tuple, to avoid pickling
            # issues (b/35133536)
            return (analysis_result
                    | 'ConstantTensorValue' >> beam.Map(
                        impl_helper.ConstantTensorValue,
                        dtype=self._tensor.dtype.name,
                        shape=tuple(dim.value
                                    for dim in self._tensor.get_shape())))
Code example #12
File: analyzer_impls.py, Project: xiching/transform
    def expand(self, pcoll):
        top_k = self._spec.top_k
        frequency_threshold = self._spec.frequency_threshold
        assert top_k is None or top_k >= 0
        assert frequency_threshold is None or frequency_threshold >= 0

        # Create a PCollection of (count, element) pairs, then iterate over
        # this to create a single element PCollection containing this list of
        # pairs in sorted order by decreasing counts (and by values for equal
        # counts).

        def flatten_value_and_weights_to_list_of_tuples(batch_values):
            """Converts a batch of vocabulary and weights to a list of KV tuples."""
            batch_value, weights = batch_values

            batch_value = batch_value.tolist()
            weights = weights.tolist()
            if len(batch_value) != len(weights):
                raise ValueError(
                    'Values and weights contained different number of values ({} vs {})'
                    .format(len(batch_value), len(weights)))
            return zip(batch_value, weights)

        def flatten_value_to_list(batch_values):
            """Converts an N-D dense or sparse batch to a 1-D list."""
            batch_value, = batch_values

            return batch_value.tolist()

        if self._spec.has_weights:
            flatten_map_fn = flatten_value_and_weights_to_list_of_tuples
            combine_transform = beam.CombinePerKey(sum)
        else:
            flatten_map_fn = flatten_value_to_list
            combine_transform = beam.combiners.Count.PerElement()

        def is_problematic_string(kv):
            string, _ = kv  # Ignore counts.
            return string and '\n' not in string and '\r' not in string

        counts = (
            pcoll
            | 'FlattenStringsAndMaybeWeights' >> beam.FlatMap(flatten_map_fn)
            | 'CountPerString' >> combine_transform
            | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
            | 'SwapStringsAndCounts' >> beam.KvSwap())

        # Filter is cheaper than TopK computation and the two commute, so
        # filter first.
        if frequency_threshold is not None:
            counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold
                       >> beam.Filter(lambda kv: kv[0] >= frequency_threshold))

        if top_k is None:
            # Performance optimization to obviate reading from finely sharded files
            # via AsIter in the OrderElements step below. By breaking fusion, we allow
            # sharded files' sizes to be automatically computed (when possible), so we
            # end up reading from fewer and larger files. This is not needed when top_k
            # is provided since that already induces a single-sharded output (due to
            # the CombineGlobally).
            counts |= 'Reshard' >> beam.transforms.Reshuffle()  # pylint: disable=no-value-for-parameter
        else:
            counts = (
                counts
                | 'Top(%s)' % top_k
                # Using without_defaults() below since it obviates unnecessary
                # materializations. This is worth doing because:
                # a) Some vocabs could be really large and although they do
                #    fit in memory they might go over per-record
                #    materialization limits (TopCombineFn produces a
                #    single record with the entire vocabulary as a list).
                # b) More fusion leads to increased performance in general.
                >> beam.CombineGlobally(
                    beam.combiners.TopCombineFn(top_k)).without_defaults()
                | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

        vocabulary_file = os.path.join(self._temp_assets_dir,
                                       self._spec.vocab_filename)
        vocab_is_written = (
            pcoll.pipeline
            | 'Prepare' >> beam.Create([None])
            | 'OrderElements' >> beam.ParDo(
                _OrderElementsFn(self._spec.store_frequency),
                # Using AsIter instead of AsList at the callsite below in order to
                # reduce max memory usage.
                counts_iter=beam.pvalue.AsIter(counts))
            | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                                   shard_name_template=''))
        # Return the vocabulary path.
        wait_for_vocabulary_transform = (
            pcoll.pipeline
            | 'CreatePath' >> beam.Create([np.array(vocabulary_file)])
            # Ensure that the analysis returns only after the file is written.
            | 'WaitForVocabularyFile' >> beam.Map(
                lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
        return (wait_for_vocabulary_transform, )