Example #1
    def expand(
        self, results: beam.PCollection[job_run_result.JobRunResult]
    ) -> beam.pvalue.PDone:
        """Writes the given job results to the NDB datastore.

        This overrides expand from the parent class.

        Args:
            results: PCollection. The job run results to write; may contain
                just a single result.

        Returns:
            PCollection. An empty PCollection.
        """
        return (
            results
            # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
            # different signature than the one it's defined with.
            | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
            # GroupIntoBatches() requires (key, value) pairs as input, so we
            # give everything None keys and then immediately discard them.
            | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
            | beam.Values()  # pylint: disable=no-value-for-parameter
            | beam.FlatMap(job_run_result.JobRunResult.accumulate)
            | beam.Map(self.create_beam_job_run_result_model,
                       results.pipeline.options.namespace)
            | ndb_io.PutModels())
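The comments above describe a reusable shape: key every element with None so that GroupIntoBatches() has (key, value) pairs, then drop the keys again with Values(). A minimal, self-contained sketch of just that shape, assuming illustrative data and batch size rather than Oppia's real models:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create(['result-%d' % i for i in range(10)])
        # Give everything the same None key so batching has (key, value) pairs.
        | 'KeyWithNone' >> beam.WithKeys(None)
        # Emits (None, [up to 4 elements]) pairs.
        | 'Batch' >> beam.GroupIntoBatches(4)
        # Discard the None keys, leaving just the batches.
        | 'DropKeys' >> beam.Values()
        | beam.Map(print))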
Example #2
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:

        collection_pairs = (
            self.pipeline
            | 'Get collection models' >> ndb_io.GetModels(
                collection_models.CollectionRightsModel.get_all())
            | 'Flatten owner_ids and format' >> beam.FlatMap(
                self._extract_user_and_collection_ids))

        user_pairs = (self.pipeline
                      | 'Get all user settings models' >> ndb_io.GetModels(
                          user_models.UserSettingsModel.get_all())
                      | 'Extract id and email' >>
                      beam.Map(lambda user_setting:
                               (user_setting.id, user_setting.email)))

        collection_ids_to_email_mapping = (
            (collection_pairs, user_pairs)
            | 'Group by user_id' >> beam.CoGroupByKey()
            | 'Drop user id' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Filter out results without any collection' >>
            beam.Filter(lambda collection_ids_and_email: len(
                collection_ids_and_email[0]) > 0))

        return (
            collection_ids_to_email_mapping
            | 'Get final result' >>
            beam.MapTuple(lambda collection, email: job_run_result.JobRunResult
                          .as_stdout('collection_ids: %s, email: %s' %
                                     (collection, email))))
Example #3
    def expand(
        self, pcollections: Tuple[beam.pvalue.PCollection,
                                  beam.pvalue.PCollection]
    ) -> Tuple[beam.pvalue.PCollection, beam.pvalue.PCollection]:
        training_examples, serving_examples = pcollections
        keyed_training_examples = (
            training_examples | "ExtractTrainingIdentifiers" >> beam.ParDo(
                _ExtractIdentifiers(self._identifier_features,
                                    self._float_round_ndigits)))
        keyed_serving_examples = (
            serving_examples | "ExtractServingIdentifiers" >> beam.ParDo(
                _ExtractIdentifiers(self._identifier_features,
                                    self._float_round_ndigits)))
        results = ({
            "training": keyed_training_examples,
            "serving": keyed_serving_examples
        } | "JoinExamples" >> beam.CoGroupByKey()
                   | "ComputeSkew" >> beam.ParDo(
                       _ComputeSkew(
                           self._features_to_ignore, self._float_round_ndigits,
                           self._allow_duplicate_identifiers)).with_outputs(
                               "skew_results", "skew_pairs"))
        skew_results = (
            results.skew_results | "MergeSkewResultsPerFeature" >>  # pytype: disable=attribute-error
            beam.CombinePerKey(_merge_feature_skew_results)
            | "DropKeys" >> beam.Values())
        skew_pairs = (
            results.skew_pairs | "SampleSkewPairs" >>  # pytype: disable=attribute-error
            beam.combiners.Sample.FixedSizeGlobally(self._sample_size)
            # Sampling results in a pcollection with a single element consisting of
            # a list of the samples. Convert this to a pcollection of samples.
            | "Flatten" >> beam.FlatMap(lambda x: x))

        return skew_results, skew_pairs
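As the comment in the skew_pairs branch notes, Sample.FixedSizeGlobally() yields a single element holding the list of samples, which FlatMap then re-flattens. A standalone sketch of just that step with illustrative inputs:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create(range(100))
        # Produces one element: a list of at most 5 sampled values.
        | 'SampleGlobally' >> beam.combiners.Sample.FixedSizeGlobally(5)
        # Convert the single list back into a PCollection of samples.
        | 'Flatten' >> beam.FlatMap(lambda samples: samples)
        | beam.Map(print))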
Example #4
    def write_from_pcollection(self, pcoll_examples):
        import apache_beam as beam

        # create some metadata that will be used in .finalize()
        num_examples = (
            pcoll_examples
            | "Add metadata key" >> beam.Map(lambda v: ("num_examples", v))
            | "Count" >> beam.CombinePerKey(
                beam.transforms.combiners.CountCombineFn()))

        def save_metadata(metadata_items):
            with open(self._path + ".json", "w") as metadata_file:
                json.dump(metadata_items, metadata_file)

        # save metadata
        _ = ((num_examples, )
             | "Merge pcollections" >> beam.Flatten()
             | "Create Dict" >> beam.transforms.combiners.ToDict()
             | "Save metadata" >> beam.ParDo(save_metadata))

        # save dataset
        return (
            pcoll_examples
            | "Get values" >> beam.Values()
            | "Save to parquet" >> beam.io.parquetio.WriteToParquet(
                self._path, self._schema, num_shards=1, shard_name_template="")
        )
Example #5
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        generating ExplorationOpportunitySummaryModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            generating ExplorationOpportunitySummaryModel.
        """

        topics = (self.pipeline
                  | 'Get all non-deleted topic models' >> (ndb_io.GetModels(
                      topic_models.TopicModel.get_all(include_deleted=False)))
                  | 'Get topic from model' >> beam.Map(
                      topic_fetchers.get_topic_from_model))

        story_ids_to_story = (
            self.pipeline
            | 'Get all non-deleted story models' >> ndb_io.GetModels(
                story_models.StoryModel.get_all(include_deleted=False))
            | 'Get story from model' >> beam.Map(
                story_fetchers.get_story_from_model)
            | 'Combine stories and ids' >> beam.Map(lambda story:
                                                    (story.id, story)))

        exp_ids_to_exp = (
            self.pipeline
            | 'Get all non-deleted exp models' >> ndb_io.GetModels(
                exp_models.ExplorationModel.get_all(include_deleted=False))
            | 'Get exploration from model' >> beam.Map(
                exp_fetchers.get_exploration_from_model)
            | 'Combine exploration and ids' >> beam.Map(lambda exp:
                                                        (exp.id, exp)))

        stories_dict = beam.pvalue.AsDict(story_ids_to_story)
        exps_dict = beam.pvalue.AsDict(exp_ids_to_exp)

        opportunities_results = (
            topics
            | beam.Map(self._generate_opportunities_related_to_topic,
                       stories_dict=stories_dict,
                       exps_dict=exps_dict))

        unused_put_result = (
            opportunities_results
            | 'Filter the results with SUCCESS status' >>
            beam.Filter(lambda result: result.is_ok())
            | 'Fetch the models to be put' >>
            beam.FlatMap(lambda result: result.unwrap())
            | 'Add ID as a key' >> beam.WithKeys(lambda model: model.id)  # pylint: disable=no-value-for-parameter
            | 'Allow only one item per key' >>
            (beam.combiners.Sample.FixedSizePerKey(1))
            | 'Remove the IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
            |
            'Flatten the list of lists of models' >> beam.FlatMap(lambda x: x)
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (opportunities_results
                | 'Count the output' >>
                (job_result_transforms.ResultsToJobRunResults()))
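The middle of the pipeline above keeps only one model per ID before writing (WithKeys, Sample.FixedSizePerKey(1), Values, FlatMap). A hedged sketch of just that deduplication step, with dicts standing in for the models (all names illustrative):

import apache_beam as beam

models = [{'id': 'm1'}, {'id': 'm1'}, {'id': 'm2'}]

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create(models)
        | 'KeyById' >> beam.WithKeys(lambda model: model['id'])
        # Keeps at most one model per ID; emits (id, [model]) pairs.
        | 'OnePerKey' >> beam.combiners.Sample.FixedSizePerKey(1)
        | 'DropIds' >> beam.Values()
        # Each value is a single-element list, so flatten back to models.
        | 'FlattenLists' >> beam.FlatMap(lambda batch: batch)
        | beam.Map(print))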
Example #6
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        user_settings_models = (
            self.pipeline
            | 'Get all UserSettingsModels' >>
            (ndb_io.GetModels(user_models.UserSettingsModel.get_all())))

        old_user_stats_models = (
            self.pipeline
            | 'Get all UserStatsModels' >>
            (ndb_io.GetModels(user_models.UserStatsModel.get_all())))

        # Creates a UserStatsModel for each user that does not have one yet.
        new_user_stats_models = (
            (user_settings_models, old_user_stats_models)
            | 'Merge models' >> beam.Flatten()
            # Returns a PCollection of
            # (model.id, (user_settings_models, user_stats_models)) or
            # (model.id, (user_settings_models,)).
            | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
            # Discards model.id from the PCollection.
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            # Only keep groupings that indicate that
            # the UserStatsModel is missing.
            | 'Filter pairs of models' >>
            beam.Filter(lambda models: (len(list(models)) == 1 and isinstance(
                list(models)[0], user_models.UserSettingsModel)))
            # Choosing the first element.
            | 'Transform tuples into models' >>
            beam.Map(lambda models: list(models)[0])
            # Creates the missing UserStatsModels.
            | 'Create new user stat models' >> beam.ParDo(
                CreateUserStatsModel()))

        unused_put_result = (
            (new_user_stats_models, old_user_stats_models)
            | 'Merge new and old models together' >> beam.Flatten()
            | 'Update the dashboard stats' >> beam.ParDo(
                UpdateWeeklyCreatorStats())
            | 'Put models into the datastore' >> ndb_io.PutModels())

        new_user_stats_job_result = (
            new_user_stats_models
            | 'Count all new models' >> beam.combiners.Count.Globally()
            | 'Only create result for new models when > 0' >>
            (beam.Filter(lambda x: x > 0))
            | 'Create result for new models' >>
            beam.Map(lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x)))
        old_user_stats_job_result = (
            old_user_stats_models
            | 'Count all old models' >> beam.combiners.Count.Globally()
            | 'Only create result for old models when > 0' >>
            (beam.Filter(lambda x: x > 0))
            | 'Create result for old models' >>
            beam.Map(lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x)))

        return ((new_user_stats_job_result, old_user_stats_job_result)
                | 'Merge new and old results together' >> beam.Flatten())
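A small sketch of the "find entities missing a companion model" shape used above, with namedtuples standing in for the two NDB model classes (all names here are assumptions):

import collections

import apache_beam as beam

Settings = collections.namedtuple('Settings', ['id'])
Stats = collections.namedtuple('Stats', ['id'])

with beam.Pipeline() as pipeline:
    settings_pcoll = pipeline | 'CreateSettings' >> beam.Create(
        [Settings('u1'), Settings('u2'), Settings('u3')])
    stats_pcoll = pipeline | 'CreateStats' >> beam.Create([Stats('u1')])
    _ = (
        (settings_pcoll, stats_pcoll)
        | 'Merge models' >> beam.Flatten()
        | 'Group by ID' >> beam.GroupBy(lambda m: m.id)
        | 'Drop the ID key' >> beam.Values()
        | 'Materialize groups' >> beam.Map(list)
        # A group holding only the Settings entry means the Stats companion
        # is missing for that user.
        | 'Keep users missing stats' >> beam.Filter(
            lambda group: len(group) == 1 and isinstance(group[0], Settings))
        | beam.Map(print))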
Example #7
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Generates the translation contributions stats.

        Returns:
            PCollection. A PCollection of 'SUCCESS x' results, where x is
            the number of generated stats.
        """
        suggestions_grouped_by_target = (
            self.pipeline
            | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
                suggestion_models.GeneralSuggestionModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Filter translate suggestions' >> beam.Filter(lambda m: (
                m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
            | 'Transform to suggestion domain object' >> beam.Map(
                suggestion_services.get_suggestion_from_model)
            | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
        exp_opportunities = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Transform to opportunity domain object' >>
            beam.Map(opportunity_services.
                     get_exploration_opportunity_summary_from_model)
            | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

        new_user_stats_models = (
            {
                'suggestion': suggestions_grouped_by_target,
                'opportunity': exp_opportunities
            }
            | 'Merge models' >> beam.CoGroupByKey()
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
                x['suggestion'][0] if len(x['suggestion']) else [],
                list(x['opportunity'][0])[0]
                if len(x['opportunity']) else None))
            | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
            | 'Generate models from stats' >> beam.MapTuple(
                self._generate_translation_contribution_model))

        unused_put_result = (
            new_user_stats_models
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (new_user_stats_models
                | 'Count all new models' >>
                (beam.combiners.Count.Globally().without_defaults())
                | 'Only create result for new models when > 0' >>
                (beam.Filter(lambda x: x > 0))
                | 'Create result for new models' >>
                beam.Map(lambda x: job_run_result.JobRunResult(
                    stdout='SUCCESS %s' % x)))
Example #8
def find_duplicates_fn(inputs: _PCollection) -> _DoOutputsTuple:
    return (inputs \
        | 'WithSymbolAndDataTypeAsKey' >>
            beam.Map(lambda d: ((d.symbol, d.data_type), d)) \
        | 'GroupByKey' >> beam.GroupByKey() \
        | 'Values' >> beam.Values() \
        | "FindDuplicates" >> beam.FlatMap(find_duplicates).with_outputs(
            'duplicates_with_different_values',
            main='safe_to_delete_duplicate_ids'))
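find_duplicates itself is not shown above. The sketch below uses a stand-in splitter and toy records (all assumed) to show the same GroupByKey | Values | FlatMap(...).with_outputs(...) shape and how the tagged outputs are consumed:

import apache_beam as beam
from apache_beam import pvalue

def split_groups(records):
    # Stand-in for find_duplicates: groups whose members disagree go to a
    # tagged side output, everything else stays on the main output.
    if len(set(records)) > 1:
        yield pvalue.TaggedOutput('duplicates_with_different_values', records)
    else:
        yield records[0]

with beam.Pipeline() as pipeline:
    grouped = (
        pipeline
        | beam.Create([('k1', 1), ('k1', 1), ('k2', 2), ('k2', 3)])
        | 'GroupByKey' >> beam.GroupByKey()
        | 'Values' >> beam.Values()
        | 'ToList' >> beam.Map(list))
    results = grouped | 'FindDuplicates' >> beam.FlatMap(
        split_groups).with_outputs(
            'duplicates_with_different_values', main='safe_to_delete')
    _ = results.safe_to_delete | 'PrintSafe' >> beam.Map(print)
    _ = (results['duplicates_with_different_values']
         | 'PrintConflicts' >> beam.Map(print))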
Example #9
def produce_calc_fn(inputs: PCollection, calc_producer: calc.CalcProducer,
                    inputs_shape: calc.CalcInputs) -> PCollection:
    return (inputs \
        | 'FilterByInputsShape' >> beam.Filter(filter_by_input_shapes,
                                               inputs_shape) \
        | 'WithSymbolAsKey' >> beam.Map(lambda d: (d.symbol, d)) \
        | 'GroupByKey' >> beam.GroupByKey() \
        | 'Values' >> beam.Values() \
        | 'PerformCalc' >> beam.FlatMap(perform_calc, inputs_shape,
                                        calc_producer))
Example #10
    def expand(self, p):
        avg_score = (p
                | beam.Values()
                | beam.CombineGlobally(
                    beam.combiners.MeanCombineFn()).as_singleton_view()
                )
        return (p
                | 'compute_spammers' >> beam.ParDo(
                    FilterUser(self.score_weight), avg_score=avg_score)
                )
Example #11
    def expand(self, blog_model_pcoll):
        return (
            blog_model_pcoll
            | 'Discard models with empty property value' >>
            (beam.Filter(lambda model: self.get_property_value(model) != ''))
            | 'Generate (%s, model) key value pairs' % self._property_name >>
            (beam.WithKeys(self.get_property_value))  # pylint: disable=no-value-for-parameter
            | 'Group pairs by their %s' % self._property_name >>
            (beam.GroupByKey())
            | 'Discard %s key' % self._property_name >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Discard models with unique %s' % self._property_name >>
            (beam.Filter(lambda models: len(models) > 1)))
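A minimal sketch of the same duplicate-detection shape with plain dicts standing in for the blog models (field names here are illustrative):

import apache_beam as beam

posts = [
    {'id': 1, 'title': 'Intro'},
    {'id': 2, 'title': 'Intro'},
    {'id': 3, 'title': 'Roadmap'},
    {'id': 4, 'title': ''},
]

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create(posts)
        | 'Drop empty titles' >> beam.Filter(lambda post: post['title'] != '')
        | 'Key by title' >> beam.WithKeys(lambda post: post['title'])
        | 'Group by title' >> beam.GroupByKey()
        | 'Drop title key' >> beam.Values()
        # Only groups with more than one post are duplicates.
        | 'Keep duplicates' >> beam.Filter(lambda group: len(list(group)) > 1)
        | beam.Map(print))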
Example #12
def write_proto_outputs(output_file, name, data, proto_message):
    """Write protos to a container."""
    if output_file.endswith((".txtpb.gz", ".txtpb")):
        _ = (data
             | "DropKey_%s" % name >> beam.Values()
             | "ToTextProto_%s" % name >> beam.Map(
                 _proto_to_text,
                 proto_message=proto_message,
             )
             |
             "WriteTextExamples_%s" % name >> beam.io.WriteToText(output_file))
        return
    elif output_file.endswith(".tfrecord"):
        _ = (data
             | "DropKey_%s" % name >> beam.Values()
             | "WriteTFRecordsExamples_%s" % name >> beam.io.WriteToTFRecord(
                 file_path_prefix=output_file,
                 shard_name_template="",
                 coder=beam.coders.ProtoCoder(proto_message)))
        return
    raise ValueError(f"Unsupported output format: {output_file}")
Example #13
  def test_custormized_counters_in_combine_fn(self):
    p = TestPipeline()
    input = (
        p
        | beam.Create([('key1', 'a'), ('key1', 'ab'), ('key1', 'abc'),
                       ('key2', 'uvxy'), ('key2', 'uvxyz')]))

    # The result of concatenating all values regardless of key.
    global_concat = (
        input
        | beam.Values()
        | beam.CombineGlobally(SortedConcatWithCounters()))

    # The (key, concatenated_string) pairs for all keys.
    concat_per_key = (input | beam.CombinePerKey(SortedConcatWithCounters()))

    # Verify the concatenated strings are correct.
    expected_concat_per_key = [('key1', 'aaabbc'), ('key2', 'uuvvxxyyz')]
    assert_that(
        global_concat, equal_to(['aaabbcuuvvxxyyz']), label='global concat')
    assert_that(
        concat_per_key,
        equal_to(expected_concat_per_key),
        label='concat per key')

    result = p.run()
    result.wait_until_finish()

    # Verify the values of metrics are correct.
    word_counter_filter = MetricsFilter().with_name('word_counter')
    query_result = result.metrics().query(word_counter_filter)
    if query_result['counters']:
      word_counter = query_result['counters'][0]
      self.assertEqual(word_counter.result, 5)

    word_lengths_filter = MetricsFilter().with_name('word_lengths')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['counters']:
      word_lengths = query_result['counters'][0]
      self.assertEqual(word_lengths.result, 15)

    word_len_dist_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_len_dist_filter)
    if query_result['distributions']:
      word_len_dist = query_result['distributions'][0]
      self.assertEqual(word_len_dist.result.mean, 3)

    last_word_len_filter = MetricsFilter().with_name('last_word_len')
    query_result = result.metrics().query(last_word_len_filter)
    if query_result['gauges']:
      last_word_len = query_result['gauges'][0]
      self.assertIn(last_word_len.result.value, [1, 2, 3, 4, 5])
Example #14
def downsample(pcoll, n, random_seed):
    """Deterministic PCollection downsampling using sha256."""

    return (pcoll
            | "HashKey" >>
            beam.Map(lambda x:
                     (str(hash_key(x[0], random_seed)), x)).with_output_types(
                         Tuple[str, Tuple[str, Dict]])
            | "Downsample" >> beam.combiners.Top.Of(
                n, key=lambda x: x[0]).with_output_types(
                    List[Tuple[str, Tuple[str, Dict]]])
            | "Unpack" >> beam.FlatMap(lambda x: x)
            | "DropHashKey" >> beam.Values())
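hash_key is not shown above; here is a hedged sketch of the same deterministic-sampling idea using hashlib.sha256 directly (the seed, element shape, and sample size are illustrative assumptions):

import hashlib

import apache_beam as beam

def _hash_key(element_id, seed):
    # Stable digest, so the same (id, seed) pair always survives the cut.
    return hashlib.sha256(
        ('%s:%s' % (seed, element_id)).encode('utf-8')).hexdigest()

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create([('a', {'x': 1}), ('b', {'x': 2}), ('c', {'x': 3})])
        | 'HashKey' >> beam.Map(lambda kv: (_hash_key(kv[0], 'seed42'), kv))
        # Top.Of emits one list of the n elements with the largest digests.
        | 'Downsample' >> beam.combiners.Top.Of(2, key=lambda kv: kv[0])
        | 'Unpack' >> beam.FlatMap(lambda kvs: kvs)
        | 'DropHashKey' >> beam.Values()
        | beam.Map(print))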
Example #15
    def test_values(self):
        expected = [
            'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday'
        ]

        inputs = [(0, 'Sunday'), (1, 'Monday'), (2, 'Tuesday'),
                  (3, 'Wednesday'), (4, 'Thursday'), (5, 'Friday'),
                  (6, 'Saturday')]

        with TestPipeline() as p:
            actual = (p | beam.Create(inputs) | beam.Values())

            assert_that(actual, equal_to(expected))
Example #16
    def _run_sampling(self, example_uris: Mapping[Text, Text], to_key_fn: Text,
                      output_artifact: Artifact, samples_per_key: int) -> None:
        """Runs stratified sampling on given example data.
    Args:
      example_uris: Mapping of example split name to example uri.
      to_key_fn: function to convert an example to a key
      output_artifact: Output artifact.
      samples_per_key: number of examples to keep per value of the key.
    Returns:
      None
    """

        d = {}
        exec(to_key_fn, globals(), d)  # how ugly is that?
        to_key = d['to_key']

        def to_keyed_value(m):
            return to_key(m), m

        with self._make_beam_pipeline() as pipeline:
            for split_name, example_uri in example_uris.items():
                data_list = [
                    (pipeline | 'ReadData[{}]'.format(split_name) >>
                     beam.io.ReadFromTFRecord(
                         file_pattern=io_utils.all_files_pattern(example_uri)))
                ]

                dest_path = os.path.join(
                    artifact_utils.get_split_uri([output_artifact],
                                                 split_name),
                    _STRATIFIED_EXAMPLES_FILE_PREFIX)

                _ = ([data for data in data_list]
                     | 'FlattenExamples ({})'.format(split_name) >>
                     beam.Flatten(pipeline=pipeline)
                     | 'ParseExamples ({})'.format(split_name) >> beam.Map(
                         tf.train.Example.FromString)
                     |
                     'Key ({})'.format(split_name) >> beam.Map(to_keyed_value)
                     | 'Sample per key ({})'.format(split_name) >>
                     beam.combiners.Sample.FixedSizePerKey(samples_per_key)
                     | 'Values ({})'.format(split_name) >> beam.Values()
                     | 'Flatten lists ({})'.format(split_name) >>
                     beam.FlatMap(lambda elements: elements)
                     | 'WriteStratifiedSamples ({})'.format(split_name) >>
                     beam.io.WriteToTFRecord(dest_path,
                                             file_name_suffix='.gz',
                                             coder=beam.coders.ProtoCoder(
                                                 tf.train.Example)))
                logging.info('Sampling result written to %s.', dest_path)
Example #17
    def expand(self, results):
        """Writes the given job results to the NDB datastore."""
        return (
            results
            # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
            # different signature than the one it's defined with.
            | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
            # GroupIntoBatches() requires (key, value) pairs as input, so we
            # give everything None keys and then immediately discard them.
            | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
            | beam.Values()
            | beam.FlatMap(job_run_result.JobRunResult.accumulate)
            | beam.Map(self.create_beam_job_run_result_model)
            | ndb_io.PutModels(self.datastoreio_stub))
Example #18
  def expand(self, pcoll):
    """Estimates the user defined statistic."""

    return (
        pcoll
        | 'AssignExampleToPartition' >> beam.Map(
            _assign_to_partition, num_partitions=self._num_partitions)
        | 'GroupPartitionsIntoList' >> beam.CombinePerKey(
            beam.combiners.SampleCombineFn(self._max_examples_per_partition))
        | 'RemovePartitionKey' >> beam.Values()
        | 'BatchExamples' >> beam.Map(batch_util.merge_single_batch)
        | 'ComputeStatsFn' >> beam.Map(self._stats_fn.compute)
        | 'ComputeMetaStats' >> beam.CombineGlobally(
            PartitionedStatisticsAnalyzer(min_partitions_stat_presence=self
                                          ._min_partitions_stat_presence)))
Example #19
  def expand(self, uri_to_content):

    # Compute the total number of documents, and prepare a singleton
    # PCollection to use as side input.
    total_documents = (
        uri_to_content
        | 'GetUris 1' >> beam.Keys()
        | 'GetUniqueUris' >> beam.RemoveDuplicates()
        | 'CountUris' >> beam.combiners.Count.Globally())

    # Create a collection of pairs mapping a URI to each of the words
    # in the document associated with that URI.

    def split_into_words(uri_line):
      uri, line = uri_line
      return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

    uri_to_words = (
        uri_to_content
        | 'SplitWords' >> beam.FlatMap(split_into_words))

    # Compute a mapping from each word to the total number of documents
    # in which it appears.
    word_to_doc_count = (
        uri_to_words
        | 'GetUniqueWordsPerDoc' >> beam.RemoveDuplicates()
        | 'GetWords' >> beam.Values()
        | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

    # Compute a mapping from each URI to the total number of words in the
    # document associated with that URI.
    uri_to_word_total = (
        uri_to_words
        | 'GetUris 2' >> beam.Keys()
        | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())

    # Count, for each (URI, word) pair, the number of occurrences of that word
    # in the document associated with the URI.
    uri_and_word_to_count = (
        uri_to_words
        | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

    # Adjust the above collection to a mapping from (URI, word) pairs to counts
    # into an isomorphic mapping from URI to (word, count) pairs, to prepare
    # for a join by the URI key.
    uri_to_word_and_count = (
        uri_and_word_to_count
        | 'ShiftKeys' >> beam.MapTuple(
            lambda uri_word, count: (uri_word[0], (uri_word[1], count))))
Example #20
  def _WriteExamples(pcoll: beam.pvalue.PCollection,
                     transformed_example_path: Text) -> beam.pvalue.PDone:
    """Writes transformed examples compressed in gzip format.

    Args:
      pcoll: PCollection of serialized transformed examples.
      transformed_example_path: path to write to.

    Returns:
      beam.pvalue.PDone.
    """
    return (
        pcoll
        | 'Values' >> beam.Values()
        | 'Write' >> beam.io.WriteToTFRecord(
            transformed_example_path, file_name_suffix='.gz'))
Example #21
    def create_test_pipeline(
        self, entry_stats: StatsType
    ) -> beam.PCollection[Dict[str, Union[int, Set[datetime.date]]]]:
        """Creates testing pipeline with some entry stats.

        Args:
            entry_stats: StatsType. The stats with which to start the pipeline.

        Returns:
            PCollection. The testing pipeline to be executed.
        """
        return (self.pipeline
                | beam.Create(entry_stats)
                | beam.CombineValues(cron_jobs.CombineStats())
                | beam.Values()  # pylint: disable=no-value-for-parameter
                | beam.Map(lambda stats: stats.to_dict()))
Example #22
0
def _MutualInformationTransformMerge(  # pylint: disable=invalid-name
        pcol, use_adjusted_mutual_info, min_diff_from_avg):
    """Computes mutual information for each key using the given accumulators."""
    feature_accumulator_pcol = (pcol | 'VocabCountPerLabelPerTokenMerge' >>
                                beam.CombinePerKey(_WeightedMeanCombineFn()))

    global_accumulator = (feature_accumulator_pcol
                          | 'DropKeys' >> beam.Values()
                          | 'VocabCountPerLabelGlobally' >>
                          beam.CombineGlobally(_WeightedMeanCombineFn()))

    return (feature_accumulator_pcol
            | 'CalculateMutualInformationPerToken' >> beam.Map(
                _calculate_mutual_information_for_binary_feature,
                beam.pvalue.AsSingleton(global_accumulator),
                use_adjusted_mutual_info=use_adjusted_mutual_info,
                min_diff_from_avg=min_diff_from_avg))
Example #23
    def expand(self, pcoll):
        field_names = []
        for field_descriptor in dataset_pb2.BondTopologySummary.DESCRIPTOR.fields:
            if field_descriptor.name.startswith('count_'):
                field_names.append(field_descriptor.name)

        return (pcoll
                | 'CombineByBTID' >> beam.CombinePerKey(
                    merge_bond_topology_summaries, field_names=field_names)
                | 'DropBTID' >> beam.Values()
                | 'Reshuffle' >> beam.Reshuffle()
                | 'CSVFormat' >> beam.Map(csv_format_bond_topology_summary,
                                          field_names=field_names)
                | 'WriteCSV' >> beam.io.WriteToText(
                    FLAGS.output_stem + '_bt_summary',
                    header='bt_id,' + ','.join(field_names),
                    num_shards=1,
                    file_name_suffix='.csv'))
Example #24
def values(test=None):
    # [START values]
    import apache_beam as beam

    with beam.Pipeline() as pipeline:
        plants = (pipeline
                  | 'Garden plants' >> beam.Create([
                      ('🍓', 'Strawberry'),
                      ('🥕', 'Carrot'),
                      ('🍆', 'Eggplant'),
                      ('🍅', 'Tomato'),
                      ('🥔', 'Potato'),
                  ])
                  | 'Values' >> beam.Values()
                  | beam.Map(print))
        # [END values]
        if test:
            test(plants)
Example #25
    def _WriteExamples(pcollection, unused_file_format,
                       transformed_example_path):
        """Writes transformed examples compressed in gzip format.

    Args:
      pcollection: PCollection of transformed examples.
      unused_file_format: file format, unused.
      transformed_example_path: path to write to.

    Returns:
      beam.pvalue.PDone.
    """
        return (pcollection
                | 'DropNoneKeys' >> beam.Values()
                | 'Write' >> beam.io.WriteToTFRecord(
                    transformed_example_path,
                    file_name_suffix='.gz',
                    coder=beam.coders.ProtoCoder(example_pb2.Example)))
Example #26
    def _pipeline(root):

        interactions = (
            create_data.read_interactions(root, inputs, name="input")
            | "DropKey" >> beam.Values()
            |
            "ToRetrievalExample" >> beam.FlatMap(_to_retrieval_interaction_fn)
            | "Reshuffle" >> beam.transforms.util.Reshuffle())

        # We expect ~37,568,664 interactions; taking 1 / 5000 for the test
        # split gives a reasonable test set size of ~7513.
        beam_utils.split_by_table_id_and_write(
            interactions,
            output_dir,
            train_suffix="@*",
            test_suffix="@*",
            num_splits=5000,
        )
Example #27
    def write_from_pcollection(self, pcoll_examples):
        """Add the final steps of the beam pipeline: write to parquet files."""
        import apache_beam as beam
        from .utils.beam_utils import WriteToParquet

        def inc_num_examples(example):
            beam.metrics.Metrics.counter(self._namespace, "num_examples").inc()

        # count examples
        _ = pcoll_examples | "Count N. Examples" >> beam.Map(inc_num_examples)

        # save dataset
        return (pcoll_examples
                | "Get values" >> beam.Values()
                | "Save to parquet" >> WriteToParquet(self._parquet_path,
                                                      self._schema,
                                                      num_shards=1,
                                                      shard_name_template=""))
Example #28
  def test_MeanCombineFn_combine(self):
    with TestPipeline() as p:
      input = (
          p
          | beam.Create([('a', 1), ('a', 1), ('a', 4), ('b', 1), ('b', 13)]))
      # The mean of all values regardless of key.
      global_mean = (
          input
          | beam.Values()
          | beam.CombineGlobally(combine.MeanCombineFn()))

      # The (key, mean) pairs for all keys.
      mean_per_key = (input | beam.CombinePerKey(combine.MeanCombineFn()))

      expected_mean_per_key = [('a', 2), ('b', 7)]
      assert_that(global_mean, equal_to([4]), label='global mean')
      assert_that(
          mean_per_key, equal_to(expected_mean_per_key), label='mean per key')
Example #29
  def test_custormized_counters_in_combine_fn_empty(self):
    p = TestPipeline()
    input = p | beam.Create([])

    # The result of concatenating all values regardless of key.
    global_concat = (
        input
        | beam.Values()
        | beam.CombineGlobally(SortedConcatWithCounters()))

    # The (key, concatenated_string) pairs for all keys.
    concat_per_key = (input | beam.CombinePerKey(SortedConcatWithCounters()))

    # Verify the concatenated strings are correct.
    assert_that(global_concat, equal_to(['']), label='global concat')
    assert_that(concat_per_key, equal_to([]), label='concat per key')

    result = p.run()
    result.wait_until_finish()

    # Verify the values of metrics are correct.
    word_counter_filter = MetricsFilter().with_name('word_counter')
    query_result = result.metrics().query(word_counter_filter)
    if query_result['counters']:
      word_counter = query_result['counters'][0]
      self.assertEqual(word_counter.result, 0)

    word_lengths_filter = MetricsFilter().with_name('word_lengths')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['counters']:
      word_lengths = query_result['counters'][0]
      self.assertEqual(word_lengths.result, 0)

    word_len_dist_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_len_dist_filter)
    if query_result['distributions']:
      word_len_dist = query_result['distributions'][0]
      self.assertEqual(word_len_dist.result.count, 0)

    last_word_len_filter = MetricsFilter().with_name('last_word_len')
    query_result = result.metrics().query(last_word_len_filter)

    # No element has ever been recorded.
    self.assertFalse(query_result['gauges'])
Example #30
def get_enriched_events(
        salesevent: beam.pvalue.PCollection,
        sideinput_collections: Dict[str, beam.pvalue.PCollection]
) -> beam.pvalue.PCollection:
    """Gets enriched events by:
        a) Calling a transform that combines the primary event with the
           corresponding side input values
        b) Grouping events by a dummy key to combine all events in a window
           into one shard
        c) Discarding the dummy key

    Args:
        salesevent: Event representing a sales transaction.
        sideinput_collections: Mapping of side input name to collection.
    """
    # yapf: disable
    return (salesevent
             | "Enrich event" >> beam.Map(transforms.enrich_event,
                                       AsDict(sideinput_collections["bonuspoints"]),
                                       AsDict(sideinput_collections["discountpct"]),
                                       AsDict(sideinput_collections["category"]))
             | "Group events by dummy Key" >> beam.GroupByKey()
             | "Discard dummy Key" >> beam.Values()
          )
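The enrichment itself depends on the side-input collections above; the sketch below (illustrative, bounded events) shows only the dummy-key consolidation described in steps b) and c) of the docstring:

import apache_beam as beam

with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create([{'sale_id': 1}, {'sale_id': 2}, {'sale_id': 3}])
        # b) Group all events in a window under one dummy key.
        | 'Add dummy key' >> beam.WithKeys(lambda _: 'shard')
        | 'Group by dummy key' >> beam.GroupByKey()
        # c) Discard the dummy key, leaving one iterable of events per window.
        | 'Discard dummy key' >> beam.Values()
        | beam.Map(list)
        | beam.Map(print))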