Example #1
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:

        collection_pairs = (
            self.pipeline
            | 'Get collection models' >> ndb_io.GetModels(
                collection_models.CollectionRightsModel.get_all())
            | 'Flatten owner_ids and format' >> beam.FlatMap(
                self._extract_user_and_collection_ids))

        user_pairs = (self.pipeline
                      | 'Get all user settings models' >> ndb_io.GetModels(
                          user_models.UserSettingsModel.get_all())
                      | 'Extract id and email' >>
                      beam.Map(lambda user_setting:
                               (user_setting.id, user_setting.email)))

        collection_ids_to_email_mapping = (
            (collection_pairs, user_pairs)
            | 'Group by user_id' >> beam.CoGroupByKey()
            | 'Drop user id' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Filter out results without any collection' >>
            beam.Filter(lambda collection_ids_and_email: len(
                collection_ids_and_email[0]) > 0))

        return (
            collection_ids_to_email_mapping
            | 'Get final result' >>
            beam.MapTuple(lambda collection, email: job_run_result.JobRunResult
                          .as_stdout('collection_ids: %s, email: %s' %
                                     (collection, email))))
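
A side note on the join above: CoGroupByKey over a tuple of keyed PCollections emits, per key, a tuple of lists in the same order as the inputs. The following minimal, self-contained sketch (plain Apache Beam on the default runner, with made-up IDs and emails; not Oppia code) reproduces the pattern:

import apache_beam as beam

with beam.Pipeline() as p:
    collection_pairs = p | 'Collections' >> beam.Create(
        [('u1', 'col_a'), ('u1', 'col_b')])
    user_pairs = p | 'Users' >> beam.Create(
        [('u1', 'u1@example.com'), ('u2', 'u2@example.com')])
    _ = (
        (collection_pairs, user_pairs)
        | beam.CoGroupByKey()
        | beam.Values()
        # Keep only users that own at least one collection.
        | beam.Filter(lambda ids_and_emails: len(ids_and_emails[0]) > 0)
        | beam.Map(print))  # (['col_a', 'col_b'], ['u1@example.com'])
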
Example #2
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        user_settings_models = (
            self.pipeline
            | 'Get all UserSettingsModels' >>
            (ndb_io.GetModels(user_models.UserSettingsModel.get_all())))

        old_user_stats_models = (
            self.pipeline
            | 'Get all UserStatsModels' >>
            (ndb_io.GetModels(user_models.UserStatsModel.get_all())))

        # Creates a UserStatsModel for each user that does not have one.
        new_user_stats_models = (
            (user_settings_models, old_user_stats_models)
            | 'Merge models' >> beam.Flatten()
            # Returns a PCollection of
            # (model.id, (user_settings_models, user_stats_models)) or
            # (model.id, (user_settings_models,)).
            | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
            # Discards model.id from the PCollection.
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            # Only keep groupings that indicate that
            # the UserStatsModel is missing.
            | 'Filter pairs of models' >>
            beam.Filter(lambda models: (len(list(models)) == 1 and isinstance(
                list(models)[0], user_models.UserSettingsModel)))
            # Choosing the first element.
            | 'Transform tuples into models' >>
            beam.Map(lambda models: list(models)[0])
            # Creates the missing UserStatsModels.
            | 'Create new user stat models' >> beam.ParDo(
                CreateUserStatsModel()))

        unused_put_result = (
            (new_user_stats_models, old_user_stats_models)
            | 'Merge new and old models together' >> beam.Flatten()
            | 'Update the dashboard stats' >> beam.ParDo(
                UpdateWeeklyCreatorStats())
            | 'Put models into the datastore' >> ndb_io.PutModels())

        new_user_stats_job_result = (
            new_user_stats_models
            | 'Count all new models' >> beam.combiners.Count.Globally()
            | 'Only create result for new models when > 0' >>
            (beam.Filter(lambda x: x > 0))
            | 'Create result for new models' >>
            beam.Map(lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x)))
        old_user_stats_job_result = (
            old_user_stats_models
            | 'Count all old models' >> beam.combiners.Count.Globally()
            | 'Only create result for old models when > 0' >>
            (beam.Filter(lambda x: x > 0))
            | 'Create result for old models' >>
            beam.Map(lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x)))

        return ((new_user_stats_job_result, old_user_stats_job_result)
                | 'Merge new and old results together' >> beam.Flatten())
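
The Flatten-then-GroupBy step above is a common Beam idiom for spotting entities that are missing a companion model: after merging, any ID whose group has exactly one element only appeared in one of the inputs. A minimal sketch of the idiom, assuming plain Apache Beam and invented (id, kind) tuples in place of NDB models:

import apache_beam as beam

with beam.Pipeline() as p:
    settings = p | 'Settings' >> beam.Create(
        [('u1', 'settings'), ('u2', 'settings')])
    stats = p | 'Stats' >> beam.Create([('u1', 'stats')])
    _ = (
        (settings, stats)
        | 'Merge' >> beam.Flatten()
        | 'Group by ID' >> beam.GroupBy(lambda kv: kv[0])
        | 'Drop key' >> beam.Values()
        # Groups of size one have no stats companion.
        | 'Keep singletons' >> beam.Filter(lambda group: len(list(group)) == 1)
        | 'Unwrap' >> beam.Map(lambda group: list(group)[0])
        | beam.Map(print))  # ('u2', 'settings')
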
Example #3
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        generating ExplorationOpportunitySummaryModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            generating ExplorationOpportunitySummaryModel.
        """

        topics = (self.pipeline
                  | 'Get all non-deleted topic models' >> (ndb_io.GetModels(
                      topic_models.TopicModel.get_all(include_deleted=False)))
                  | 'Get topic from model' >> beam.Map(
                      topic_fetchers.get_topic_from_model))

        story_ids_to_story = (
            self.pipeline
            | 'Get all non-deleted story models' >> ndb_io.GetModels(
                story_models.StoryModel.get_all(include_deleted=False))
            | 'Get story from model' >> beam.Map(
                story_fetchers.get_story_from_model)
            | 'Combine stories and ids' >> beam.Map(lambda story:
                                                    (story.id, story)))

        exp_ids_to_exp = (
            self.pipeline
            | 'Get all non-deleted exp models' >> ndb_io.GetModels(
                exp_models.ExplorationModel.get_all(include_deleted=False))
            | 'Get exploration from model' >> beam.Map(
                exp_fetchers.get_exploration_from_model)
            | 'Combine exploration and ids' >> beam.Map(lambda exp:
                                                        (exp.id, exp)))

        stories_dict = beam.pvalue.AsDict(story_ids_to_story)
        exps_dict = beam.pvalue.AsDict(exp_ids_to_exp)

        opportunities_results = (
            topics
            | beam.Map(self._generate_opportunities_related_to_topic,
                       stories_dict=stories_dict,
                       exps_dict=exps_dict))

        unused_put_result = (
            opportunities_results
            | 'Filter the results with SUCCESS status' >>
            beam.Filter(lambda result: result.is_ok())
            | 'Fetch the models to be put' >>
            beam.FlatMap(lambda result: result.unwrap())
            | 'Add ID as a key' >> beam.WithKeys(lambda model: model.id)  # pylint: disable=no-value-for-parameter
            | 'Allow only one item per key' >>
            (beam.combiners.Sample.FixedSizePerKey(1))
            | 'Remove the IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Flatten the list of lists of models' >>
            beam.FlatMap(lambda x: x)
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (opportunities_results
                | 'Count the output' >>
                (job_result_transforms.ResultsToJobRunResults()))
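
The stories_dict and exps_dict side inputs deserve a closer look: beam.pvalue.AsDict materializes a PCollection of (key, value) pairs into an ordinary Python dict that is handed to every call of the Map. A minimal sketch with invented story IDs (not Oppia code):

import apache_beam as beam

with beam.Pipeline() as p:
    stories = p | 'Stories' >> beam.Create([('s1', 'Story One')])
    story_ids = p | 'IDs' >> beam.Create(['s1', 's2'])
    _ = (
        story_ids
        # stories_dict arrives in the lambda as a plain dict side input.
        | beam.Map(
            lambda sid, stories_dict: stories_dict.get(sid, '<missing>'),
            stories_dict=beam.pvalue.AsDict(stories))
        | beam.Map(print))  # 'Story One', then '<missing>'
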
Example #4
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Generates the translation contributins stats.

        Returns:
            PCollection. A PCollection of 'SUCCESS x' results, where x is
            the number of generated stats..
        """
        suggestions_grouped_by_target = (
            self.pipeline
            | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
                suggestion_models.GeneralSuggestionModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Filter translate suggestions' >> beam.Filter(lambda m: (
                m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
            | 'Transform to suggestion domain object' >> beam.Map(
                suggestion_services.get_suggestion_from_model)
            | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
        exp_opportunities = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Transform to opportunity domain object' >>
            beam.Map(opportunity_services.
                     get_exploration_opportunity_summary_from_model)
            | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

        new_user_stats_models = (
            {
                'suggestion': suggestions_grouped_by_target,
                'opportunity': exp_opportunities
            }
            | 'Merge models' >> beam.CoGroupByKey()
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
                x['suggestion'][0] if len(x['suggestion']) else [],
                list(x['opportunity'][0])[0]
                if len(x['opportunity']) else None))
            | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
            | 'Generate models from stats' >> beam.MapTuple(
                self._generate_translation_contribution_model))

        unused_put_result = (
            new_user_stats_models
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (new_user_stats_models
                | 'Count all new models' >>
                (beam.combiners.Count.Globally().without_defaults())
                | 'Only create result for new models when > 0' >>
                (beam.Filter(lambda x: x > 0))
                | 'Create result for new models' >>
                beam.Map(lambda x: job_run_result.JobRunResult(
                    stdout='SUCCESS %s' % x)))
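
One detail worth noting about the counting step: Count.Globally() emits a single 0 even when its input is empty (which is why other examples here pair it with a > 0 filter), while the .without_defaults() variant used above emits nothing at all for empty input. A quick sketch of the default behaviour, assuming the DirectRunner:

import apache_beam as beam

with beam.Pipeline() as p:
    empty = (p
             | beam.Create(['only element'])
             | 'Drop everything' >> beam.Filter(lambda _: False))
    _ = (empty
         | 'Count' >> beam.combiners.Count.Globally()
         # Prints 0; with .without_defaults() nothing would be printed.
         | beam.Map(print))
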
Example #5
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        generating ExplorationOpportunitySummaryModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            generating ExplorationOpportunitySummaryModel.
        """

        topics = (self.pipeline
                  | 'Get all non-deleted topic models' >> (ndb_io.GetModels(
                      topic_models.TopicModel.get_all(include_deleted=False)))
                  | 'Get topic from model' >> beam.Map(
                      topic_fetchers.get_topic_from_model))

        story_ids_to_story = (
            self.pipeline
            | 'Get all non-deleted story models' >> ndb_io.GetModels(
                story_models.StoryModel.get_all(include_deleted=False))
            | 'Get story from model' >> beam.Map(
                story_fetchers.get_story_from_model)
            | 'Combine stories and ids' >> beam.Map(lambda story:
                                                    (story.id, story)))

        exp_ids_to_exp = (
            self.pipeline
            | 'Get all non-deleted exp models' >> ndb_io.GetModels(
                exp_models.ExplorationModel.get_all(include_deleted=False))
            | 'Get exploration from model' >> beam.Map(
                exp_fetchers.get_exploration_from_model)
            | 'Combine exploration and ids' >> beam.Map(lambda exp:
                                                        (exp.id, exp)))

        stories_dict = beam.pvalue.AsDict(story_ids_to_story)
        exps_dict = beam.pvalue.AsDict(exp_ids_to_exp)

        opportunities_results = (
            topics
            | beam.Map(self._generate_opportunities_related_to_topic,
                       stories_dict=stories_dict,
                       exps_dict=exps_dict))

        unused_put_result = (
            opportunities_results
            | 'Filter the results with SUCCESS status' >>
            beam.Filter(lambda result: result['status'] == 'SUCCESS')
            | 'Fetch the models to be put' >>
            beam.FlatMap(lambda result: result['models'])
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (opportunities_results
                | 'Fetch the job results' >>
                beam.Map(lambda result: result['job_result']))
Example #6
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        # PCollection holding the value of the relevant config property
        # (the batch index).
        config_property = (
            self.pipeline
            | 'Get all config properties' >> ndb_io.GetModels(
                config_models.ConfigPropertyModel.get_all())
            | 'Get the batch_index_for_mailchimp property value' >> beam.Filter(
                lambda model: model.id == 'batch_index_for_mailchimp')
            | 'Get value' >> beam.Map(lambda model: model.value)
        )

        batch_index_dict = beam.pvalue.AsSingleton(config_property)

        # PCollection with all user ids that have opted in for email
        # newsletters.
        relevant_user_ids = (
            self.pipeline
            | 'Get all UserEmailPreferencesModel' >> ndb_io.GetModels(
                user_models.UserEmailPreferencesModel.get_all().filter(
                    user_models.UserEmailPreferencesModel.site_updates == True # pylint: disable=singleton-comparison
                ))
            | 'Extract user ID' >> beam.Map(
                lambda preferences_model: preferences_model.id)
        )

        valid_user_ids = beam.pvalue.AsIter(relevant_user_ids)

        # PCollection of all user emails opted in for newsletters.
        relevant_user_emails = (
            self.pipeline
            | 'Get all user settings models' >> ndb_io.GetModels(
                user_models.UserSettingsModel.get_all())
            | 'Filter user models' >> (
                beam.Filter(
                    lambda model, ids: model.id in ids, ids=valid_user_ids))
            | 'Get email' >> (beam.Map(lambda model: model.email))
        )

        mailchimp_results = (
            relevant_user_emails
            # A large batch size is given so that all emails are included in a
            # single list.
            | 'Combine into a list' >> beam.CombineGlobally(CombineItems())
            | 'Send mailchimp request for current batch' >> beam.ParDo(
                SendBatchMailchimpRequest(), batch_index_dict=batch_index_dict,
                test_run=True)
            | 'Get final result' >> beam.Map(
                lambda result: job_run_result.JobRunResult.as_stdout(
                    result.value))
        )

        return mailchimp_results
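
Here beam.pvalue.AsSingleton converts a one-element PCollection (the config value) into a plain value passed along to the ParDo. A minimal sketch with an invented batch index (SendBatchMailchimpRequest itself is Oppia-specific and not reproduced here):

import apache_beam as beam

with beam.Pipeline() as p:
    batch_index = p | 'Config' >> beam.Create([3])
    emails = p | 'Emails' >> beam.Create(['a@example.com', 'b@example.com'])
    _ = (emails
         | beam.Map(lambda email, idx: (idx, email),
                    idx=beam.pvalue.AsSingleton(batch_index))
         | beam.Map(print))  # (3, 'a@example.com'), (3, 'b@example.com')
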
Example #7
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        deleting ExplorationOpportunitySummaryModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            deleting ExplorationOpportunitySummaryModel.
        """
        exp_opportunity_summary_model = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False))
        )

        unused_delete_result = (
            exp_opportunity_summary_model
            | beam.Map(lambda model: model.key)
            | 'Delete all models' >> ndb_io.DeleteModels()
        )

        return (
            exp_opportunity_summary_model
            | 'Create job run result' >> (
                job_result_transforms.CountObjectsToJobRunResult())
        )
Example #8
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:

        exp_models_pcoll = (self.pipeline
                            | 'Get all ExplorationModels' >> ndb_io.GetModels(
                                exp_models.ExplorationModel.get_all()))

        exp_models_filtered = (exp_models_pcoll
                               | 'Filter Math ExplorationModels' >>
                               beam.Filter(self.contains_math_interactions))

        exp_models_with_states = (
            exp_models_filtered
            | 'Mapping exp_ids with states' >>
            (beam.FlatMap(self.flat_map_exp_with_states)))

        exp_models_with_states_filtered = (
            exp_models_with_states
            | 'Filtering out states without math interactions' >>
            (beam.Filter(lambda tup: tup[2]['interaction']['id']
                         in feconf.MATH_INTERACTION_IDS)))

        exp_models_with_states_and_rules = (
            exp_models_with_states_filtered
            | 'Mapping with rule types list' >>
            (beam.Map(self.map_with_rule_types)))

        return (
            exp_models_with_states_and_rules
            | 'Final output' >> beam.Map(
                job_run_result.JobRunResult.as_stdout))
Example #9
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        deleting ExplorationOpportunitySummaryModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            deleting ExplorationOpportunitySummaryModel.
        """
        exp_opportunity_summary_model = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False)))

        unused_delete_result = (exp_opportunity_summary_model
                                | beam.Map(lambda model: model.key)
                                | 'Delete all models' >> ndb_io.DeleteModels())

        return (exp_opportunity_summary_model
                | 'Count all new models' >> beam.combiners.Count.Globally()
                | 'Only create result for new models when > 0' >>
                (beam.Filter(lambda n: n > 0))
                | 'Create result for new models' >>
                beam.Map(lambda n: job_run_result.JobRunResult(
                    stdout='SUCCESS %s' % n)))
Example #10
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        computing exploration recommendations.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            computing exploration recommendations.
        """

        exp_summary_models = (
            self.pipeline
            | 'Get all non-deleted models' >>
            (ndb_io.GetModels(exp_models.ExpSummaryModel.get_all())))

        exp_summary_iter = beam.pvalue.AsIter(exp_summary_models)

        exp_recommendations_models = (
            exp_summary_models
            | 'Compute similarity' >> beam.ParDo(ComputeSimilarity(),
                                                 exp_summary_iter)
            | 'Group similarities per exploration ID' >> beam.GroupByKey()
            | 'Sort and slice similarities' >> beam.MapTuple(
                lambda exp_id, similarities:
                (exp_id, self._sort_and_slice_similarities(similarities)))
            | 'Create recommendation models' >> beam.MapTuple(
                self._create_recommendation))

        unused_put_result = (
            exp_recommendations_models
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (exp_recommendations_models
                | 'Create job run result' >>
                (job_result_transforms.CountObjectsToJobRunResult()))
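
The GroupByKey/MapTuple pair above is the usual way to post-process grouped values: GroupByKey yields (key, iterable) pairs and MapTuple unpacks them into separate lambda arguments. A small sketch with made-up similarity scores (the real _sort_and_slice_similarities is not reproduced):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('exp1', 0.9), ('exp1', 0.5), ('exp2', 0.1)])
         | beam.GroupByKey()
         | beam.MapTuple(
             lambda exp_id, scores: (exp_id, sorted(scores, reverse=True)))
         | beam.Map(print))  # ('exp1', [0.9, 0.5]) and ('exp2', [0.1])
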
Example #11
    def run(
        self
    ) -> beam.PCollection[blog_validation_errors.DuplicateBlogTitleError]:
        return (self.pipeline
                | 'Get every Blog Summary Model' >>
                (ndb_io.GetModels(blog_models.BlogPostSummaryModel.query()))
                | GetModelsWithDuplicatePropertyValues('title')
                | 'Flatten models into a list of errors' >>
                beam.FlatMap(lambda models: [
                    blog_validation_errors.DuplicateBlogTitleError(model)
                    for model in models
                ]))
Example #12
    def run(
        self
    ) -> beam.PCollection[blog_validation_errors.DuplicateBlogUrlError]:
        return (self.pipeline
                | 'Get every Blog Post Model' >>
                (ndb_io.GetModels(blog_models.BlogPostModel.query()))
                | GetModelsWithDuplicatePropertyValues('url_fragment')
                | 'Flatten models into a list of errors' >>
                beam.FlatMap(lambda models: [
                    blog_validation_errors.DuplicateBlogUrlError(model)
                    for model in models
                ]))
Example #13
    def test_read_from_datastore(self) -> None:
        model_list = [
            self.create_model(base_models.BaseModel, id='a'),
            self.create_model(base_models.BaseModel, id='b'),
            self.create_model(base_models.BaseModel, id='c'),
        ]
        self.put_multi(model_list)

        self.assertItemsEqual(self.get_base_models(),
                              model_list)  # type: ignore[no-untyped-call]

        model_pcoll = (self.pipeline
                       | ndb_io.GetModels(base_models.BaseModel.get_all()))

        self.assert_pcoll_equal(model_pcoll, model_list)
Example #14
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        indexing the exploration summaries in Elasticsearch.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            indexing the exploration summaries in Elasticsearch.
        """
        return (
            self.pipeline
            | 'Get all non-deleted models' >> (ndb_io.GetModels(
                exp_models.ExpSummaryModel.get_all(include_deleted=False)))
            | 'Split models into batches' >>
            beam.transforms.util.BatchElements(
                max_batch_size=self.MAX_BATCH_SIZE)
            | 'Index batches of models' >> beam.ParDo(
                IndexExplorationSummaries()))
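
BatchElements, as used above, buffers single elements into lists of up to max_batch_size, which keeps the number of calls to an external service (here the search index) small. A quick sketch; note that the exact batch boundaries are chosen adaptively by the runner:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(list(range(10)))
         | beam.transforms.util.BatchElements(max_batch_size=4)
         | beam.Map(print))  # e.g. [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
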
Example #15
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of invalid explorations with their ID and
        actual title length.

        Returns:
            PCollection. A PCollection of invalid explorations with their ID
            and actual title length.
        """
        total_explorations = (
            self.pipeline
            | 'Get all ExplorationModels' >> ndb_io.GetModels(
                exp_models.ExplorationModel.get_all(include_deleted=False))
            | 'Get exploration from model' >> beam.Map(
                exp_fetchers.get_exploration_from_model))

        exp_ids_with_exceeding_max_title_len = (
            total_explorations
            | 'Combine exploration title and ids' >>
            beam.Map(lambda exp: (exp.id, exp.title))
            | 'Filter explorations with title length greater than 36' >>
            beam.Filter(lambda exp: len(exp[1]) > 36))

        report_number_of_exps_queried = (
            total_explorations
            | 'Report count of exp models' >>
            (job_result_transforms.CountObjectsToJobRunResult('EXPS')))

        report_number_of_invalid_exps = (
            exp_ids_with_exceeding_max_title_len
            | 'Report count of invalid exp models' >>
            (job_result_transforms.CountObjectsToJobRunResult('INVALID')))

        report_invalid_ids_and_their_actual_len = (
            exp_ids_with_exceeding_max_title_len
            | 'Save info on invalid exps' >>
            beam.Map(lambda objects: job_run_result.JobRunResult.
                     as_stderr('The id of exp is %s and its actual len is %s' %
                               (objects[0], len(objects[1])))))

        return ((report_number_of_exps_queried, report_number_of_invalid_exps,
                 report_invalid_ids_and_their_actual_len)
                | 'Combine results' >> beam.Flatten())
Example #16
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        matching entity_type as collection.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            matching entity_type as collection.
        """
        feedback_model_matched_as_collection = (
            self.pipeline
            | 'Get all GeneralFeedbackThread models' >> ndb_io.GetModels(
                feedback_models.GeneralFeedbackThreadModel.get_all())
            | 'Extract entity_type' >>
            beam.Map(lambda feedback_model: feedback_model.entity_type)
            | 'Match entity_type' >>
            beam.Filter(lambda entity_type: entity_type == 'collection'))

        return (feedback_model_matched_as_collection
                | 'Count the output' >>
                (job_result_transforms.CountObjectsToJobRunResult()))
Example #17
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Generates the translation contributions stats.

        Returns:
            PCollection. A PCollection of 'SUCCESS x' results, where x is
            the number of generated stats.
        """
        suggestions_grouped_by_target = (
            self.pipeline
            | 'Get all non-deleted suggestion models' >> ndb_io.GetModels(
                suggestion_models.GeneralSuggestionModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Filter translate suggestions' >> beam.Filter(lambda m: (
                m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT))
            | 'Transform to suggestion domain object' >> beam.Map(
                suggestion_services.get_suggestion_from_model)
            | 'Group by target' >> beam.GroupBy(lambda m: m.target_id))
        exp_opportunities = (
            self.pipeline
            | 'Get all non-deleted opportunity models' >> ndb_io.GetModels(
                opportunity_models.ExplorationOpportunitySummaryModel.get_all(
                    include_deleted=False))
            # We need to window the models so that CoGroupByKey below
            # works properly.
            | 'Transform to opportunity domain object' >>
            beam.Map(opportunity_services.
                     get_exploration_opportunity_summary_from_model)
            | 'Group by ID' >> beam.GroupBy(lambda m: m.id))

        user_stats_results = (
            {
                'suggestion': suggestions_grouped_by_target,
                'opportunity': exp_opportunities
            }
            | 'Merge models' >> beam.CoGroupByKey()
            | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats(
                x['suggestion'][0] if len(x['suggestion']) else [],
                list(x['opportunity'][0])[0]
                if len(x['opportunity']) else None)))

        user_stats_models = (
            user_stats_results
            | 'Filter ok results' >>
            beam.Filter(lambda key_and_result: key_and_result[1].is_ok())
            | 'Unpack result' >> beam.MapTuple(lambda key, result:
                                               (key, result.unwrap()))
            | 'Combine the stats' >> beam.CombinePerKey(CombineStats())
            | 'Generate models from stats' >> beam.MapTuple(
                self._generate_translation_contribution_model))

        user_stats_error_job_run_results = (
            user_stats_results
            | 'Filter err results' >>
            beam.Filter(lambda key_and_result: key_and_result[1].is_err())
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Remove keys' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Transform result to job run result' >>
            (job_result_transforms.ResultsToJobRunResults()))

        unused_put_result = (
            user_stats_models
            | 'Put models into the datastore' >> ndb_io.PutModels())

        user_stats_models_job_run_results = (
            user_stats_models
            | 'Create job run result' >>
            (job_result_transforms.CountObjectsToJobRunResult()))

        return ((user_stats_error_job_run_results,
                 user_stats_models_job_run_results)
                | 'Merge job run results' >> beam.Flatten())
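
Unlike the tuple form used in Example #1, the dict form of CoGroupByKey used in 'Merge models' above yields, per key, a dict mapping each tag to the list of matching values; this is exactly the shape the _generate_stats lambda unpacks. A minimal sketch with invented suggestion/opportunity placeholders:

import apache_beam as beam

with beam.Pipeline() as p:
    suggestions = p | 'S' >> beam.Create(
        [('exp1', 'sugg_a'), ('exp1', 'sugg_b')])
    opportunities = p | 'O' >> beam.Create([('exp1', 'opp'), ('exp2', 'opp')])
    _ = ({'suggestion': suggestions, 'opportunity': opportunities}
         | beam.CoGroupByKey()
         | beam.Map(print))
    # e.g. ('exp1', {'suggestion': ['sugg_a', 'sugg_b'], 'opportunity': ['opp']})
    #      ('exp2', {'suggestion': [], 'opportunity': ['opp']})
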
Example #18
    def run(self):
        """Returns a PCollection of audit errors aggregated from all models.

        Returns:
            PCollection. A PCollection of audit errors discovered during the
            audit.
        """
        existing_models, deleted_models = (
            self.pipeline
            | 'Get all models' >>
            (ndb_io.GetModels(datastore_services.query_everything()))
            | 'Partition by model.deleted' >>
            (beam.Partition(lambda model, _: int(model.deleted), 2)))

        models_of_kind_by_index = (
            existing_models
            # NOTE: Partition returns a statically-sized list of PCollections.
            # Creating partitions is wasteful when there are fewer items than
            # there are partitions, like in our unit tests. In exchange, in
            # production the job will be able to take advantage of the high
            # parallelizability of PCollections, which are designed for enormous
            # datasets and parallel processing.
            #
            # Alternatively, we could have used GroupBy. However, that returns
            # an _iterable_ of items rather than a PCollection, and so it is
            # vulnerable to out-of-memory errors.
            #
            # Since this job is concerned with running audits on EVERY MODEL IN
            # STORAGE, Partition is the clear winner regardless of the overhead
            # we'll see in unit tests.
            | 'Split models into parallelizable PCollections' >>
            beam.Partition(
                lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
                # NOTE: Partition requires a hard-coded number of slices; it
                # cannot be used with dynamic numbers generated in a pipeline.
                # KIND_BY_INDEX is a constant tuple so that requirement is
                # satisfied in this case.
                len(KIND_BY_INDEX),
                KIND_BY_INDEX))

        existing_key_count_pcolls = []
        missing_key_error_pcolls = []
        audit_error_pcolls = [
            deleted_models
            | 'Apply ValidateDeletedModel on deleted models' >>
            (beam.ParDo(base_validation.ValidateDeletedModel()))
        ]

        model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
        for kind, models_of_kind in model_groups:
            audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

            if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES:
                existing_key_count_pcolls.append(
                    models_of_kind | GetExistingModelKeyCounts(kind))

            if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR:
                missing_key_error_pcolls.extend(
                    models_of_kind | GetMissingModelKeyErrors(kind))

        existing_key_counts = (
            existing_key_count_pcolls
            | 'Flatten PCollections of existing key counts' >> beam.Flatten())
        missing_key_errors = (
            missing_key_error_pcolls
            | 'Flatten PCollections of missing key errors' >> beam.Flatten())
        audit_error_pcolls.append(
            (existing_key_counts, missing_key_errors)
            | 'Group counts and errors by key' >> beam.CoGroupByKey()
            | 'Filter keys without any errors' >>
            (beam.FlatMapTuple(self._get_model_relationship_errors)))

        return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
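
beam.Partition, which the NOTE above weighs against GroupBy, splits one PCollection into a fixed number of PCollections in a single pass; the partition function receives each element plus the number of partitions, and extra arguments (here the KIND_BY_INDEX tuple) are forwarded to it. A minimal sketch of the same partition-by-kind idea with invented kind names:

import apache_beam as beam

KINDS = ('BaseModel', 'UserModel')

with beam.Pipeline() as p:
    by_kind = (p
               | beam.Create(['UserModel', 'BaseModel', 'UserModel'])
               | beam.Partition(
                   lambda kind, _, kinds: kinds.index(kind),
                   len(KINDS), KINDS))
    _ = by_kind[1] | beam.Map(print)  # 'UserModel' printed twice
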
Example #19
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of results from the story migration.

        Returns:
            PCollection. A PCollection of results from the story migration.
        """

        unmigrated_story_models = (
            self.pipeline
            | 'Get all non-deleted story models' >> (
                ndb_io.GetModels(story_models.StoryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda story_model: story_model.id)
        )
        story_summary_models = (
            self.pipeline
            | 'Get all non-deleted story summary models' >> (
                ndb_io.GetModels(story_models.StorySummaryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story summary keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda story_summary_model: story_summary_model.id)
        )
        topics = (
            self.pipeline
            | 'Get all non-deleted topic models' >> (
                ndb_io.GetModels(topic_models.TopicModel.get_all()))
            | 'Transform model into domain object' >> beam.Map(
                topic_fetchers.get_topic_from_model)
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add topic keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda topic: topic.id)
        )
        topic_id_to_topic = beam.pvalue.AsDict(topics)

        migrated_story_results = (
            unmigrated_story_models
            | 'Transform and migrate model' >> beam.MapTuple(
                self._migrate_story, topic_id_to_topic=topic_id_to_topic)
        )
        migrated_stories = (
            migrated_story_results
            | 'Filter oks' >> beam.Filter(
                lambda result_item: result_item.is_ok())
            | 'Unwrap ok' >> beam.Map(
                lambda result_item: result_item.unwrap())
        )
        migrated_story_job_run_results = (
            migrated_story_results
            | 'Generate results for migration' >> (
                job_result_transforms.ResultsToJobRunResults('STORY PROCESSED'))
        )

        story_changes = (
            unmigrated_story_models
            | 'Generate story changes' >> beam.FlatMapTuple(
                self._generate_story_changes)
        )

        story_objects_list = (
            {
                'story_model': unmigrated_story_models,
                'story_summary_model': story_summary_models,
                'story': migrated_stories,
                'story_change': story_changes
            }
            | 'Merge objects' >> beam.CoGroupByKey()
            | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Remove unmigrated stories' >> beam.Filter(
                lambda x: len(x['story_change']) > 0 and len(x['story']) > 0)
            | 'Reorganize the story objects' >> beam.Map(lambda objects: {
                    'story_model': objects['story_model'][0],
                    'story_summary_model': objects['story_summary_model'][0],
                    'story': objects['story'][0],
                    'story_change': objects['story_change'][0]
                })
        )

        story_objects_list_job_run_results = (
            story_objects_list
            | 'Transform story objects into job run results' >> (
                job_result_transforms.CountObjectsToJobRunResult(
                    'STORY MIGRATED'))
        )

        cache_deletion_job_run_results = (
            story_objects_list
            | 'Delete story from cache' >> beam.Map(
                lambda story_objects: self._delete_story_from_cache(
                    story_objects['story']))
            | 'Generate results for cache deletion' >> (
                job_result_transforms.ResultsToJobRunResults('CACHE DELETION'))
        )

        story_models_to_put = (
            story_objects_list
            | 'Generate story models to put' >> beam.FlatMap(
                lambda story_objects: self._update_story(
                    story_objects['story_model'],
                    story_objects['story'],
                    story_objects['story_change'],
                ))
        )

        story_summary_models_to_put = (
            story_objects_list
            | 'Generate story summary models to put' >> beam.Map(
                lambda story_objects: self._update_story_summary(
                    story_objects['story'],
                    story_objects['story_summary_model']
                ))
        )

        unused_put_results = (
            (story_models_to_put, story_summary_models_to_put)
            | 'Merge models' >> beam.Flatten()
            | 'Put models into the datastore' >> ndb_io.PutModels()
        )

        return (
            (
                cache_deletion_job_run_results,
                migrated_story_job_run_results,
                story_objects_list_job_run_results
            )
            | beam.Flatten()
        )
Example #20
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        generating SkillOpportunityModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            generating SkillOpportunityModel.
        """
        question_skill_link_models = (
            self.pipeline
            | 'Get all non-deleted QuestionSkillLinkModels' >>
            (ndb_io.GetModels(
                question_models.QuestionSkillLinkModel.get_all(
                    include_deleted=False)))
            | 'Group QuestionSkillLinkModels by skill ID' >>
            beam.GroupBy(lambda n: n.skill_id))

        skills = (
            self.pipeline
            | 'Get all non-deleted SkillModels' >> (ndb_io.GetModels(
                skill_models.SkillModel.get_all(include_deleted=False)))
            | 'Get skill object from model' >> beam.Map(
                skill_fetchers.get_skill_from_model)
            | 'Group skill objects by skill ID' >>
            beam.GroupBy(lambda m: m.id))

        skills_with_question_counts = (
            {
                'skill': skills,
                'question_skill_links': question_skill_link_models
            }
            | 'Merge by skill ID' >> beam.CoGroupByKey()
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Remove skill IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
            # We are using itertools.chain.from_iterable to flatten
            # question_skill_links from a 2D list into a 1D list.
            | 'Flatten skill and question_skill_links' >> beam.Map(
                lambda obj: {
                    'skill': list(obj['skill'][0])[0],
                    'question_skill_links': list(
                        itertools.chain.from_iterable(
                            obj['question_skill_links']))
                }))

        opportunities_results = (
            skills_with_question_counts
            | beam.Map(lambda obj: self._create_skill_opportunity_model(
                obj['skill'], obj['question_skill_links'])))

        unused_put_result = (
            opportunities_results
            | 'Filter the results with OK status' >>
            beam.Filter(lambda result: result.is_ok())
            | 'Fetch the models to be put' >>
            beam.Map(lambda result: result.unwrap())
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (opportunities_results
                | 'Transform Results to JobRunResults' >>
                (job_result_transforms.ResultsToJobRunResults()))
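
For reference, the itertools.chain.from_iterable call in the 'Flatten skill and question_skill_links' step above is plain-Python flattening of a list of lists into one list:

import itertools

nested = [['q1', 'q2'], [], ['q3']]
print(list(itertools.chain.from_iterable(nested)))  # ['q1', 'q2', 'q3']
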
Example #21
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of results from the skill migration.

        Returns:
            PCollection. A PCollection of results from the skill migration.
        """
        unmigrated_skill_models = (
            self.pipeline
            | 'Get all non-deleted skill models' >>
            (ndb_io.GetModels(skill_models.SkillModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add skill model ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda skill_model: skill_model.id))
        skill_summary_models = (
            self.pipeline
            | 'Get all non-deleted skill summary models' >>
            (ndb_io.GetModels(skill_models.SkillSummaryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add skill summary ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda skill_summary_model: skill_summary_model.id))

        migrated_skill_results = (unmigrated_skill_models
                                  | 'Transform and migrate model' >>
                                  beam.MapTuple(self._migrate_skill))
        migrated_skills = (
            migrated_skill_results
            | 'Filter oks' >>
            beam.Filter(lambda result_item: result_item.is_ok())
            | 'Unwrap ok' >> beam.Map(
                lambda result_item: result_item.unwrap()))
        migrated_skill_job_run_results = (
            migrated_skill_results
            | 'Generate results for migration' >>
            (job_result_transforms.ResultsToJobRunResults('SKILL PROCESSED')))

        skill_changes = (unmigrated_skill_models
                         | 'Generate skill changes' >> beam.FlatMapTuple(
                             self._generate_skill_changes))

        skill_objects_list = (
            {
                'skill_model': unmigrated_skill_models,
                'skill_summary_model': skill_summary_models,
                'skill': migrated_skills,
                'skill_changes': skill_changes
            }
            | 'Merge objects' >> beam.CoGroupByKey()
            | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Remove unmigrated skills' >> beam.Filter(
                lambda x: len(x['skill_changes']) > 0 and len(x['skill']) > 0)
            | 'Reorganize the skill objects' >> beam.Map(
                lambda objects: {
                    'skill_model': objects['skill_model'][0],
                    'skill_summary_model': objects['skill_summary_model'][0],
                    'skill': objects['skill'][0],
                    'skill_changes': objects['skill_changes']
                }))

        skill_objects_list_job_run_results = (
            skill_objects_list
            | 'Transform skill objects into job run results' >> (
                job_result_transforms.CountObjectsToJobRunResult(
                    'SKILL MIGRATED')))

        cache_deletion_job_run_results = (
            skill_objects_list
            | 'Delete skill from cache' >>
            beam.Map(lambda skill_object: self._delete_skill_from_cache(
                skill_object['skill']))
            | 'Generate results for cache deletion' >>
            (job_result_transforms.ResultsToJobRunResults('CACHE DELETION')))

        skill_models_to_put = (
            skill_objects_list
            | 'Generate skill models to put' >>
            beam.FlatMap(lambda skill_objects: self._update_skill(
                skill_objects['skill_model'],
                skill_objects['skill'],
                skill_objects['skill_changes'],
            )))

        skill_summary_models_to_put = (
            skill_objects_list
            | 'Generate skill summary models to put' >>
            beam.Map(lambda skill_objects: self._update_skill_summary(
                skill_objects['skill'], skill_objects['skill_summary_model'])))

        unused_put_results = (
            (skill_models_to_put, skill_summary_models_to_put)
            | 'Merge models' >> beam.Flatten()
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (
            (cache_deletion_job_run_results, migrated_skill_job_run_results,
             skill_objects_list_job_run_results)
            | beam.Flatten())
Example #22
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        deleted_user_ids_collection = (
            self.pipeline
            | 'Get all deleted user models' >> ndb_io.GetModels(
                user_models.DeletedUserModel.get_all())
            | 'Extract user IDs' >>
            beam.Map(lambda deleted_user_model: deleted_user_model.id))
        deleted_user_ids = beam.pvalue.AsIter(deleted_user_ids_collection)

        sent_email_models_to_delete = (
            self.pipeline
            | 'Get all sent email models' >> ndb_io.GetModels(
                email_models.SentEmailModel.get_all())
            | 'Filter sent email models that belong to deleted users' >>
            (beam.Filter(lambda model, ids:
                         (model.sender_id in ids or model.recipient_id in ids),
                         ids=deleted_user_ids)))
        sent_email_models_to_delete_result = (
            sent_email_models_to_delete
            | 'Count sent email models to be deleted' >>
            (job_result_transforms.CountObjectsToJobRunResult('SENT EMAILS')))

        bulk_email_models_to_delete = (
            self.pipeline
            | 'Get all bulk email models' >> ndb_io.GetModels(
                email_models.BulkEmailModel.get_all())
            | 'Filter bulk email models that belong to deleted users' >>
            (beam.Filter(lambda model, ids: model.sender_id in ids,
                         ids=deleted_user_ids)))
        bulk_email_models_to_delete_result = (
            bulk_email_models_to_delete
            | 'Count bulk email models to be deleted' >>
            (job_result_transforms.CountObjectsToJobRunResult('BULK EMAILS')))

        unsent_feedback_email_models_to_delete = (
            self.pipeline
            | 'Get all unsent feedback models' >> ndb_io.GetModels(
                feedback_models.UnsentFeedbackEmailModel.get_all())
            | 'Filter unsent feedback models that belong to deleted users' >>
            (beam.Filter(lambda model, ids: model.id in ids,
                         ids=deleted_user_ids)))
        unsent_feedback_email_models_to_delete_result = (
            unsent_feedback_email_models_to_delete
            | 'Count unsent feedback email models to be deleted' >>
            (job_result_transforms.CountObjectsToJobRunResult(
                'FEEDBACK EMAILS')))

        user_bulk_emails_models_to_delete = (
            self.pipeline
            | 'Get all user bulk email models' >> ndb_io.GetModels(
                user_models.UserBulkEmailsModel.get_all())
            | 'Filter user bulk email models that belong to deleted users' >>
            (beam.Filter(lambda model, ids: model.id in ids,
                         ids=deleted_user_ids)))
        user_bulk_emails_models_to_delete_result = (
            user_bulk_emails_models_to_delete
            | 'Count user bulk email models to be deleted' >>
            (job_result_transforms.CountObjectsToJobRunResult(
                'USER BULK EMAILS')))

        unused_models_deletion = (
            (sent_email_models_to_delete, bulk_email_models_to_delete,
             unsent_feedback_email_models_to_delete,
             user_bulk_emails_models_to_delete)
            | 'Merge models' >> beam.Flatten()
            | 'Extract keys' >> beam.Map(lambda model: model.key)
            | 'Delete models' >> ndb_io.DeleteModels())

        return (
            (
                sent_email_models_to_delete_result,
                bulk_email_models_to_delete_result,
                unsent_feedback_email_models_to_delete_result,
                user_bulk_emails_models_to_delete_result,
            )
            | 'Merge results' >> beam.Flatten())
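
A closing note on the side input used throughout this example: beam.pvalue.AsIter hands the deleted user IDs to each Filter call as an iterable, and the `in` checks scan it linearly. A minimal sketch with invented IDs; for very large ID sets, a dict-backed side input would avoid the linear scan:

import apache_beam as beam

with beam.Pipeline() as p:
    deleted_ids = p | 'Deleted' >> beam.Create(['u2'])
    model_ids = p | 'Models' >> beam.Create(['u1', 'u2', 'u3'])
    _ = (model_ids
         | beam.Filter(lambda model_id, ids: model_id in ids,
                       ids=beam.pvalue.AsIter(deleted_ids))
         | beam.Map(print))  # 'u2'
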