def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    user_settings_models = (
        self.pipeline
        | 'Get all UserSettingsModels' >> (
            ndb_io.GetModels(user_models.UserSettingsModel.get_all()))
    )

    old_user_stats_models = (
        self.pipeline
        | 'Get all UserStatsModels' >> (
            ndb_io.GetModels(user_models.UserStatsModel.get_all()))
    )

    # Creates UserStatsModels for the users that do not have one yet.
    new_user_stats_models = (
        (user_settings_models, old_user_stats_models)
        | 'Merge models' >> beam.Flatten()
        # Returns a PCollection of
        # (model.id, (user_settings_models, user_stats_models)) or
        # (model.id, (user_settings_models,)).
        | 'Group models with same ID' >> beam.GroupBy(lambda m: m.id)
        # Discards model.id from the PCollection.
        | 'Get rid of key' >> beam.Values()  # pylint: disable=no-value-for-parameter
        # Only keeps groupings that indicate that the UserStatsModel is
        # missing.
        | 'Filter pairs of models' >> beam.Filter(
            lambda models: (
                len(list(models)) == 1 and
                isinstance(list(models)[0], user_models.UserSettingsModel)))
        # Chooses the first element.
        | 'Transform tuples into models' >> beam.Map(
            lambda models: list(models)[0])
        # Creates the missing UserStatsModels.
        | 'Create new user stat models' >> beam.ParDo(CreateUserStatsModel())
    )

    unused_put_result = (
        (new_user_stats_models, old_user_stats_models)
        | 'Merge new and old models together' >> beam.Flatten()
        | 'Update the dashboard stats' >> beam.ParDo(
            UpdateWeeklyCreatorStats())
        | 'Put models into the datastore' >> ndb_io.PutModels()
    )

    new_user_stats_job_result = (
        new_user_stats_models
        | 'Count all new models' >> beam.combiners.Count.Globally()
        | 'Only create result for new models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for new models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS NEW %s' % x))
    )
    old_user_stats_job_result = (
        old_user_stats_models
        | 'Count all old models' >> beam.combiners.Count.Globally()
        | 'Only create result for old models when > 0' >> (
            beam.Filter(lambda x: x > 0))
        | 'Create result for old models' >> beam.Map(
            lambda x: job_run_result.JobRunResult(
                stdout='SUCCESS OLD %s' % x))
    )

    return (
        (new_user_stats_job_result, old_user_stats_job_result)
        | 'Merge new and old results together' >> beam.Flatten()
    )
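# A minimal standalone sketch (not part of the job above) of the
# Flatten -> GroupBy -> Values -> Filter pattern used to detect users whose
# stats model is missing. The data here is hypothetical; the real job groups
# NDB model instances and uses an isinstance() check instead of bare ids.
import apache_beam as beam

with beam.Pipeline() as p:
    settings = p | 'Settings ids' >> beam.Create(['u1', 'u2'])
    stats = p | 'Stats ids' >> beam.Create(['u1'])
    missing = (
        (settings, stats)
        | beam.Flatten()
        | beam.GroupBy(lambda uid: uid)
        | beam.Values()
        # A singleton group means the id appeared in only one of the two
        # input collections, i.e. the stats model is missing.
        | beam.Filter(lambda group: len(list(group)) == 1)
        | beam.Map(print))  # Prints the group for 'u2'.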
def run(
    self
) -> beam.PCollection[blog_validation_errors.DuplicateBlogTitleError]:
    return (
        self.pipeline
        | 'Get every Blog Summary Model' >> (
            ndb_io.GetModels(blog_models.BlogPostSummaryModel.query()))
        | GetModelsWithDuplicatePropertyValues('title')
        | 'Flatten models into a list of errors' >> beam.FlatMap(
            lambda models: [
                blog_validation_errors.DuplicateBlogTitleError(model)
                for model in models
            ])
    )
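# GetModelsWithDuplicatePropertyValues is a custom PTransform that is not
# shown here. A hypothetical sketch of the grouping it presumably performs:
# group models by the property value and keep only groups with duplicates.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([{'title': 'a'}, {'title': 'a'}, {'title': 'b'}])
        | beam.GroupBy(lambda m: m['title'])
        | beam.Values()
        | beam.Filter(lambda group: len(list(group)) > 1)
        | beam.Map(print))  # Prints only the group of duplicated titles.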
def run(self):
    # type: () -> beam.PCollection[blog_validation_errors.DuplicateBlogUrlError]
    return (
        self.pipeline
        | 'Get every Blog Post Model' >> (
            ndb_io.GetModels(  # type: ignore[no-untyped-call]
                blog_models.BlogPostModel.query()))
        | GetModelsWithDuplicatePropertyValues('url_fragment')
        | 'Flatten models into a list of errors' >> beam.FlatMap(
            lambda models: [
                blog_validation_errors.DuplicateBlogUrlError(model)
                for model in models
            ])
    )
def run(self):
    # type: () -> beam.PCollection[blog_validation_errors.DuplicateBlogTitleError]
    return (
        self.pipeline
        | 'Get every Blog Summary Model' >> (
            ndb_io.GetModels(  # type: ignore[no-untyped-call]
                blog_models.BlogPostSummaryModel.query(),
                self.datastoreio_stub))
        | GetModelsWithDuplicatePropertyValues('title')
        | 'Flatten models into a list of errors' >> beam.FlatMap(
            lambda models: [
                blog_validation_errors.DuplicateBlogTitleError(model)
                for model in models
            ])
    )
def test_read_from_datastore(self) -> None:
    model_list = [
        self.create_model(base_models.BaseModel, id='a'),
        self.create_model(base_models.BaseModel, id='b'),
        self.create_model(base_models.BaseModel, id='c'),
    ]
    self.put_multi(model_list)

    self.assertItemsEqual(self.get_base_models(), model_list)  # type: ignore[no-untyped-call]

    model_pcoll = (
        self.pipeline
        | ndb_io.GetModels(base_models.BaseModel.get_all())
    )

    self.assert_pcoll_equal(model_pcoll, model_list)
def test_read_from_datastore(self):
    model_list = [
        self.create_model(base_models.BaseModel, id='a'),
        self.create_model(base_models.BaseModel, id='b'),
        self.create_model(base_models.BaseModel, id='c'),
    ]
    self.put_multi(model_list)

    self.assertItemsEqual(self.get_everything(), model_list)

    model_pcoll = (
        self.pipeline
        | ndb_io.GetModels(
            datastore_services.query_everything(namespace=self.namespace))
    )

    self.assert_pcoll_equal(model_pcoll, model_list)
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. """ return ( self.pipeline | 'Get all non-deleted models' >> (ndb_io.GetModels( exp_models.ExpSummaryModel.get_all(include_deleted=False))) | 'Split models into batches' >> beam.transforms.util.BatchElements( max_batch_size=self.MAX_BATCH_SIZE) | 'Index batches of models' >> beam.ParDo( IndexExplorationSummaries()))
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. Returns: PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from the Elastic Search. """ exp_summary_models = ( self.pipeline | 'Get all non-deleted models' >> (ndb_io.GetModels(exp_models.ExpSummaryModel.get_all()))) exp_summary_iter = beam.pvalue.AsIter(exp_summary_models) exp_recommendations_models = ( exp_summary_models | 'Compute similarity' >> beam.ParDo(ComputeSimilarity(), exp_summary_iter) | 'Group similarities per exploration ID' >> beam.GroupByKey() | 'Sort and slice similarities' >> beam.MapTuple( lambda exp_id, similarities: (exp_id, self._sort_and_slice_similarities(similarities))) | 'Create recommendation models' >> beam.MapTuple( self._create_recommendation)) unused_put_result = ( exp_recommendations_models | 'Put models into the datastore' >> ndb_io.PutModels()) return (exp_recommendations_models | 'Count all new models' >> beam.combiners.Count.Globally() | 'Only create result for new models when > 0' >> (beam.Filter(lambda x: x > 0)) | 'Create result for new models' >> beam.Map(lambda x: job_run_result.JobRunResult( stdout='SUCCESS %s' % x)))
def run(self): """Returns a PCollection of audit errors aggregated from all models. Returns: PCollection. A PCollection of audit errors discovered during the audit. Raises: ValueError. When the `datastoreio` option, which provides the PTransforms for performing datastore IO operations, is None. """ existing_models, deleted_models = ( self.pipeline | 'Get all models' >> ndb_io.GetModels( datastore_services.query_everything(), self.datastoreio_stub) | 'Partition by model.deleted' >> ( beam.Partition(lambda model, _: int(model.deleted), 2)) ) models_of_kind_by_index = ( existing_models # NOTE: Partition returns a statically-sized list of PCollections. # Creating partitions is wasteful when there are fewer items than # there are partitions, like in our unit tests. In exchange, in # production the job will be able to take advantage of the high # parallelizability of PCollections, which are designed for enormous # datasets and parallel processing. # # Alternatively, we could have used GroupBy. However, that returns # an _iterable_ of items rather than a PCollection, and so it is # vulnerable to out-of-memory errors. # # Since this job is concerned with running audits on EVERY MODEL IN # STORAGE, Partition is the clear winner regardless of the overhead # we'll see in unit tests. | 'Split models into parallelizable PCollections' >> beam.Partition( lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)), # NOTE: Partition requires a hard-coded number of slices; it # cannot be used with dynamic numbers generated in a pipeline. # KIND_BY_INDEX is a constant tuple so that requirement is # satisfied in this case. len(KIND_BY_INDEX), KIND_BY_INDEX) ) existing_key_count_pcolls = [] missing_key_error_pcolls = [] audit_error_pcolls = [ deleted_models | 'Apply ValidateDeletedModel on deleted models' >> ( beam.ParDo(base_validation.ValidateDeletedModel())) ] model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index) for kind, models_of_kind in model_groups: audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind)) if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES: existing_key_count_pcolls.append( models_of_kind | GetExistingModelKeyCounts(kind)) if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR: missing_key_error_pcolls.extend( models_of_kind | GetMissingModelKeyErrors(kind)) existing_key_counts = ( existing_key_count_pcolls | 'Flatten PCollections of existing key counts' >> beam.Flatten() ) missing_key_errors = ( missing_key_error_pcolls | 'Flatten PCollections of missing key errors' >> beam.Flatten() ) audit_error_pcolls.append( (existing_key_counts, missing_key_errors) | 'Group counts and errors by key' >> beam.CoGroupByKey() | 'Filter keys without any errors' >> ( beam.FlatMapTuple(self._get_model_relationship_errors)) ) return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
def run(self) -> beam.PCollection[job_run_result.JobRunResult]: """Generates the translation contributins stats. Returns: PCollection. A PCollection of 'SUCCESS x' results, where x is the number of generated stats.. """ suggestions_grouped_by_target = ( self.pipeline | 'Get all non-deleted suggestion models' >> ndb_io.GetModels( suggestion_models.GeneralSuggestionModel.get_all( include_deleted=False)) # We need to window the models so that CoGroupByKey below # works properly. | 'Window the suggestions' >> beam.WindowInto( beam.window.Sessions(10 * 60)) | 'Filter translate suggestions' >> beam.Filter(lambda m: ( m.suggestion_type == feconf.SUGGESTION_TYPE_TRANSLATE_CONTENT)) | 'Transform to suggestion domain object' >> beam.Map( suggestion_services.get_suggestion_from_model) | 'Group by target' >> beam.GroupBy(lambda m: m.target_id)) exp_opportunities = ( self.pipeline | 'Get all non-deleted opportunity models' >> ndb_io.GetModels( opportunity_models.ExplorationOpportunitySummaryModel.get_all( include_deleted=False)) # We need to window the models so that CoGroupByKey below # works properly. | 'Window the opportunities' >> beam.WindowInto( beam.window.Sessions(10 * 60)) | 'Transform to opportunity domain object' >> beam.Map(opportunity_services. get_exploration_opportunity_summary_from_model) | 'Group by ID' >> beam.GroupBy(lambda m: m.id)) new_user_stats_models = ( { 'suggestion': suggestions_grouped_by_target, 'opportunity': exp_opportunities } | 'Merge models' >> beam.CoGroupByKey() | 'Get rid of key' >> beam.Values() # pylint: disable=no-value-for-parameter | 'Generate stats' >> beam.ParDo(lambda x: self._generate_stats( x['suggestion'][0] if len(x['suggestion']) else [], x[ 'opportunity'][0][0] if len(x['opportunity']) else None)) | 'Group by key' >> beam.GroupByKey() | 'Combine the stats' >> beam.CombineValues(CombineStats()) | 'Generate models from stats' >> beam.MapTuple( self._generate_translation_contribution_model)) unused_put_result = ( new_user_stats_models | 'Put models into the datastore' >> ndb_io.PutModels()) return (new_user_stats_models | 'Count all new models' >> (beam.combiners.Count.Globally().without_defaults()) | 'Only create result for new models when > 0' >> (beam.Filter(lambda x: x > 0)) | 'Create result for new models' >> beam.Map(lambda x: job_run_result.JobRunResult( stdout='SUCCESS %s' % x)))