def testGetSetContext(self):
    """Test module's get_context and _set functions."""
    ctx = context.Context(None, None)
    self.assertFalse(context.get())
    context.Context._set(ctx)
    self.assertEquals(ctx, context.get())
    context.Context._set(None)
    self.assertEquals(None, context.get())
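Nearly all of the examples that follow share one pattern: handler code calls context.get() and reads its per-job configuration from mapreduce_spec.mapper.params. A minimal illustrative sketch of that pattern, not taken from any of the projects below (the parameter name and namespace check are hypothetical):

from mapreduce import context
from mapreduce import operation as op

def example_map(entity):
    # Read parameters that were supplied when the mapreduce job was started.
    params = context.get().mapreduce_spec.mapper.params
    wanted_namespace = params.get('namespace')  # hypothetical parameter
    if wanted_namespace and entity.key().namespace() != wanted_namespace:
        return
    # Mappers report progress by yielding counter operations.
    yield op.counters.Increment('entities_seen')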
def discover_events_from_sources(fbl, sources):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        min_potential_events = params.get('min_potential_events', 0)
        sources = [
            x for x in sources
            if min_potential_events <= (x.num_potential_events or 0)
        ]

    # don't scrape sources that prove useless and give mostly garbage events
    #sources = [x for x in sources if x.fraction_potential_are_real() > 0.05]

    logging.info("Looking up sources: %s", [x.graph_id for x in sources])
    fbl.request_multi(fb_api.LookupThingFeed, [x.graph_id for x in sources])
    fbl.batch_fetch()
    logging.info("Fetched %s URLs, saved %s updates", fbl.fb_fetches, fbl.db_updates)

    discovered_list = set()
    for source in sources:
        try:
            thing_feed = fbl.fetched_data(fb_api.LookupThingFeed, source.graph_id)
            discovered_list.update(process_thing_feed(source, thing_feed))
        except fb_api.NoFetchedDataException, e:
            logging.warning("Failed to fetch data for thing: %s", str(e))
def index_map(data):
    """Index map function.

    Args:
        data: Refers to a line from the input files. It is actually composed
            of a tuple (lineinfo, line). This is the return value from the
            ZipLineInputReader, available among the input readers of
            mapreduce.

    Yields:
        A tuple in the format <word>_SEP<title>_SEP<character>,
        <serial_line_key>. line_key needs to be serialized because it is an
        object and reduce expects strings as input.
    """
    info, line = data
    if line.strip() == '':
        return
    _, file_index, offset = info
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    metadata_blob = params['metadata']
    blob_reader = blobstore.BlobReader(metadata_blob[0].split('/')[-1])
    all_meta = blob_reader.read().split('\n')[:-1]  # the last one is empty
    metadata = select_metadata(all_meta, file_index)
    char_map = metadata['pos_to_char']
    sorted_offsets = metadata['sorted_offsets']
    character = get_character(char_map, sorted_offsets, offset)
    title = metadata['title']
    line_db = Line(line=line)
    line_key = line_db.put()
    for word in get_words(line.lower()):
        yield (word + _SEP + title + _SEP + character, pickle.dumps(line_key))
def write(self, data):
    """Write data.

    Args:
        data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if len(data) != 2:
        logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                      len(data), data)
    try:
        key = str(data[0])
        value = str(data[1])
    except TypeError:
        logging.error("Expecting a tuple, but got %s: %s",
                      data.__class__.__name__, data)
    file_index = key.__hash__() % len(self._filenames)
    pool_name = "kv_pool%d" % file_index
    filename = self._filenames[file_index]
    if ctx.get_pool(pool_name) is None:
        ctx.register_pool(
            pool_name, output_writers.RecordsPool(filename=filename, ctx=ctx))
    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    ctx.get_pool(pool_name).append(proto.Encode())
def write(self, data):
    """Write data.

    Args:
        data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if len(data) != 2:
        logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                      len(data), data)
    try:
        key = str(data[0])
        value = str(data[1])
    except TypeError:
        logging.error("Expecting a tuple, but got %s: %s",
                      data.__class__.__name__, data)

    file_index = key.__hash__() % len(self._filehandles)

    # Work-around: Since we don't have access to the context in the to_json()
    # function, but we need to flush each pool before we serialize the
    # filehandle, we rely on a member variable instead of using context for
    # pool management.
    pool = self._pools[file_index]
    if pool is None:
        filehandle = self._filehandles[file_index]
        pool = output_writers.GCSRecordsPool(filehandle=filehandle, ctx=ctx)
        self._pools[file_index] = pool

    proto = kv_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    pool.append(proto.Encode())
def map(entity):
    mapper_params = context.get().mapreduce_spec.mapper.params
    namespace = mapper_params['course']
    unit_id = mapper_params['unit_id']
    content = mapper_params['content']
    settings = mapper_params['settings']

    app_context = sites.get_app_context_for_namespace(namespace)
    course = courses.Course(None, app_context=app_context)
    unit = course.find_unit_by_id(str(unit_id))

    submitted_contents = student_work.Submission.get_contents(
        unit.unit_id, entity.get_key())
    if not submitted_contents:
        return
    submission = transforms.loads(submitted_contents)
    if not submission:
        return
    lang = submission.keys()[0]
    code = submission[lang]['code']
    filename = submission[lang]['filename']

    evaluator_id = content.get('evaluator', 'mooshak')
    evaluator_class = evaluator.ProgramEvaluatorRegistory.get(evaluator_id)
    if not evaluator_class:
        return

    old_score = course.get_score(entity, unit.unit_id)
    prog_evaluator = evaluator_class(course, settings, unit, content)
    prog_evaluator.evaluate(entity, False, lang, filename, code)
    new_score = course.get_score(entity, unit.unit_id)
    yield (str(old_score), new_score)
def row_handler(line_details):
    year = context.get().mapreduce_spec.mapper.params.get('student_year')
    pos, line = line_details
    if not pos:  # skip header
        raise StopIteration()
    if not year:
        logging.error('Failed to create a new student. The year is missing')
        raise StopIteration()
    row = csv.reader(StringIO.StringIO(line)).next()
    if len(row) < 4:
        logging.error(
            'Failed to create a new student. Some fields are missing: %s', line)
        raise StopIteration()
    _, surname, name, email = row[0:4]
    try:
        entity = Student.new_student(surname, name, email=email, year=year,
                                     commit=False)
    except (ValueError, TypeError, ValidationError) as e:
        logging.error(
            'Failed to create a new student. wrong fields: %s.\n'
            'Error: %s', line, e)
        raise StopIteration
    yield op.db.Put(entity)
    yield op.counters.Increment('student added/updated', 1)
def discover_events_from_sources(fbl, sources):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        min_potential_events = params.get('min_potential_events', 0)
        sources = [x for x in sources if min_potential_events <= (x.num_potential_events or 0)]

    # Maybe we can build this into the upfront mapreduce filter?
    # Unfortunately, '!=' is more difficult to do and requires better schema planning,
    # so let's just do this for now.
    # Hopefully this also prevents the API Rate limits on GET {user-id} lookups.
    sources = [x for x in sources if x.graph_type != thing_db.GRAPH_TYPE_PROFILE]

    # don't scrape sources that prove useless and give mostly garbage events
    #sources = [x for x in sources if x.fraction_potential_are_real() > 0.05]

    if fbl.allow_cache:
        logging.error('discover_events_from_sources unexpectedly called with the cache enabled!')

    logging.info("Looking up sources: %s", [x.graph_id for x in sources])
    fbl.request_multi(fb_api.LookupThingCommon, [x.graph_id for x in sources])

    # Now based on the sources we know, grab the appropriate fb data
    for s in sources:
        fbl.request(thing_db.get_lookup_for_graph_type(s.graph_type), s.graph_id)
    fbl.batch_fetch()
    logging.info("Fetched %s URLs, saved %s updates", fbl.fb_fetches, fbl.db_updates)

    discovered_list = set()
    for source in sources:
        try:
            discovered_list.update(_process_thing_feed(fbl, source))
        except fb_api.NoFetchedDataException, e:
            logging.warning("Failed to fetch data for thing: %s", str(e))
def process(org_app):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params

    program_type = params['program_type']
    program_key_str = params['program_key']

    # now the script is used only for GCI
    if program_type != 'gci':
        return

    program = GCIProgram.get_by_key_name(program_key_str)

    survey_query = OrgAppSurvey.all(keys_only=True).filter('program', program)
    survey_key = survey_query.get()

    # We can skip the survey records not belonging to the given program.
    if org_app.survey.key() != survey_key:
        return

    # TODO(daniel): create a MapReduce/Task RequestData
    data = MapreduceRequestData(program, Site.get_by_key_name('site'))

    absolute_url = links.ABSOLUTE_LINKER.program(
        program, gci_url_names.CREATE_GCI_ORG_PROFILE)

    if org_app.status == 'pre-accepted':
        org_app_logic.setStatus(data, org_app, 'accepted', absolute_url)
        yield operation.counters.Increment("proposals_accepted")
    elif org_app.status == 'pre-rejected':
        org_app_logic.setStatus(data, org_app, 'rejected', absolute_url)
        yield operation.counters.Increment("proposals_rejected")
    else:
        yield operation.counters.Increment("proposals_ignored")
def process(task):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    program_key = params['program_key']

    try:
        program = GCIProgram.get_by_key_name(program_key)
    except db.BadValueError:
        yield operation.counters.Increment('program_key_is_empty_or_invalid')
        return

    def subscribe_to_task_txn(task_key, subscribe):
        task = GCITask.get(task_key)
        task.subscribers = list(set(task.subscribers + subscribe))
        task.put()
        return task

    if task.program.key() != program.key():
        yield operation.counters.Increment("old_program_task_not_updated")
        return

    mentors = db.get(task.mentors)
    entities = mentors + [task.created_by, task.modified_by]

    subscribe = [ent.key() for ent in entities if ent.automatic_task_subscription]

    result = db.run_in_transaction(subscribe_to_task_txn, task.key(), subscribe)

    if result:
        yield operation.counters.Increment("task_updated")
    else:
        yield operation.counters.Increment("task_not_updated")
def map(entity):
    mapper_params = context.get().mapreduce_spec.mapper.params
    namespace = mapper_params['course']
    unit_id = mapper_params['unit_id']
    ignore_order = mapper_params['ignore_order']

    app_context = sites.get_app_context_for_namespace(namespace)
    course = courses.Course(None, app_context=app_context)
    unit = course.find_unit_by_id(str(unit_id))

    if verify.UNIT_TYPE_ASSESSMENT == unit.type:
        grader = unit.workflow.get_grader()
        if grader == courses.AUTO_GRADER:
            pass
        else:
            return
    else:
        return

    enable_negative_marking = unit.enable_negative_marking
    submission = student_work.Submission.get_contents(
        unit.unit_id, entity.get_key())
    if not submission:
        return

    old_score = course.get_score(entity, unit.unit_id)
    new_score = scorer.score_assessment(
        submission, unit.html_content, enable_negative_marking,
        ignore_order=ignore_order)
    utils.set_score(entity, unit.unit_id, new_score)
    entity.put()
    yield (str(old_score), new_score)
def create_channel_assignments(entity):
    # Get network graph
    logging.info('Creating channel assignments')
    network_graph = entity.recreate_network_graph()

    # Get parameter values
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    min_links = int(params['min_links'])
    min_neighbors = int(params['min_neighbors'])
    num_channels = int(params['num_channels'])

    node_selector = NodeSelector(network_graph, min_links, min_neighbors)
    selected_nodes = node_selector.select_nodes()
    logging.info('Number of selected nodes: ' + str(len(selected_nodes)))

    channel_assignment_initializer = ChannelAssignmentInitializer(
        selected_nodes, num_channels)
    names_to_numbers = channel_assignment_initializer.names_to_numbers
    names_to_numbers_entity = NamesToNumbersModel(
        names_to_numbers=str(names_to_numbers))
    names_to_numbers_entity.put()

    num_nodes = len(names_to_numbers)
    for combination in product(range(num_channels), repeat=num_nodes):
        ca_entity = ChannelAssignmentModel(channel_assignment=str(combination))
        ca_entity.put()
def build_search_index(readbuffer):
    # readbuffer should be a tuple from GoogleCloudLineInputReader composed of
    # a tuple of the form ((file_name, offset), line)

    # Get namespace from mapreduce job and set it.
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    namespace = params['namespace']
    index_name = params['index_name']

    rightnow = datetime.now()
    today = rightnow.day
    if today < 10:
        day = '0%s' % today
    else:
        day = '%s' % today
    thismonth = rightnow.month
    if thismonth < 10:
        month = '0%s' % thismonth
    else:
        month = '%s' % thismonth
    indexdate = '%s-%s-%s' % (rightnow.year, month, day)

    try:
        # Get the row out of the input buffer
        row = readbuffer[1]
        # Create a dictionary from the HEADER and the row
        data = get_rec_dict(dict(zip(HEADER, row.split('\t'))))
        # logging.info('Data from %s offset %s: %s' % (readbuffer[0][0], readbuffer[0][1], data))
        # Create an index document from the row dictionary
        doc = index_record(data, indexdate)
        # Store the document in the given index
        index_doc(doc, index_name, namespace)
    except Exception, e:
        logging.error('%s\n%s' % (e, readbuffer))
def test_user_on_events(user):
    logging.info("Trying user %s (expired %s)", user.fb_uid, user.expired_oauth_token)
    if user.expired_oauth_token:
        return
    # TODO: Relies on us keeping these up to date!
    if not user.num_auto_added_events and not user.num_hand_added_events:
        return

    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    event_ids = params['event_ids'].split(',')

    fbl = user.get_fblookup()
    fbl.allow_cache = False
    try:
        fb_events = fbl.get_multi(fb_api.LookupEvent, event_ids)
    except fb_api.ExpiredOAuthToken:
        # No longer a valid source for access_tokens
        return
    found_fb_events = [x for x in fb_events if x and not x['empty']]
    for fb_event in found_fb_events:
        yield (fb_event['info']['id'], user.fb_uid)

    # Found some good stuff, let's save and update the db events
    found_db_events = eventdata.DBEvent.get_by_ids(
        [x['info']['id'] for x in found_fb_events])
    db_fb_events = []
    for db_event, new_fb_event in zip(found_db_events, found_fb_events):
        if db_event.has_content():
            db_fb_events.append((db_event, new_fb_event))
    event_updates.update_and_save_fb_events(db_fb_events)
def __call__(self, entity):
    self.slice_count += 1
    yield "%s\n" % entity.int_property
    slice_id = context.get()._shard_state.slice_id
    # Raise an exception when processing the 2nd item in a slice every 3 slices.
    if (self.slice_count == 2 and (slice_id + 1) % 3 == 0):
        raise Exception("Intentionally raise an exception")
def delete_for_program(entity):
    """Permanently delete anything in the datastore from a program.

    Works with Projects, ProjectCohorts, and Surveys. Also requires a param
    set in mapreduce.yaml: `program_label`.
    """
    params = context.get().mapreduce_spec.mapper.params
    if getattr(entity, 'program_label', None) != params['program_label']:
        return

    # If this is a project cohort, delete the Unique entity that serves as an
    # index of participation codes.
    key_name = ProjectCohort.uniqueness_key(getattr(entity, 'code', ''))
    unique_entity = ndb.Key('Unique', key_name).get()
    if unique_entity:
        yield op.db.Delete(unique_entity)

    # Some entities have tasks in their entity group. There's no convenient
    # way to query tasks directly, so delete them while we're handling their
    # parent.
    # Bypass DatastoreModel to make sure we get soft-deleted entities.
    for task in Task.query(ancestor=entity.key).iter():
        yield op.db.Delete(task)

    yield op.db.Delete(entity)
def process(comment):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    program_key = params['program_key']
    program = GCIProgram.get_by_key_name(program_key)

    if comment.parent().program.key() != program.key():
        yield operation.counters.Increment("prev_program_comment_not_converted")
        return

    if comment.title not in ACTION_TITLES:
        yield operation.counters.Increment("user_comment_not_converted")
        return

    comment_title = ACTION_TITLES[comment.title]
    changes = ACTION_TITLES[comment_title]

    # Task reopening is a special case which could have been performed
    # either by a mentor or by the automated system after the passing of
    # the deadline. So additional inference of the user has to be made.
    if comment_title == 'Task Reopened':
        if comment.created_by:
            user_info = ugettext('User-Mentor')
        else:
            user_info = ugettext('MelangeAutomatic')
        changes = [user_info] + changes

    comment.changes = changes

    yield operation.db.Put(comment)
    yield operation.counters.Increment("action_comment_converted")
def process(entity):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    dbid = params['dbid']
    viewName = params['viewName']
    viewUrl = params['nodeUrl'] + '/' + viewName
    map(entity, dbid, viewName, viewUrl)
def reduce_hpo_metric_date_deltas_to_all_date_counts(reducer_key, reducer_values, now=None):
    """Emits hpoId|participant_type|metric|date|count for each date until today.

    Args:
        reducer_key: hpoId|participant_type|metric
        reducer_values: list of date|delta strings
        now: use to set the clock for testing
    """
    delta_map = {}
    sum_deltas(reducer_values, delta_map)

    # Walk over the deltas by date
    last_date = None
    count = 0
    one_day = timedelta(days=1)
    now = now or context.get().mapreduce_spec.mapper.params.get('now')
    for date_str, delta in sorted(delta_map.items()):
        date = datetime.strptime(date_str, DATE_FORMAT).date()
        if date > now.date():
            # Ignore any data after the current run date.
            break
        # Yield results for all the dates in between
        if last_date:
            middle_date = last_date + one_day
            while middle_date < date:
                yield reduce_result_value(reducer_key, middle_date.isoformat(), count)
                middle_date = middle_date + one_day
        count += delta
        if count > 0:
            yield reduce_result_value(reducer_key, date_str, count)
        last_date = date

    # Yield results up until today.
    if count > 0 and last_date:
        last_date = last_date + one_day
        while last_date <= now.date():
            yield reduce_result_value(reducer_key, last_date.isoformat(), count)
            last_date = last_date + one_day
def reduce_hpo_date_metric_counts_to_database_buckets(reducer_key, reducer_values, version_id=None):
    """Emits a metrics bucket with counts for metrics for a given hpoId + date to SQL.

    Args:
        reducer_key: hpoId|date ('*' for hpoId for cross-HPO counts)
        reducer_values: list of participant_type|metric|count strings
    """
    metrics_dict = collections.defaultdict(lambda: 0)
    (hpo_id, date_str) = parse_tuple(reducer_key)
    if hpo_id == '*':
        hpo_id = ''
    date = datetime.strptime(date_str, DATE_FORMAT)
    for reducer_value in reducer_values:
        (participant_type, metric_key, count) = parse_tuple(reducer_value)
        if metric_key == PARTICIPANT_KIND:
            if participant_type == _REGISTERED_PARTICIPANT:
                metrics_dict[metric_key] += int(count)
        else:
            kind = FULL_PARTICIPANT_KIND if participant_type == _FULL_PARTICIPANT else PARTICIPANT_KIND
            metrics_dict['%s.%s' % (kind, metric_key)] += int(count)

    version_id = version_id or context.get().mapreduce_spec.mapper.params.get('version_id')
    bucket = MetricsBucket(metricsVersionId=version_id,
                           date=date,
                           hpoId=hpo_id,
                           metrics=json.dumps(metrics_dict))
    # Use upsert here; when reducer shards retry, we will just replace any metrics bucket that was
    # written before, rather than failing.
    MetricsBucketDao().upsert(bucket)
def run(self, event):
    if not self.oldest_last_modified:
        params = context.get().mapreduce_spec.mapper.params
        self.oldest_last_modified = datetime.datetime.utcfromtimestamp(
            params['oldest_last_modified'])

    if event.last_modified < self.oldest_last_modified:
        yield op.db.Delete(event)
def yield_post_jp_event(db_events):
    from mapreduce import context
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    token_nickname = params.get('token_nickname')
    db_events = [x for x in db_events
                 if x.actual_city_name and x.actual_city_name.endswith('Japan')]
    for db_event in db_events:
        pubsub.eventually_publish_event(db_event.id, token_nickname)
def map(item):
    mapper_params = context.get().mapreduce_spec.mapper.params
    user_ids_to_remove = set(mapper_params['user_ids'])
    item_user_ids = set(item.get_user_ids())
    matching = item_user_ids.intersection(user_ids_to_remove)
    if matching:
        item.delete()
        for user_id in matching:
            yield user_id, 1
def word_count_reduce(key, values):
    """Word count reduce function."""
    sentences = []
    ctx = context.get()
    title = ctx.mapreduce_spec.mapper.params['output_writer']['book_title']
    for sentence in values:
        sentences.append(models.Sentence(sentence=sentence, book=title))
    word = models.Word(word=key, sentences=sentences)
    word.put()
def log2csv(l):
    """Convert log API RequestLog object to csv."""
    root_pipeline_id = context.get().mapreduce_spec.mapper.params['root_pipeline_id']
    message(root_pipeline_id,
            '<span class="label label-warning">pending</span> MapperPipeline.log2csv')
    yield '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
        l.start_time,
        l.method,
        l.resource,
        l.status,
        l.latency,
        l.response_size,
        l.was_loading_request,
        l.cost,
        '"%s"' % l.user_agent if l.user_agent else "NULL",
        l.nickname if l.nickname else "NULL")
def denorm_entity_mapper(entity):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    entity._denorm_values = params['denorm_values']
    # Instead of a naive single save (entity.save()), do the following more
    # efficient batch save:
    yield _batch_save(entity)
def yield_maybe_delete_bad_event(fbl, db_event):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        allow_deletes = params['allow_deletes']
    else:
        allow_deletes = False

    if db_event.creating_method not in [eventdata.CM_AUTO_ATTENDEE, eventdata.CM_AUTO]:
        return

    if db_event.fb_event['empty']:
        return

    import datetime
    # This is when we started adding all sorts of "crap"
    if not db_event.creation_time or db_event.creation_time < datetime.datetime(2016, 3, 5):
        return

    logging.info('MDBE: Check on event %s: %s', db_event.id, db_event.creating_method)
    from dancedeets.event_scraper import auto_add
    from dancedeets.nlp import event_classifier
    classified_event = event_classifier.get_classified_event(db_event.fb_event)
    good_text_event = auto_add.is_good_event_by_text(db_event.fb_event, classified_event)
    if good_text_event:
        if db_event.creating_method != eventdata.CM_AUTO:
            db_event.creating_method = eventdata.CM_AUTO
            yield op.db.Put(db_event)
    else:
        good_event = event_attendee_classifier.is_good_event_by_attendees(
            fbl, db_event.fb_event, classified_event=classified_event)
        if good_event:
            if db_event.creating_method != eventdata.CM_AUTO_ATTENDEE:
                db_event.creating_method = eventdata.CM_AUTO_ATTENDEE
                yield op.db.Put(db_event)
        else:
            logging.info('Accidentally %s added event %s: %s: %s',
                         db_event.creating_method, db_event.fb_event_id,
                         db_event.country, db_event.name)
            mr.increment('deleting-bad-event')
            result = '%s: %s: %s: %s\n' % (
                db_event.fb_event_id, db_event.creating_method,
                db_event.country, db_event.name)
            yield result.encode('utf-8')
            if allow_deletes:
                from dancedeets.search import search
                search.delete_from_fulltext_search_index(db_event.fb_event_id)
                yield op.db.Delete(db_event)
                display_event = search.DisplayEvent.get_by_id(db_event.fb_event_id)
                if display_event:
                    yield op.db.Delete(display_event)
def map(student):
    params = context.get().mapreduce_spec.mapper.params
    ns = params['course_namespace']
    app_context = sites.get_course_index().get_app_context_for_namespace(ns)
    course = courses.Course(None, app_context=app_context)
    if student_is_qualified(student, course):
        yield (TOTAL_CERTIFICATES, 1)
    if student.scores:
        yield (TOTAL_ACTIVE_STUDENTS, 1)
    yield (TOTAL_STUDENTS, 1)
def write(self, data):
    """Write data.

    Args:
        data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if ctx.get_pool("file_pool") is None:
        ctx.register_pool("file_pool", _FilePool(ctx=ctx))
    ctx.get_pool("file_pool").append(self._filename, str(data))
def map(entity):
    mapper_params = context.get().mapreduce_spec.mapper.params
    if entity.removed or entity.state == staff.REVIEW_STATE_COMPLETED:
        return
    course_staff_user_ids = mapper_params['course_staff_user_ids']
    if entity.evaluator not in course_staff_user_ids:
        return
    remove_step(entity)
    assign_course_staff(entity)
def run(self, subscription):
    if self.topic_pattern is None:
        params = context.get().mapreduce_spec.mapper.params
        self.topic_pattern = re.compile(params['topic_pattern'])
        self.callback_pattern = re.compile(params['callback_pattern'])

    if self.topic_pattern.match(subscription.topic):
        the_match = self.callback_pattern.match(subscription.callback)
        if the_match:
            yield op.counters.Increment(the_match.group(1))
def process(org_key):
    """Processes a single organization.

    Organization status is updated to ACCEPTED or REJECTED if the current
    status has been set to PRE_ACCEPTED or PRE_REJECTED, respectively, by
    program administrators.

    Args:
        org_key: Organization key.
    """
    context = mapreduce_context.get()
    program_key = db.Key(context.mapreduce_spec.mapper.params['program_key'])

    if program_key.kind() == 'GSoCProgram':
        url_names = soc_urls.UrlNames
    elif program_key.kind() == 'GCIProgram':
        url_names = ci_urls.UrlNames
    else:
        raise ValueError('Invalid program type %s' % program_key.kind())

    program = db.get(program_key)
    site = site_logic.singleton()

    org_key = ndb.Key.from_old_key(org_key)
    org_admins = profile_logic.getOrgAdmins(org_key)

    # We are "prefetching" the ProgramMessages entity here instead of fetching
    # it where it is required, i.e. when the message templates are needed to
    # build the email message body. We do this because we fetch the
    # ProgramMessages entity if it exists, or create it if it doesn't, in a
    # regular App Engine "db" transaction, whereas the rest of the updating of
    # organization entities happens within an ndb transaction, because the
    # Organization model is an ndb model and such cross-API nested
    # transactions are incompatible in App Engine.
    program_messages = program.getProgramMessages()

    @ndb.transactional
    def updateOrganizationStatus():
        """Transactionally updates organization status."""
        # only organizations defined for the specified program should be processed
        organization = org_key.get()
        if organization.program.to_old_key() == program_key:
            if organization.status == org_model.Status.PRE_ACCEPTED:
                org_logic.setStatus(
                    organization, program, site, program_messages,
                    org_model.Status.ACCEPTED, links.ABSOLUTE_LINKER, url_names,
                    org_admins=org_admins)
            elif organization.status == org_model.Status.PRE_REJECTED:
                org_logic.setStatus(
                    organization, program, site, program_messages,
                    org_model.Status.REJECTED, links.ABSOLUTE_LINKER, url_names,
                    org_admins=org_admins)

    updateOrganizationStatus()
def yield_load_fb_event(fbl, all_events):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        update_geodata = params['update_geodata']
        only_if_updated = params['only_if_updated']
    else:
        update_geodata = True
        only_if_updated = True

    # Process web_events
    web_events = [x for x in all_events if not x.is_fb_event]
    events_to_update = []
    for web_event in web_events:
        if event_updates.need_forced_update(web_event):
            events_to_update.append((web_event, web_event.web_event))
    event_updates.update_and_save_web_events(events_to_update, update_geodata=update_geodata)

    # Now process fb_events
    db_events = [x for x in all_events if x.is_fb_event]
    logging.info("loading db events %s", [db_event.fb_event_id for db_event in db_events])

    fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in db_events])
    # fbl.request_multi(fb_api.LookupEventPageComments, [x.fb_event_id for x in db_events])
    fbl.batch_fetch()

    events_to_update = []
    empty_fb_event_ids = []
    for db_event in db_events:
        try:
            real_fb_event = fbl.fetched_data(fb_api.LookupEvent, db_event.fb_event_id)
            # If it's an empty fb_event with our main access token, and we have other tokens we'd like to try...
            # If there are no visible_to_fb_uids and we don't have permissions, then we don't do this...
            #
            # TODO: This would happen on event deletion?
            #
            # TODO: Also, who sets visible_to_fb_uids? Why didn't this event have any?
            # TODO: Who re-sets visible_to_fb_uids after it goes empty? Can we ensure that keeps going?
            #
            # TODO: And what happens if we have a deleted event, with visible_to_fb_uids, that we attempt to run and query, and nothing happens?
            # Should we distinguish between deleted (and inaccessible) and permissions-lost-to-token (and inaccessible)?
            #
            # TODO: Why doesn't this update the event? Because add_event_tuple_if_updating seems to do nothing, probably because no fb_event is returned
            if real_fb_event['empty'] == fb_api.EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS and db_event.visible_to_fb_uids:
                empty_fb_event_ids.append(db_event.fb_event_id)
            else:
                # Otherwise if it's visible to our main token, or there are no other tokens to try, deal with it here.
                add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
        except fb_api.NoFetchedDataException as e:
            logging.info("No data fetched for event id %s: %s", db_event.fb_event_id, e)

    # Now trigger off a background reloading of empty fb_events
    if empty_fb_event_ids:
        logging.info("Couldn't fetch, using backup tokens for events: %s", empty_fb_event_ids)
        deferred.defer(
            load_fb_events_using_backup_tokens,
            empty_fb_event_ids,
            allow_cache=fbl.allow_cache,
            only_if_updated=only_if_updated,
            update_geodata=update_geodata)

    logging.info("Updating events: %s", [x[0].id for x in events_to_update])

    # And then re-save all the events in here
    event_updates.update_and_save_fb_events(events_to_update, update_geodata=update_geodata)
def _entity_created_before_job_queued(entity):
    """Checks that the given entity was created before the MR job was queued.

    Mapper methods may want to use this as a precomputation check, especially
    if the datastore classes being iterated over are append-only event logs.
    """
    created_on_msec = utils.get_time_in_millisecs(entity.created_on)
    job_queued_msec = float(context.get().mapreduce_spec.mapper.params[
        MAPPER_PARAM_KEY_QUEUED_TIME_MSECS])
    return job_queued_msec >= created_on_msec
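The queued-time parameter read above has to be supplied when the job is started. A sketch of how that might look using the classic control.start_map entry point (the job, handler, and entity-kind names are hypothetical; MAPPER_PARAM_KEY_QUEUED_TIME_MSECS and utils.get_time_in_millisecs are taken from the snippet above):

import datetime

from mapreduce import control

def start_audit_job():
    # Record the time the job was queued so mappers can skip entities created later.
    control.start_map(
        name='EntityAuditJob',                                   # hypothetical job name
        handler_spec='jobs.audit_map',                           # hypothetical mapper
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        mapper_parameters={
            'entity_kind': 'models.EventLogEntry',               # hypothetical kind
            MAPPER_PARAM_KEY_QUEUED_TIME_MSECS:
                utils.get_time_in_millisecs(datetime.datetime.utcnow()),
        })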
def write(self, data):
    """Write data to the GoogleCloudStorage file.

    Args:
        data: string containing the data to be written.
    """
    start_time = time.time()
    self._streaming_buffer.write(data)
    ctx = context.get()
    operation.counters.Increment(COUNTER_IO_WRITE_BYTES, len(data))(ctx)
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC, int((time.time() - start_time) * 1000))(ctx)
def map(cls, event):
    """Extract question responses from all event types providing them."""

    if event.source not in (
            'submit-assessment', 'attempt-lesson', 'tag-assessment'):
        return

    # Fetch global params set up in build_additional_mapper_params(), above.
    params = context.get().mapreduce_spec.mapper.params
    questions_info = params['questions_by_usage_id']
    valid_question_ids = params['valid_question_ids']
    group_to_questions = params['group_to_questions']
    assessment_weights = params['assessment_weights']

    timestamp = int(
        (event.recorded_on - datetime.datetime(1970, 1, 1)).total_seconds())
    content = transforms.loads(event.data)

    if event.source == 'submit-assessment':
        answer_data = content.get('values', {})
        # TODO(mgainer): handle assessment-as-form submissions. Current
        # implementation only understands Question and QuestionGroup;
        # forms are simply submitted as lists of fields.
        # TODO(mgainer): Handle peer-review scoring
        if not isinstance(answer_data, dict):
            return
        version = answer_data.get('version')
        if version == '1.5':
            answers = event_transforms.unpack_student_answer_1_5(
                questions_info, valid_question_ids, assessment_weights,
                group_to_questions, answer_data, timestamp)

    elif event.source == 'attempt-lesson':
        # Very odd that the version should be in the answers map....
        version = content.get('answers', {}).get('version')
        if version == '1.5':
            answers = event_transforms.unpack_student_answer_1_5(
                questions_info, valid_question_ids, assessment_weights,
                group_to_questions, content, timestamp)

    elif event.source == 'tag-assessment':
        answers = event_transforms.unpack_check_answers(
            content, questions_info, valid_question_ids, assessment_weights,
            group_to_questions, timestamp)

    yield (RawAnswersGenerator.TOTAL_STUDENTS, event.user_id)

    # Each answer is a namedtuple; convert to a list for pack/unpack
    # journey through the map/reduce shuffle stage.
    result = [list(answer) for answer in answers]
    for key in cls._generate_keys(event, event.user_id):
        yield (key, result)
def write(self, data):
    ctx = context.get()
    pg_pool = ctx.get_pool('postgres_pool')
    if not pg_pool:
        pg_pool = _PostgresPool(
            ctx=ctx,
            host=self.host,
            port=self.port,
            database=self.database,
            user=self.user,
            password=self.password)
        ctx.register_pool('postgres_pool', pg_pool)
    pg_pool.append(data)
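The _PostgresPool used above isn't shown here. Objects registered via ctx.register_pool() generally just buffer appended items and expose a flush() method, which is expected to be called when the slice's context is flushed. A rough illustrative sketch of that shape (the class name, flush threshold, and logging backend are placeholders, not the actual _PostgresPool):

import logging

class _ExamplePool(object):
    """Illustrative buffering pool; not the actual _PostgresPool."""

    def __init__(self, ctx=None, max_items=500):
        self._ctx = ctx
        self._max_items = max_items
        self._items = []

    def append(self, item):
        self._items.append(item)
        if len(self._items) >= self._max_items:
            self.flush()

    def flush(self):
        # A real pool would batch-write the buffered items to its backend here.
        for item in self._items:
            logging.info('would write: %s', item)
        self._items = []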
def run(self, key, values):
    if not self._combiner:
        ctx = context.get()
        params = ctx.mapreduce_spec.mapper.params
        combine_spec = params.get(_CombinePipeline.COMBINE_SPEC_PARAM)
        self._combiner = util.for_name(combine_spec)

    for combined_value in self._combiner(key, values, []):
        proto = file_service_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(combined_value)
        yield proto.Encode()
def run(self, sub):
    if sub.subscription_state != main.Subscription.STATE_VERIFIED:
        return

    if self.threshold_timestamp is None:
        params = context.get().mapreduce_spec.mapper.params
        self.threshold_timestamp = datetime.datetime.utcfromtimestamp(
            float(params['threshold_timestamp']))

    if sub.expiration_time < self.threshold_timestamp:
        sub.request_insert(sub.callback, sub.topic, sub.verify_token,
                           sub.secret, auto_reconfirm=True)
def log2csv(l):
    """Convert log API RequestLog object to csv."""
    root_pipeline_id = context.get().mapreduce_spec.mapper.params['root_pipeline_id']
    message(root_pipeline_id,
            '<span class="label label-warning">pending</span> MapperPipeline.log2csv')
    yield '"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s"\n' % (
        l.start_time,
        l.method,
        l.resource,
        l.status,
        l.latency,
        l.response_size,
        l.was_loading_request,
        l.cost,
        l.user_agent if l.user_agent else "NULL",
        l.nickname if l.nickname else "NULL")
def __iter__(self):
    ctx = context.get()
    combiner = None

    if ctx:
        combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
        if combiner_spec:
            combiner = util.handler_for_name(combiner_spec)

    self.current_key = None
    self.current_values = None

    for binary_record in super(_ReducerReader, self).__iter__():
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(binary_record)

        if self.current_key is None:
            self.current_key = proto.key()
            self.current_values = []
        else:
            assert proto.key() == self.current_key, (
                "inconsistent key sequence. Expected %s but got %s" %
                (self.current_key, proto.key()))

        if combiner:
            combiner_result = combiner(
                self.current_key, proto.value_list(), self.current_values)

            if not util.is_generator(combiner_result):
                raise errors.BadCombinerOutputError(
                    "Combiner %s should yield values instead of returning them (%s)" %
                    (combiner, combiner_result))

            self.current_values = []
            for value in combiner_result:
                if isinstance(value, operation.Operation):
                    value(ctx)
                else:
                    # with combiner current values always come from combiner
                    self.current_values.append(value)
        else:
            # without combiner we just accumulate values.
            self.current_values.extend(proto.value_list())

        if not proto.partial():
            key = self.current_key
            values = self.current_values
            # This is the final value, don't try to serialize it.
            self.current_key = None
            self.current_values = None
            yield (key, values)
        else:
            yield input_readers.ALLOW_CHECKPOINT
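Based on how the reader above invokes it, a combiner_spec names a generator that receives (key, newly_read_values, previously_combined_values) and must yield the new combined values (or operations) rather than return them. A hedged example of a summing combiner that fits that calling convention (the function name and string value format are illustrative):

def sum_combiner(key, values, previously_combined_values):
    # New values arrive as strings from the shuffle stage; previously combined
    # values are whatever this combiner yielded on earlier calls for the same key.
    total = sum(int(value) for value in values)
    total += sum(int(value) for value in previously_combined_values)
    yield str(total)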
def reduceProcess(data_id, entities):
    # TODO: (Aruna) Fix these imports
    from melange.logic import cached_list
    from melange.utils import lists

    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    list_id = params['list_id']

    ndb.transaction(lambda: cached_list.setCacheItems(
        data_id, map(json.loads, entities), lists.getList(list_id).valid_period))