Example #1
 def testGetSetContext(self):
     """Test module's get_context and _set functions."""
     ctx = context.Context(None, None)
     self.assertFalse(context.get())
     context.Context._set(ctx)
     self.assertEquals(ctx, context.get())
     context.Context._set(None)
     self.assertEquals(None, context.get())
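The test above exercises context.get() and Context._set directly; inside a running job, handlers call the same context.get() to read their configuration from mapreduce_spec.mapper.params, which is the pattern most of the examples below rely on. A minimal hedged sketch of that pattern (the entity attribute and the 'prefix' parameter are made-up placeholders, not taken from any example here):

from mapreduce import context
from mapreduce import operation as op

def prefix_filter_map(entity):
    # context.get() returns the Context of the running slice, or None when
    # called outside of a mapreduce job.
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params if ctx else {}
    prefix = params.get('prefix', '')  # hypothetical mapper parameter
    if entity.name.startswith(prefix):  # hypothetical entity attribute
        # Counter operations are yielded and applied by the framework.
        yield op.counters.Increment('matched_entities')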
Example #3
def discover_events_from_sources(fbl, sources):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        min_potential_events = params.get('min_potential_events', 0)
        sources = [
            x for x in sources
            if min_potential_events <= (x.num_potential_events or 0)
        ]

    # don't scrape sources that prove useless and give mostly garbage events
    #sources = [x for x in sources if x.fraction_potential_are_real() > 0.05]

    logging.info("Looking up sources: %s", [x.graph_id for x in sources])
    fbl.request_multi(fb_api.LookupThingFeed, [x.graph_id for x in sources])
    fbl.batch_fetch()

    logging.info("Fetched %s URLs, saved %s updates", fbl.fb_fetches,
                 fbl.db_updates)

    discovered_list = set()
    for source in sources:
        try:
            thing_feed = fbl.fetched_data(fb_api.LookupThingFeed,
                                          source.graph_id)
            discovered_list.update(process_thing_feed(source, thing_feed))
        except fb_api.NoFetchedDataException as e:
            logging.warning("Failed to fetch data for thing: %s", str(e))
def index_map(data):
    """Index map function.

    Args:
        data: Refers to a line from the input files. It is actually composed
        of a tuple (lineinfo, line). This is the return value from the
        ZipLineInputReader, available among the input readers of mapreduce.

    Yields:
        A tuple in the format <word>_SEP<title>_SEP<character>,
        <serial_line_key>.
        line_key needs to be serialized because it is an object and reduce
        expects strings as input.
    """
    info, line = data
    if line.strip() == '':
        return
    _, file_index, offset = info
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    metadata_blob = params['metadata']
    blob_reader = blobstore.BlobReader(metadata_blob[0].split('/')[-1])
    all_meta = blob_reader.read().split('\n')[:-1] # the last one is empty
    metadata = select_metadata(all_meta, file_index)
    char_map = metadata['pos_to_char']
    sorted_offsets = metadata['sorted_offsets']
    character = get_character(char_map, sorted_offsets, offset)
    title = metadata['title']
    line_db = Line(line=line)
    line_key = line_db.put()
    for word in get_words(line.lower()):
        yield (word + _SEP + title + _SEP + character, pickle.dumps(line_key))
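As the docstring notes, line_key is pickled because the shuffle stage only passes strings, so the reduce side has to unpickle each value before using it. A minimal sketch of such a counterpart, reusing _SEP and pickle from the module above (the function name and output format are assumptions):

def index_reduce(key, values):
    # key is '<word>_SEP<title>_SEP<character>'; each value is a pickled
    # datastore key for a Line entity.
    word, title, character = key.split(_SEP)
    line_keys = [pickle.loads(value) for value in values]
    yield '%s | %s | %s | %d line(s)\n' % (word, title, character, len(line_keys))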
Example #5
  def write(self, data):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

    file_index = key.__hash__() % len(self._filenames)
    pool_name = "kv_pool%d" % file_index
    filename = self._filenames[file_index]

    if ctx.get_pool(pool_name) is None:
      ctx.register_pool(pool_name,
                        output_writers.RecordsPool(filename=filename, ctx=ctx))
    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    ctx.get_pool(pool_name).append(proto.Encode())
Example #6
  def write(self, data):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

    file_index = key.__hash__() % len(self._filehandles)

    # Work-around: Since we don't have access to the context in the to_json()
    # function, but we need to flush each pool before we serialize the
    # filehandle, we rely on a member variable instead of using context for
    # pool management.
    pool = self._pools[file_index]
    if pool is None:
      filehandle = self._filehandles[file_index]
      pool = output_writers.GCSRecordsPool(filehandle=filehandle, ctx=ctx)
      self._pools[file_index] = pool

    proto = kv_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    pool.append(proto.Encode())
Example #7
    def map(entity):
        mapper_params = context.get().mapreduce_spec.mapper.params
        namespace = mapper_params['course']
        unit_id = mapper_params['unit_id']
        content = mapper_params['content']
        settings = mapper_params['settings']

        app_context = sites.get_app_context_for_namespace(namespace)
        course = courses.Course(None, app_context=app_context)
        unit = course.find_unit_by_id(str(unit_id))

        submitted_contents = student_work.Submission.get_contents(
            unit.unit_id, entity.get_key())
        if not submitted_contents:
            return
        submission = transforms.loads(submitted_contents)
        if not submission:
            return
        lang = submission.keys()[0]
        code = submission[lang]['code']
        filename = submission[lang]['filename']

        evaluator_id = content.get('evaluator', 'mooshak')
        evaluator_class = evaluator.ProgramEvaluatorRegistory.get(evaluator_id)
        if not evaluator_class:
            return

        old_score = course.get_score(entity, unit.unit_id)
        prog_evaluator = evaluator_class(course, settings, unit, content)
        prog_evaluator.evaluate(entity, False, lang, filename, code)
        new_score = course.get_score(entity, unit.unit_id)
        yield (str(old_score), new_score)
Example #8
    def write(self, data):
        """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
        ctx = context.get()
        if len(data) != 2:
            logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                          len(data), data)

        try:
            key = str(data[0])
            value = str(data[1])
        except TypeError:
            logging.error("Expecting a tuple, but got %s: %s",
                          data.__class__.__name__, data)

        file_index = key.__hash__() % len(self._filehandles)

        # Work-around: Since we don't have access to the context in the to_json()
        # function, but we need to flush each pool before we serialize the
        # filehandle, we rely on a member variable instead of using context for
        # pool management.
        pool = self._pools[file_index]
        if pool is None:
            filehandle = self._filehandles[file_index]
            pool = output_writers.GCSRecordsPool(filehandle=filehandle,
                                                 ctx=ctx)
            self._pools[file_index] = pool

        proto = kv_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        pool.append(proto.Encode())
Example #9
def row_handler(line_details):
    year = context.get().mapreduce_spec.mapper.params.get('student_year')
    pos, line = line_details

    if not pos:
        # skip header
        raise StopIteration()

    if not year:
        logging.error('Failed to create a new student. The year is missing')
        raise StopIteration()
    row = csv.reader(StringIO.StringIO(line)).next()

    if len(row) < 4:
        logging.error(
            'Failed to create a new student. Some fields are missing: %s',
            line)
        raise StopIteration()

    _, surname, name, email = row[0:4]
    try:
        entity = Student.new_student(surname,
                                     name,
                                     email=email,
                                     year=year,
                                     commit=False)
    except (ValueError, TypeError, ValidationError) as e:
        logging.error(
            'Failed to create a new student. wrong fields: %s.\n'
            'Error: %s', line, e)
        raise StopIteration

    yield op.db.Put(entity)
    yield op.counters.Increment('student added/updated', 1)
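The 'student_year' value read above has to be supplied as a mapper parameter when the job is started, either in mapreduce.yaml or programmatically. A hedged sketch of the programmatic route through control.start_map (the handler path, reader choice, and shard count are placeholders, not the project's actual launch code):

from mapreduce import control

def start_student_import(blob_key, student_year):
    # mapper_parameters become ctx.mapreduce_spec.mapper.params inside
    # row_handler; 'blob_keys' is consumed by the line input reader.
    return control.start_map(
        name='Import students',
        handler_spec='myapp.students.row_handler',  # placeholder module path
        reader_spec='mapreduce.input_readers.BlobstoreLineInputReader',
        mapper_parameters={
            'blob_keys': blob_key,
            'student_year': student_year,
        },
        shard_count=8)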
Example #10
def discover_events_from_sources(fbl, sources):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        min_potential_events = params.get('min_potential_events', 0)
        sources = [x for x in sources if min_potential_events <= (x.num_potential_events or 0)]

    # Maybe we can build this into the upfront mapreduce filter?
    # Unfortunately, '!=' is more difficult to do and requires better schema planning,
    # so let's just do this for now.
    # Hopefully this also prevents the API Rate limits on GET {user-id} lookups.
    sources = [x for x in sources if x.graph_type != thing_db.GRAPH_TYPE_PROFILE]

    # don't scrape sources that prove useless and give mostly garbage events
    #sources = [x for x in sources if x.fraction_potential_are_real() > 0.05]

    if fbl.allow_cache:
        logging.error('discover_events_from_sources unexpectedly called with a disabled cache!')

    logging.info("Looking up sources: %s", [x.graph_id for x in sources])
    fbl.request_multi(fb_api.LookupThingCommon, [x.graph_id for x in sources])
    # Now based on the sources we know, grab the appropriate fb data
    for s in sources:
        fbl.request(thing_db.get_lookup_for_graph_type(s.graph_type), s.graph_id)
    fbl.batch_fetch()

    logging.info("Fetched %s URLs, saved %s updates", fbl.fb_fetches, fbl.db_updates)

    discovered_list = set()
    for source in sources:
        try:
            discovered_list.update(_process_thing_feed(fbl, source))
        except fb_api.NoFetchedDataException as e:
            logging.warning("Failed to fetch data for thing: %s", str(e))
Example #11
def process(org_app):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params

  program_type = params['program_type']
  program_key_str = params['program_key']

  # now the script is used only for GCI
  if program_type != 'gci':
    return

  program = GCIProgram.get_by_key_name(program_key_str)

  survey_query = OrgAppSurvey.all(keys_only=True).filter('program', program)
  survey_key = survey_query.get()

  # We can skip the survey records not belonging to the given program.
  if org_app.survey.key() != survey_key:
    return

  # TODO(daniel): create a MapReduce/Task RequestData
  data = MapreduceRequestData(program, Site.get_by_key_name('site'))

  absolute_url = links.ABSOLUTE_LINKER.program(
      program, gci_url_names.CREATE_GCI_ORG_PROFILE)

  if org_app.status == 'pre-accepted':
    org_app_logic.setStatus(data, org_app, 'accepted', absolute_url)
    yield operation.counters.Increment("proposals_accepted")
  elif org_app.status == 'pre-rejected':
    org_app_logic.setStatus(data, org_app, 'rejected', absolute_url)
    yield operation.counters.Increment("proposals_rejected")
  else:
    yield operation.counters.Increment("proposals_ignored")
def process(task):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    program_key = params['program_key']

    try:
        program = GCIProgram.get_by_key_name(program_key)
    except db.BadValueError:
        yield operation.counters.Increment('program_key_is_empty_or_invalid')
        return

    def subscribe_to_task_txn(task_key, subscribe):
        task = GCITask.get(task_key)
        task.subscribers = list(set(task.subscribers + subscribe))
        task.put()
        return task

    if task.program.key() != program.key():
        yield operation.counters.Increment("old_program_task_not_updated")
        return

    mentors = db.get(task.mentors)
    entities = mentors + [task.created_by, task.modified_by]

    subscribe = [
        ent.key() for ent in entities if ent.automatic_task_subscription
    ]

    result = db.run_in_transaction(subscribe_to_task_txn, task.key(),
                                   subscribe)

    if result:
        yield operation.counters.Increment("task_updated")
    else:
        yield operation.counters.Increment("task_not_updated")
Example #13
    def map(entity):
        mapper_params = context.get().mapreduce_spec.mapper.params
        namespace = mapper_params['course']
        unit_id = mapper_params['unit_id']
        ignore_order = mapper_params['ignore_order']

        app_context = sites.get_app_context_for_namespace(namespace)
        course = courses.Course(None, app_context=app_context)
        unit = course.find_unit_by_id(str(unit_id))

        if verify.UNIT_TYPE_ASSESSMENT == unit.type:
            grader = unit.workflow.get_grader()
            if grader == courses.AUTO_GRADER:
                pass
            else:
                return
        else:
            return
        enable_negative_marking = unit.enable_negative_marking

        submission = student_work.Submission.get_contents(
            unit.unit_id, entity.get_key())

        if not submission:
            return

        old_score = course.get_score(entity, unit.unit_id)
        new_score = scorer.score_assessment(submission,
                                            unit.html_content,
                                            enable_negative_marking,
                                            ignore_order=ignore_order)
        utils.set_score(entity, unit.unit_id, new_score)
        entity.put()
        yield (str(old_score), new_score)
def create_channel_assignments(entity):
    
    # Get network graph
    
    logging.info('Creating channel assignments')
    
    network_graph = entity.recreate_network_graph()
    
    # Get parameter values
    
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params

    min_links = int(params['min_links'])
    min_neighbors = int(params['min_neighbors'])
    num_channels = int(params['num_channels'])
    
    node_selector = NodeSelector(network_graph, min_links, min_neighbors)    
    selected_nodes = node_selector.select_nodes()
    
    logging.info('Number of selected nodes: ' + str(len(selected_nodes)))
    
    channel_assignment_initializer = ChannelAssignmentInitializer(selected_nodes, num_channels)
    names_to_numbers = channel_assignment_initializer.names_to_numbers
    names_to_numbers_entity = NamesToNumbersModel(names_to_numbers=str(names_to_numbers))
    names_to_numbers_entity.put()
    
    num_nodes = len(names_to_numbers)
    
    for combination in product(range(num_channels), repeat=num_nodes):
        ca_entity = ChannelAssignmentModel(channel_assignment=str(combination))
        ca_entity.put()
Example #16
def build_search_index(readbuffer):
    # readbuffer should be a tuple from GoogleCloudLineInputReader of the
    # form ((file_name, offset), line).

    # Get namespace from mapreduce job and set it.
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    namespace = params['namespace']
    index_name = params['index_name']
    rightnow=datetime.now()
    today=rightnow.day
    if today < 10:
      day='0%s' % today
    else:
      day='%s' % today
    thismonth=rightnow.month
    if thismonth < 10:
      month='0%s' % thismonth
    else:
      month='%s' % thismonth
    indexdate='%s-%s-%s'  % (rightnow.year, month, day)

    try:
        # Get the row out of the input buffer
        row=readbuffer[1]
        # Create a dictionary from the HEADER and the row
        data = get_rec_dict(dict(zip(HEADER, row.split('\t'))))
#        logging.info('Data from %s offset %s: %s' % (readbuffer[0][0], readbuffer[0][1], data))
        # Create an index document from the row dictionary
        doc = index_record(data, indexdate)
        # Store the document in the given index
        index_doc(doc, index_name, namespace)
    except Exception as e:
        logging.error('%s\n%s' % (e, readbuffer))
def test_user_on_events(user):
    logging.info("Trying user %s (expired %s)", user.fb_uid, user.expired_oauth_token)
    if user.expired_oauth_token:
        return
    # TODO: Relies on us keeping these up to date!
    if not user.num_auto_added_events and not user.num_hand_added_events:
        return
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    event_ids = params['event_ids'].split(',')
    fbl = user.get_fblookup()
    fbl.allow_cache = False
    try:
        fb_events = fbl.get_multi(fb_api.LookupEvent, event_ids)
    except fb_api.ExpiredOAuthToken:
        # No longer a valid source for access_tokens
        return
    found_fb_events = [x for x in fb_events if x and not x['empty']]
    for fb_event in found_fb_events:
        yield (fb_event['info']['id'], user.fb_uid)

    # Found some good stuff, let's save and update the db events
    found_db_events = eventdata.DBEvent.get_by_ids([x['info']['id'] for x in found_fb_events])
    db_fb_events = []
    for db_event, new_fb_event in zip(found_db_events, found_fb_events):
        if db_event.has_content():
            db_fb_events.append((db_event, new_fb_event))
    event_updates.update_and_save_fb_events(db_fb_events)
Example #18
 def __call__(self, entity):
     self.slice_count += 1
     yield "%s\n" % entity.int_property
     slice_id = context.get()._shard_state.slice_id
     # Raise an exception when processing the 2nd item in a slice, every 3 slices.
     if (self.slice_count == 2 and (slice_id + 1) % 3 == 0):
         raise Exception("Intentionally raise an exception")
Example #19
def delete_for_program(entity):
    """Permanently delete anything in the datastore from a program.

    Works with Projects, ProjectCohorts, and Surveys. Also requires a param set
    in mapreduce.yaml: `program_label`.
    """
    params = context.get().mapreduce_spec.mapper.params
    if getattr(entity, 'program_label', None) != params['program_label']:
        return

    # If this is a project cohort, delete the Unique entity that serves as an
    # index of participation codes.
    key_name = ProjectCohort.uniqueness_key(getattr(entity, 'code', ''))
    unique_entity = ndb.Key('Unique', key_name).get()
    if unique_entity:
        yield op.db.Delete(unique_entity)

    # Some entities have tasks in their entity group. There's no convenient
    # way to query tasks directly, so delete them while we're handling their
    # parent.
    # Bypass DatastoreModel to make sure we get soft-deleted entities.
    for task in Task.query(ancestor=entity.key).iter():
        yield op.db.Delete(task)

    yield op.db.Delete(entity)
def process(comment):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    program_key = params['program_key']

    program = GCIProgram.get_by_key_name(program_key)

    if comment.parent().program.key() != program.key():
        yield operation.counters.Increment(
            "prev_program_comment_not_converted")
        return

    if comment.title not in ACTION_TITLES:
        yield operation.counters.Increment("user_comment_not_converted")
        return

    comment_title = ACTION_TITLES[comment.title]

    changes = ACTION_TITLES[comment_title]
    # Task reopening is a special case which could have been performed
    # either by a mentor or by the automated system after the passing of
    # the deadline. So additional inference of the user has to be made.
    if comment_title == 'Task Reopened':
        if comment.created_by:
            user_info = ugettext('User-Mentor')
        else:
            user_info = ugettext('MelangeAutomatic')
        changes = [user_info] + changes

    comment.changes = changes

    yield operation.db.Put(comment)
    yield operation.counters.Increment("action_comment_converted")
Example #21
def process(entity):
  ctx=context.get()
  params=ctx.mapreduce_spec.mapper.params
  dbid=params['dbid']
  viewName=params['viewName']
  viewUrl=params['nodeUrl']+'/'+viewName
  map(entity, dbid, viewName, viewUrl)
def test_user_on_events(user):
    logging.info("Trying user %s (expired %s)", user.fb_uid,
                 user.expired_oauth_token)
    if user.expired_oauth_token:
        return
    # TODO: Relies on us keeping these up to date!
    if not user.num_auto_added_events and not user.num_hand_added_events:
        return
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    event_ids = params['event_ids'].split(',')
    fbl = user.get_fblookup()
    fbl.allow_cache = False
    try:
        fb_events = fbl.get_multi(fb_api.LookupEvent, event_ids)
    except fb_api.ExpiredOAuthToken:
        # No longer a valid source for access_tokens
        return
    found_fb_events = [x for x in fb_events if x and not x['empty']]
    for fb_event in found_fb_events:
        yield (fb_event['info']['id'], user.fb_uid)

    # Found some good stuff, let's save and update the db events
    found_db_events = eventdata.DBEvent.get_by_ids(
        [x['info']['id'] for x in found_fb_events])
    db_fb_events = []
    for db_event, new_fb_event in zip(found_db_events, found_fb_events):
        if db_event.has_content():
            db_fb_events.append((db_event, new_fb_event))
    event_updates.update_and_save_fb_events(db_fb_events)
def process(task):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  program_key = params['program_key']

  try:
    program = GCIProgram.get_by_key_name(program_key)
  except db.BadValueError:
    yield operation.counters.Increment('program_key_is_empty_or_invalid')
    return

  def subscribe_to_task_txn(task_key, subscribe):
    task = GCITask.get(task_key)
    task.subscribers = list(set(task.subscribers + subscribe))
    task.put()
    return task

  if task.program.key() != program.key():
    yield operation.counters.Increment("old_program_task_not_updated")
    return

  mentors = db.get(task.mentors)
  entities = mentors + [task.created_by, task.modified_by]

  subscribe = [ent.key() for ent in entities if ent.automatic_task_subscription]

  result = db.run_in_transaction(subscribe_to_task_txn, task.key(), subscribe)

  if result:
    yield operation.counters.Increment("task_updated")
  else:
    yield operation.counters.Increment("task_not_updated")
def reduce_hpo_metric_date_deltas_to_all_date_counts(reducer_key, reducer_values, now=None):
  """Emits hpoId|participant_type|metric|date|count for each date until today.
  Args:
    reducer_key: hpoId|participant_type|metric
    reducer_values: list of date|delta strings
    now: use to set the clock for testing
  """
  delta_map = {}
  sum_deltas(reducer_values, delta_map)
  # Walk over the deltas by date
  last_date = None
  count = 0
  one_day = timedelta(days=1)
  now = now or context.get().mapreduce_spec.mapper.params.get('now')
  for date_str, delta in sorted(delta_map.items()):
    date = datetime.strptime(date_str, DATE_FORMAT).date()
    if date > now.date():
      # Ignore any data after the current run date.
      break
    # Yield results for all the dates in between
    if last_date:
      middle_date = last_date + one_day
      while middle_date < date:
        yield reduce_result_value(reducer_key, middle_date.isoformat(), count)
        middle_date = middle_date + one_day
    count += delta
    if count > 0:
      yield reduce_result_value(reducer_key, date_str, count)
    last_date = date
  # Yield results up until today.
  if count > 0 and last_date:
    last_date = last_date + one_day
    while last_date <= now.date():
      yield reduce_result_value(reducer_key, last_date.isoformat(), count)
      last_date = last_date + one_day
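To make the delta-walking above concrete, here is a small standalone illustration of the same idea with made-up dates and deltas; it mirrors the loop without the project's sum_deltas and reduce_result_value helpers:

from datetime import date, timedelta

deltas = {date(2017, 1, 1): 2, date(2017, 1, 4): -1}  # made-up input
today = date(2017, 1, 6)
counts = []
count, day = 0, min(deltas)
while day <= today:
    count += deltas.get(day, 0)
    if count > 0:
        counts.append((day.isoformat(), count))
    day += timedelta(days=1)
# counts == [('2017-01-01', 2), ('2017-01-02', 2), ('2017-01-03', 2),
#            ('2017-01-04', 1), ('2017-01-05', 1), ('2017-01-06', 1)]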
def reduce_hpo_date_metric_counts_to_database_buckets(reducer_key, reducer_values, version_id=None):
  """Emits a metrics bucket with counts for metrics for a given hpoId + date to SQL
  Args:
     reducer_key: hpoId|date ('*' for hpoId for cross-HPO counts)
     reducer_values: list of participant_type|metric|count strings
  """
  metrics_dict = collections.defaultdict(lambda: 0)
  (hpo_id, date_str) = parse_tuple(reducer_key)
  if hpo_id == '*':
    hpo_id = ''
  date = datetime.strptime(date_str, DATE_FORMAT)
  for reducer_value in reducer_values:
    (participant_type, metric_key, count) = parse_tuple(reducer_value)
    if metric_key == PARTICIPANT_KIND:
      if participant_type == _REGISTERED_PARTICIPANT:
        metrics_dict[metric_key] += int(count)
    else:
      kind = FULL_PARTICIPANT_KIND if participant_type == _FULL_PARTICIPANT else PARTICIPANT_KIND
      metrics_dict['%s.%s' % (kind, metric_key)] += int(count)

  version_id = version_id or context.get().mapreduce_spec.mapper.params.get('version_id')
  bucket = MetricsBucket(metricsVersionId=version_id,
                         date=date,
                         hpoId=hpo_id,
                         metrics=json.dumps(metrics_dict))
  # Use upsert here; when reducer shards retry, we will just replace any metrics bucket that was
  # written before, rather than failing.
  MetricsBucketDao().upsert(bucket)
def process(comment):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  program_key = params['program_key']

  program = GCIProgram.get_by_key_name(program_key)

  if comment.parent().program.key() != program.key():
    yield operation.counters.Increment("prev_program_comment_not_converted")
    return

  if comment.title not in ACTION_TITLES:
    yield operation.counters.Increment("user_comment_not_converted")
    return

  comment_title = ACTION_TITLES[comment.title]

  changes = ACTION_TITLES[comment_title]
  # Task reopening is a special case which could have been performed
  # either by a mentor or by the automated system after the passing of
  # the deadline. So additional inference of the user has to be made.
  if comment_title == 'Task Reopened':
    if comment.created_by:
      user_info = ugettext('User-Mentor')
    else:
      user_info = ugettext('MelangeAutomatic')
    changes = [user_info] + changes

  comment.changes = changes

  yield operation.db.Put(comment)
  yield operation.counters.Increment("action_comment_converted")
Example #27
  def run(self, event):
    if not self.oldest_last_modified:
      params = context.get().mapreduce_spec.mapper.params
      self.oldest_last_modified = datetime.datetime.utcfromtimestamp(
          params['oldest_last_modified'])

    if event.last_modified < self.oldest_last_modified:
      yield op.db.Delete(event)
def yield_post_jp_event(db_events):
    from mapreduce import context
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    token_nickname = params.get('token_nickname')
    db_events = [x for x in db_events if x.actual_city_name and x.actual_city_name.endswith('Japan')]
    for db_event in db_events:
        pubsub.eventually_publish_event(db_event.id, token_nickname)
Example #29
    def run(self, event):
        if not self.oldest_last_modified:
            params = context.get().mapreduce_spec.mapper.params
            self.oldest_last_modified = datetime.datetime.utcfromtimestamp(
                params['oldest_last_modified'])

        if event.last_modified < self.oldest_last_modified:
            yield op.db.Delete(event)
 def __call__(self, entity):
   self.slice_count += 1
   yield "%s\n" % entity.int_property
   slice_id = context.get()._shard_state.slice_id
   # Raise an exception when processing the 2nd item in a slice, every 3 slices.
   if (self.slice_count == 2 and
       (slice_id + 1) % 3 == 0):
     raise Exception("Intentionally raise an exception")
Example #31
 def map(item):
     mapper_params = context.get().mapreduce_spec.mapper.params
     user_ids_to_remove = set(mapper_params['user_ids'])
     item_user_ids = set(item.get_user_ids())
     matching = item_user_ids.intersection(user_ids_to_remove)
     if matching:
         item.delete()
         for user_id in matching:
             yield user_id, 1
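The map above emits a (user_id, 1) pair for every item deleted on a user's behalf; a reduce step that folds those pairs into per-user deletion counts could look like the following sketch (the function name and output line format are assumptions):

def count_removed_items_reduce(user_id, values):
    # Values arrive as strings after the shuffle stage, so cast before summing.
    total = sum(int(value) for value in values)
    yield '%s,%d\n' % (user_id, total)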
Example #32
def word_count_reduce(key, values):
  """Word count reduce function."""
  sentences = []
  ctx = context.get()
  title = ctx.mapreduce_spec.mapper.params['output_writer']['book_title']
  for sentence in values:
    sentences.append(models.Sentence(sentence=sentence, book=title))
  word = models.Word(word=key, sentences=sentences)
  word.put()
Example #33
def log2csv(l):
  """Convert log API RequestLog object to csv."""
  root_pipeline_id = context.get().mapreduce_spec.mapper.params['root_pipeline_id']
  message(root_pipeline_id, '<span class="label label-warning">pending</span> MapperPipeline.log2csv')
  yield '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (l.start_time, l.method, l.resource,
                                             l.status, l.latency, l.response_size,
                                             l.was_loading_request, l.cost,
                                             '"%s"' % l.user_agent if l.user_agent else "NULL",
                                             l.nickname if l.nickname else "NULL")
Example #34
def denorm_entity_mapper(entity):

    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params

    entity._denorm_values = params['denorm_values']

    # Instead of naive single save: entity.save(), do the following more efficient batch save:
    yield _batch_save(entity)
def yield_maybe_delete_bad_event(fbl, db_event):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        allow_deletes = params['allow_deletes']
    else:
        allow_deletes = False

    if db_event.creating_method not in [
            eventdata.CM_AUTO_ATTENDEE, eventdata.CM_AUTO
    ]:
        return

    if db_event.fb_event['empty']:
        return

    import datetime
    # This is when we started adding all sorts of "crap"
    if not db_event.creation_time or db_event.creation_time < datetime.datetime(
            2016, 3, 5):
        return

    logging.info('MDBE: Check on event %s: %s', db_event.id,
                 db_event.creating_method)
    from dancedeets.event_scraper import auto_add
    from dancedeets.nlp import event_classifier
    classified_event = event_classifier.get_classified_event(db_event.fb_event)
    good_text_event = auto_add.is_good_event_by_text(db_event.fb_event,
                                                     classified_event)
    if good_text_event:
        if db_event.creating_method != eventdata.CM_AUTO:
            db_event.creating_method = eventdata.CM_AUTO
            yield op.db.Put(db_event)
    else:
        good_event = event_attendee_classifier.is_good_event_by_attendees(
            fbl, db_event.fb_event, classified_event=classified_event)
        if good_event:
            if db_event.creating_method != eventdata.CM_AUTO_ATTENDEE:
                db_event.creating_method = eventdata.CM_AUTO_ATTENDEE
                yield op.db.Put(db_event)
        else:
            logging.info('Accidentally %s added event %s: %s: %s',
                         db_event.creating_method, db_event.fb_event_id,
                         db_event.country, db_event.name)
            mr.increment('deleting-bad-event')
            result = '%s: %s: %s: %s\n' % (db_event.fb_event_id,
                                           db_event.creating_method,
                                           db_event.country, db_event.name)
            yield result.encode('utf-8')
            if allow_deletes:
                from dancedeets.search import search
                search.delete_from_fulltext_search_index(db_event.fb_event_id)
                yield op.db.Delete(db_event)
                display_event = search.DisplayEvent.get_by_id(
                    db_event.fb_event_id)
                if display_event:
                    yield op.db.Delete(display_event)
Example #36
 def map(item):
     mapper_params = context.get().mapreduce_spec.mapper.params
     user_ids_to_remove = set(mapper_params["user_ids"])
     item_user_ids = set(item.get_user_ids())
     matching = item_user_ids.intersection(user_ids_to_remove)
     if matching:
         item.delete()
         for user_id in matching:
             yield user_id, 1
Example #37
 def map(student):
     params = context.get().mapreduce_spec.mapper.params
     ns = params['course_namespace']
     app_context = sites.get_course_index().get_app_context_for_namespace(ns)
     course = courses.Course(None, app_context=app_context)
     if student_is_qualified(student, course):
         yield(TOTAL_CERTIFICATES, 1)
     if student.scores:
         yield(TOTAL_ACTIVE_STUDENTS, 1)
     yield(TOTAL_STUDENTS, 1)
Example #39
  def write(self, data):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if ctx.get_pool("file_pool") is None:
      ctx.register_pool("file_pool", _FilePool(ctx=ctx))
    ctx.get_pool("file_pool").append(self._filename, str(data))
Example #40
    def map(entity):
        mapper_params = context.get().mapreduce_spec.mapper.params
        if entity.removed or entity.state == staff.REVIEW_STATE_COMPLETED:
            return

        course_staff_user_ids = mapper_params['course_staff_user_ids']
        if entity.evaluator not in course_staff_user_ids:
            return
        remove_step(entity)
        assign_course_staff(entity)
Example #41
    def run(self, subscription):
        if self.topic_pattern is None:
            params = context.get().mapreduce_spec.mapper.params
            self.topic_pattern = re.compile(params['topic_pattern'])
            self.callback_pattern = re.compile(params['callback_pattern'])

        if self.topic_pattern.match(subscription.topic):
            the_match = self.callback_pattern.match(subscription.callback)
            if the_match:
                yield op.counters.Increment(the_match.group(1))
Example #42
  def run(self, subscription):
    if self.topic_pattern is None:
      params = context.get().mapreduce_spec.mapper.params
      self.topic_pattern = re.compile(params['topic_pattern'])
      self.callback_pattern = re.compile(params['callback_pattern'])

    if self.topic_pattern.match(subscription.topic):
      the_match = self.callback_pattern.match(subscription.callback)
      if the_match:
        yield op.counters.Increment(the_match.group(1))
def process(org_key):
  """Processes a single organization.

  Organization status is updated to ACCEPTED or REJECTED if the current
  status has been set to PRE_ACCEPTED or PRE_REJECTED, respectively,
  by program administrators.

  Args:
    org_key: Organization key.
  """
  context = mapreduce_context.get()
  program_key = db.Key(context.mapreduce_spec.mapper.params['program_key'])

  if program_key.kind() == 'GSoCProgram':
    url_names = soc_urls.UrlNames
  elif program_key.kind() == 'GCIProgram':
    url_names = ci_urls.UrlNames
  else:
    raise ValueError('Invalid program type %s' % program_key.kind())

  program = db.get(program_key)

  site = site_logic.singleton()

  org_key = ndb.Key.from_old_key(org_key)
  org_admins = profile_logic.getOrgAdmins(org_key)

  # We are "prefetching" the ProgramMessages entity here instead of fetching
  # it where it is required, i.e. when the message templates are needed to
  # build the email message body. We do this because fetching the
  # ProgramMessages entity (or creating it if it does not exist) is done in a
  # regular App Engine "db" transaction, whereas the rest of the organization
  # entity updates happen within an ndb transaction, because the Organization
  # model is an ndb model and such cross-API nested transactions are
  # incompatible on App Engine.
  program_messages = program.getProgramMessages()

  @ndb.transactional
  def updateOrganizationStatus():
    """Transactionally updates organization status."""
    # only organizations defined for the specified program should be processed
    organization = org_key.get()
    if organization.program.to_old_key() == program_key:
      if organization.status == org_model.Status.PRE_ACCEPTED:
        org_logic.setStatus(
            organization, program, site, program_messages,
            org_model.Status.ACCEPTED, links.ABSOLUTE_LINKER, url_names,
            org_admins=org_admins)
      elif organization.status == org_model.Status.PRE_REJECTED:
        org_logic.setStatus(
            organization, program, site, program_messages,
            org_model.Status.REJECTED, links.ABSOLUTE_LINKER, url_names,
            org_admins=org_admins)

  updateOrganizationStatus()
def yield_load_fb_event(fbl, all_events):
    ctx = context.get()
    if ctx:
        params = ctx.mapreduce_spec.mapper.params
        update_geodata = params['update_geodata']
        only_if_updated = params['only_if_updated']
    else:
        update_geodata = True
        only_if_updated = True

    # Process web_events
    web_events = [x for x in all_events if not x.is_fb_event]
    events_to_update = []
    for web_event in web_events:
        if event_updates.need_forced_update(web_event):
            events_to_update.append((web_event, web_event.web_event))
    event_updates.update_and_save_web_events(events_to_update, update_geodata=update_geodata)

    # Now process fb_events
    db_events = [x for x in all_events if x.is_fb_event]
    logging.info("loading db events %s", [db_event.fb_event_id for db_event in db_events])
    fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in db_events])
    # fbl.request_multi(fb_api.LookupEventPageComments, [x.fb_event_id for x in db_events])
    fbl.batch_fetch()
    events_to_update = []
    empty_fb_event_ids = []
    for db_event in db_events:
        try:
            real_fb_event = fbl.fetched_data(fb_api.LookupEvent, db_event.fb_event_id)
            # If it's an empty fb_event with our main access token, and we have other tokens we'd like to try...
            # If there are no visible_to_fb_uids and we don't have permissions, then we don't do this...
            #
            # TODO: This would happen on event deletion?
            #
            # TODO: Also, who sets visible_to_fb_uids? Why didn't this event have any?
            # TODO: Who re-sets visible_to_fb_uids after it goes empty? Can we ensure that keeps going?
            #
            # TODO: And what happens if we have a deleted event, with visible_to_fb_uids, that we attempt to run and query, and nothing happens?
            # Should we distinguish between deleted (and inaccessible) and permissions-lost-to-token (and inaccessible)?
            #
            # TODO: Why doesn't this update the event? Because add_event_tuple_if_updating seems to do nothing, probably because no fb_event is returned
            if real_fb_event['empty'] == fb_api.EMPTY_CAUSE_INSUFFICIENT_PERMISSIONS and db_event.visible_to_fb_uids:
                empty_fb_event_ids.append(db_event.fb_event_id)
            else:
                # Otherwise if it's visible to our main token, or there are no other tokens to try, deal with it here.
                add_event_tuple_if_updating(events_to_update, fbl, db_event, only_if_updated)
        except fb_api.NoFetchedDataException as e:
            logging.info("No data fetched for event id %s: %s", db_event.fb_event_id, e)
    # Now trigger off a background reloading of empty fb_events
    if empty_fb_event_ids:
        logging.info("Couldn't fetch, using backup tokens for events: %s", empty_fb_event_ids)
        deferred.defer(load_fb_events_using_backup_tokens, empty_fb_event_ids, allow_cache=fbl.allow_cache, only_if_updated=only_if_updated, update_geodata=update_geodata)
    logging.info("Updating events: %s", [x[0].id for x in events_to_update])
    # And then re-save all the events in here
    event_updates.update_and_save_fb_events(events_to_update, update_geodata=update_geodata)
Example #45
    def _entity_created_before_job_queued(entity):
        """Checks that the given entity was created before the MR job was queued.

        Mapper methods may want to use this as a precomputation check,
        especially if the datastore classes being iterated over are append-only
        event logs.
        """
        created_on_msec = utils.get_time_in_millisecs(entity.created_on)
        job_queued_msec = float(context.get().mapreduce_spec.mapper.
                                params[MAPPER_PARAM_KEY_QUEUED_TIME_MSECS])
        return job_queued_msec >= created_on_msec
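The check above only works if the queue time was recorded in the mapper params when the job was enqueued. A hedged sketch of how that parameter might be populated, reusing MAPPER_PARAM_KEY_QUEUED_TIME_MSECS from the module above (the wrapper function and its arguments are assumptions, not the project's real job-launch code):

import time

from mapreduce import control

def enqueue_job_with_queued_time(job_name, handler_spec, reader_spec):
    # Record the enqueue time in milliseconds so mappers can skip entities
    # created after the job was queued.
    return control.start_map(
        name=job_name,
        handler_spec=handler_spec,
        reader_spec=reader_spec,
        mapper_parameters={
            MAPPER_PARAM_KEY_QUEUED_TIME_MSECS: time.time() * 1000.0,
        })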
Example #46
    def write(self, data):
        """Write data to the GoogleCloudStorage file.

    Args:
      data: string containing the data to be written.
    """
        start_time = time.time()
        self._streaming_buffer.write(data)
        ctx = context.get()
        operation.counters.Increment(COUNTER_IO_WRITE_BYTES, len(data))(ctx)
        operation.counters.Increment(COUNTER_IO_WRITE_MSEC, int((time.time() - start_time) * 1000))(ctx)
Example #47
def yield_post_jp_event(db_events):
    from mapreduce import context
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    token_nickname = params.get('token_nickname')
    db_events = [
        x for x in db_events
        if x.actual_city_name and x.actual_city_name.endswith('Japan')
    ]
    for db_event in db_events:
        pubsub.eventually_publish_event(db_event.id, token_nickname)
Example #48
    def map(cls, event):
        """Extract question responses from all event types providing them."""

        if event.source not in (
            'submit-assessment',
            'attempt-lesson',
            'tag-assessment'):
            return

        # Fetch global params set up in build_additional_mapper_params(), above.
        params = context.get().mapreduce_spec.mapper.params
        questions_info = params['questions_by_usage_id']
        valid_question_ids = params['valid_question_ids']
        group_to_questions = params['group_to_questions']
        assessment_weights = params['assessment_weights']

        timestamp = int(
            (event.recorded_on - datetime.datetime(1970, 1, 1)).total_seconds())
        content = transforms.loads(event.data)

        if event.source == 'submit-assessment':
            answer_data = content.get('values', {})
            # TODO(mgainer): handle assessment-as-form submissions.  Current
            # implementation only understands Question and QuestionGroup;
            # forms are simply submitted as lists of fields.
            # TODO(mgainer): Handle peer-review scoring
            if not isinstance(answer_data, dict):
                return
            version = answer_data.get('version')
            if version == '1.5':
                answers = event_transforms.unpack_student_answer_1_5(
                    questions_info, valid_question_ids, assessment_weights,
                    group_to_questions, answer_data, timestamp)

        elif event.source == 'attempt-lesson':
            # Very odd that the version should be in the answers map....
            version = content.get('answers', {}).get('version')
            if version == '1.5':
                answers = event_transforms.unpack_student_answer_1_5(
                    questions_info, valid_question_ids, assessment_weights,
                    group_to_questions, content, timestamp)

        elif event.source == 'tag-assessment':
            answers = event_transforms.unpack_check_answers(
                content, questions_info, valid_question_ids, assessment_weights,
                group_to_questions, timestamp)

        yield (RawAnswersGenerator.TOTAL_STUDENTS, event.user_id)

        # Each answer is a namedtuple; convert to a list for pack/unpack
        # journey through the map/reduce shuffle stage.
        result = [list(answer) for answer in answers]
        for key in cls._generate_keys(event, event.user_id):
            yield (key, result)
Example #49
    def _entity_created_before_job_queued(entity):
        """Checks that the given entity was created before the MR job was queued.

        Mapper methods may want to use this as a precomputation check,
        especially if the datastore classes being iterated over are append-only
        event logs.
        """
        created_on_msec = utils.get_time_in_millisecs(entity.created_on)
        job_queued_msec = float(context.get().mapreduce_spec.mapper.params[
            MAPPER_PARAM_KEY_QUEUED_TIME_MSECS])
        return job_queued_msec >= created_on_msec
 def write(self, data):
   ctx = context.get()
   pg_pool = ctx.get_pool('postgres_pool')
   if not pg_pool:
     pg_pool = _PostgresPool(ctx=ctx,
                             host=self.host,
                             port=self.port,
                             database=self.database,
                             user=self.user,
                             password=self.password)
     ctx.register_pool('postgres_pool', pg_pool)
   pg_pool.append(data)
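Pools like _PostgresPool above follow a simple contract: the Context keeps them by name and, as the RecordsPool examples earlier suggest, flushes every registered pool at the end of a slice, so a pool only needs append() and flush(). A minimal in-memory sketch of that contract (the buffering threshold and the no-op flush target are assumptions; a real pool would write its batch to external storage):

class _BufferingPool(object):
    """Minimal sketch of a context pool: buffer items, flush in batches."""

    def __init__(self, ctx=None, max_items=100):
        self._ctx = ctx
        self._max_items = max_items
        self._items = []

    def append(self, data):
        self._items.append(data)
        if len(self._items) >= self._max_items:
            self.flush()

    def flush(self):
        # A real pool would write self._items out here (e.g. to a database);
        # the framework is also expected to call flush() when the slice ends.
        self._items = []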
Example #51
  def run(self, key, values):
    if not self._combiner:
      ctx = context.get()
      params = ctx.mapreduce_spec.mapper.params
      combine_spec = params.get(_CombinePipeline.COMBINE_SPEC_PARAM)
      self._combiner = util.for_name(combine_spec)

    for combined_value in self._combiner(key, values, []):
      proto = file_service_pb.KeyValue()
      proto.set_key(key)
      proto.set_value(combined_value)
      yield proto.Encode()
Example #52
  def run(self, sub):
    if sub.subscription_state != main.Subscription.STATE_VERIFIED:
      return

    if self.threshold_timestamp is None:
      params = context.get().mapreduce_spec.mapper.params
      self.threshold_timestamp = datetime.datetime.utcfromtimestamp(
          float(params['threshold_timestamp']))

    if sub.expiration_time < self.threshold_timestamp:
      sub.request_insert(sub.callback, sub.topic, sub.verify_token,
                         sub.secret, auto_reconfirm=True)
Example #53
  def write(self, data):
    """Write data to the GoogleCloudStorage file.

    Args:
      data: string containing the data to be written.
    """
    start_time = time.time()
    self._streaming_buffer.write(data)
    ctx = context.get()
    operation.counters.Increment(COUNTER_IO_WRITE_BYTES, len(data))(ctx)
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC, int((time.time() - start_time) * 1000))(ctx)
Example #54
def log2csv(l):
    """Convert log API RequestLog object to csv."""
    root_pipeline_id = context.get(
    ).mapreduce_spec.mapper.params['root_pipeline_id']
    message(
        root_pipeline_id,
        '<span class="label label-warning">pending</span> MapperPipeline.log2csv'
    )
    yield '"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s"\n' % (
        l.start_time, l.method, l.resource, l.status, l.latency,
        l.response_size, l.was_loading_request, l.cost, l.user_agent
        if l.user_agent else "NULL", l.nickname if l.nickname else "NULL")
Example #55
 def write(self, data):
     ctx = context.get()
     pg_pool = ctx.get_pool('postgres_pool')
     if not pg_pool:
         pg_pool = _PostgresPool(ctx=ctx,
                                 host=self.host,
                                 port=self.port,
                                 database=self.database,
                                 user=self.user,
                                 password=self.password)
         ctx.register_pool('postgres_pool', pg_pool)
     pg_pool.append(data)
Example #56
  def __iter__(self):
    ctx = context.get()
    combiner = None

    if ctx:
      combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
      if combiner_spec:
        combiner = util.handler_for_name(combiner_spec)

    self.current_key = None
    self.current_values = None

    for binary_record in super(_ReducerReader, self).__iter__():
      proto = file_service_pb.KeyValues()
      proto.ParseFromString(binary_record)

      if self.current_key is None:
        self.current_key = proto.key()
        self.current_values = []
      else:
        assert proto.key() == self.current_key, (
            "inconsistent key sequence. Expected %s but got %s" %
            (self.current_key, proto.key()))

      if combiner:
        combiner_result = combiner(
            self.current_key, proto.value_list(), self.current_values)

        if not util.is_generator(combiner_result):
          raise errors.BadCombinerOutputError(
              "Combiner %s should yield values instead of returning them (%s)" %
              (combiner, combiner_result))

        self.current_values = []
        for value in combiner_result:
          if isinstance(value, operation.Operation):
            value(ctx)
          else:
            # with combiner current values always come from combiner
            self.current_values.append(value)
      else:
        # without combiner we just accumulate values.
        self.current_values.extend(proto.value_list())

      if not proto.partial():
        key = self.current_key
        values = self.current_values
        # This is final value, don't try to serialize it.
        self.current_key = None
        self.current_values = None
        yield (key, values)
      else:
        yield input_readers.ALLOW_CHECKPOINT
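The combiner_spec resolved above must name a generator that takes (key, newly read values, previously combined values) and yields combined values (or Operations). A minimal sketch of such a combiner that keeps a running sum, assuming count-like string values:

def sum_combiner(key, values, previously_combined_values):
    # Yield a single running total so each batch collapses to one value.
    total = sum(int(value) for value in values)
    total += sum(int(value) for value in previously_combined_values)
    yield str(total)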
Example #57
def reduceProcess(data_id, entities):
  # TODO: (Aruna) Fix these imports
  from melange.logic import cached_list
  from melange.utils import lists

  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params

  list_id = params['list_id']

  ndb.transaction(lambda: cached_list.setCacheItems(
      data_id, map(json.loads, entities), lists.getList(list_id).valid_period))