Esempio n. 1
0
  def _start_map(cls,
                 name,
                 mapper_spec,
                 mapreduce_params,
                 base_path=None,
                 queue_name=None,
                 eta=None,
                 countdown=None,
                 hooks_class_name=None,
                 _app=None,
                 transactional=False,
                 parent_entity=None):
    """See control.start_map.

    Validates the mapper's input reader (and output writer, when one is
    configured), builds a MapreduceSpec under a freshly generated id,
    optionally stores an initial MapreduceState, and enqueues the kickoff
    task through cls._add_kickoff_task.

    Returns:
      The id of the newly created mapreduce job.

    Raises:
      Exception: if parent_entity is supplied for a non-transactional start.
    """
    # A parent entity is only accepted together with a transactional start.
    if not transactional:
      if parent_entity:
        raise Exception("Parent shouldn't be specfied "
                        "for non-transactional starts.")

    # Validate reader first, then the (optional) writer.
    reader_cls = mapper_spec.input_reader_class()
    reader_cls.validate(mapper_spec)

    writer_cls = mapper_spec.output_writer_class()
    if writer_cls:
      writer_cls.validate(mapper_spec)

    mapreduce_id = model.MapreduceState.new_mapreduce_id()
    spec = model.MapreduceSpec(name,
                               mapreduce_id,
                               mapper_spec.to_json(),
                               mapreduce_params,
                               hooks_class_name)

    # Touch the handler attribute while a context for this spec is installed.
    # NOTE(review): presumably this resolves the handler so that a bad handler
    # spec fails here rather than in a worker task -- confirm.
    context.Context._set(context.Context(spec, None))
    try:
      mapper_spec.handler
    finally:
      context.Context._set(None)

    if not transactional:
      # Store the initial job state; it also becomes the kickoff task parent.
      state = model.MapreduceState.create_new(spec.mapreduce_id)
      state.mapreduce_spec = spec
      state.active = True
      state.active_shards = mapper_spec.shard_count
      if _app:
        state.app_id = _app
      state.put(config=util.create_datastore_write_config(spec))
      parent_entity = state

    cls._add_kickoff_task(base_path, spec, eta, countdown, parent_entity,
                          queue_name, transactional, _app)

    return mapreduce_id
Esempio n. 2
0
  def testProcessNamespace(self):
    """Test ProcessNamespace function.

    Runs a per-kind map job, then calls utils.ProcessNamespace('1') twice
    with context.get stubbed out, and checks that exactly one job is
    returned overall and that its stored spec carries the expected name,
    mapper params, handler and input reader.
    """
    # Create one entity in namespace "1", then go back to the default one.
    namespace_manager.set_namespace("1")
    TestEntity().put()
    namespace_manager.set_namespace(None)

    namespaces_jobs = utils.RunMapForKinds(
        self.operation,
        [TestEntity.kind()],
        'Test job for %(kind)s%(namespace)s',
        '__main__.foo',
        self.reader_class_spec,
        {'test_param': 1})
    testutil.execute_all_tasks(self.taskqueue)

    m = mox.Mox()
    m.StubOutWithMock(context, "get", use_mock_anything=True)

    # Both ProcessNamespace calls below are expected to call context.get()
    # once each; hand them a context built from the first job's spec.
    ctx = context.Context(
        model.MapreduceState.get_by_job_id(namespaces_jobs[0]).mapreduce_spec,
        None)
    context.get().AndReturn(ctx)
    context.get().AndReturn(ctx)

    m.ReplayAll()
    try:
      jobs = utils.ProcessNamespace('1')
      jobs.extend(utils.ProcessNamespace('1'))
      m.VerifyAll()
    finally:
      m.UnsetStubs()
    testutil.execute_all_tasks(self.taskqueue)

    # Two calls for the same namespace must yield a single job in total.
    # assertEqual is used instead of the deprecated assertEquals alias.
    self.assertEqual(1, len(jobs))
    job = jobs[0]
    state = model.MapreduceState.get_by_job_id(job)
    self.assertTrue(state)

    spec = state.mapreduce_spec
    self.assertTrue(spec)
    self.assertEqual("Test job for TestEntity in namespace 1", spec.name)
    mapper = spec.mapper
    self.assertTrue(mapper)
    self.assertEqual({'test_param': 1,
                      'entity_kind': TestEntity.kind(),
                      'namespaces': '1'},
                     mapper.params)
    self.assertEqual('__main__.foo', mapper.handler_spec)
    self.assertEqual(self.reader_class_spec, mapper.input_reader_spec)
Esempio n. 3
0
    def handle(self):
        """Handle request.

        Loads the shard state and the job control record, tries to acquire
        the shard lease, honours an ABORT command, and otherwise processes
        the shard's inputs with a mapreduce context installed. The context
        is always cleared again before returning.
        """
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(tstate.shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])

        # Nothing to do when the lease cannot be acquired.
        if not self._try_acquire_lease(shard_state, tstate):
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)

            # Record the abort on the shard and stop.
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            return

        if ndb is not None:
            # Both policies return False for every key, turning off ndb's
            # in-context cache and memcache use for this request.
            ndb_ctx = ndb.get_context()
            ndb_ctx.set_cache_policy(lambda key: False)
            ndb_ctx.set_memcache_policy(lambda key: False)

        context.Context._set(ctx)
        # NOTE(review): retry_shard is not consumed in the visible code; it
        # is kept because the _retry_logic call must still happen.
        retry_shard = False

        try:
            self.process_inputs(tstate.input_reader, shard_state, tstate, ctx)

            if not shard_state.active:
                # Finalize the writer only for a shard that ended with
                # RESULT_SUCCESS.
                if (shard_state.result_status
                        == model.ShardState.RESULT_SUCCESS
                        and tstate.output_writer):

                    tstate.output_writer.finalize(ctx, shard_state)

        # "except ... as" works on Python 2.6+ and 3; the comma form is
        # Python-2-only.
        except Exception as e:
            retry_shard = self._retry_logic(e, shard_state, tstate,
                                            spec.mapreduce_id)
        finally:
            # Never leave this request's context installed after the handler
            # returns or raises.
            context.Context._set(None)
    def testOp(self):
        """Test AllocateMaxId operation.

        Applies the operation to a fresh context, checks the max id recorded
        in the context's pool, then flushes and checks the id ranges recorded
        on the test case.
        """
        ctx = context.Context(None, None)
        allocate_op = copy_handler.AllocateMaxId(key('TestEntity', 30),
                                                 self.app_id)
        allocate_op(ctx)

        # Before the flush, the pool holds the pending max id per key path.
        expected_max_ids = {('TestEntity', 1): 30}
        pool = ctx.get_pool('allocate_max_id_test_app_pool')
        self.assertEqual(expected_max_ids, pool.key_path_to_max_id)

        ctx.flush()
        expected_ranges = [
            (key(u'TestEntity', 1, _app=u'test_app'), 1, 30),
        ]
        self.assertEqual(expected_ranges, self.allocated_id_ranges)
Esempio n. 5
0
    def handle(self):
        """Run one slice of mapper work for the shard named in the request.

        Drops the task when the shard state is missing, honours an ABORT
        command, and otherwise iterates the input reader (subject to quota
        when "enable_quota" is on), records wall time, flushes the context,
        finalizes the output writer once the shard is inactive, stores the
        shard state and reschedules the shard if it is still active.
        """
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        state_key = model.ShardState.get_key_by_shard_id(shard_id)
        control_key = model.MapreduceControl.get_key_by_job_id(
            spec.mapreduce_id)
        shard_state, control = db.get([state_key, control_key])
        if not shard_state:
            # Without a stored state there is nothing to work on.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        abort_requested = (
            control and control.command == model.MapreduceControl.ABORT)
        if abort_requested:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        # Quota is on unless the mapper params switch it off.
        quota_consumer = None
        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)

        context.Context._set(ctx)
        try:
            if not quota_consumer or quota_consumer.check():
                if quota_consumer and not quota_consumer.consume():
                    # No quota for even the first item: scan not completed.
                    completed = False
                else:
                    completed = True
                    for item in input_reader:
                        if isinstance(item, db.Model):
                            shard_state.last_work_item = repr(item.key())
                        else:
                            shard_state.last_work_item = repr(item)[:100]

                        proceed = bool(self.process_data(
                            item, input_reader, ctx, tstate))
                        # Consume quota for the next item only when
                        # processing asked to continue.
                        if quota_consumer and proceed:
                            proceed = quota_consumer.consume()
                        if not proceed:
                            completed = False
                            break

                if completed:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)

                    # Hand back one unit of quota before marking success.
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            elapsed_ms = int((time.time() - self._start_time) * 1000)
            operation.counters.Increment("mapper-walltime-msec",
                                         elapsed_ms)(ctx)

            ctx.flush()

            # Finalize the writer only once the shard has stopped.
            if not shard_state.active and tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)
            shard_state.put(config=util.create_datastore_write_config(spec))
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        if shard_state.active:
            self.reschedule(shard_state, tstate)
        gc.collect()
Esempio n. 6
0
    def handle(self):
        """Handle request.

        Drops the task when the shard state is missing, inactive, or ahead
        of the task's retry generation; raises when the stored state lags
        behind the task. Honours an ABORT command, and otherwise processes
        the shard's inputs with a mapreduce context installed. The context
        is always cleared again before returning.

        Raises:
          ValueError: if the stored shard state has fewer retries than the
            task's transient state.
        """
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:

            logging.error(
                "State not found for shard %s; Possible spurious task "
                "execution. Dropping this task.", shard_id)
            return

        if not shard_state.active:
            logging.error(
                "Shard %s is not active. Possible spurious task "
                "execution. Dropping this task.", shard_id)
            logging.error(str(shard_state))
            return
        if shard_state.retries > tstate.retries:
            # The task belongs to an older retry of this shard.
            logging.error(
                "Got shard %s from previous shard retry %s. Possible spurious "
                "task execution. Dropping this task.", shard_id,
                tstate.retries)
            logging.error(str(shard_state))
            return
        elif shard_state.retries < tstate.retries:
            # Interpolate the shard id into the message; passing it as a
            # second constructor argument left the "%s" unformatted.
            raise ValueError(
                "ShardState for %s is behind slice. Waiting for it to catch up"
                % (shard_state.shard_id,))

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)

            # Record the abort on the shard and on the job control, then stop.
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        if ndb is not None:
            # Both policies return False for every key, turning off ndb's
            # in-context cache and memcache use for this request.
            ndb_ctx = ndb.get_context()
            ndb_ctx.set_cache_policy(lambda key: False)
            ndb_ctx.set_memcache_policy(lambda key: False)

        context.Context._set(ctx)
        # NOTE(review): retry_shard is not consumed in the visible code; it
        # is kept because the _retry_logic call must still happen.
        retry_shard = False

        try:
            self.process_inputs(input_reader, shard_state, tstate, ctx)

            if not shard_state.active:
                # Finalize the writer only for a shard that ended with
                # RESULT_SUCCESS.
                if (shard_state.result_status
                        == model.ShardState.RESULT_SUCCESS
                        and tstate.output_writer):
                    tstate.output_writer.finalize(ctx, shard_state)

        # "except ... as" works on Python 2.6+ and 3; the comma form is
        # Python-2-only.
        except Exception as e:
            retry_shard = self._retry_logic(e, shard_state, tstate,
                                            spec.mapreduce_id)
        finally:
            # Never leave this request's context installed after the handler
            # returns or raises.
            context.Context._set(None)
Esempio n. 7
0
    def _start_map(cls,
                   name,
                   mapper_spec,
                   mapreduce_params,
                   base_path=None,
                   queue_name=None,
                   eta=None,
                   countdown=None,
                   hooks_class_name=None,
                   _app=None,
                   transactional=False,
                   parent_entity=None):
        """Validate the mapper spec and enqueue the kickoff task for a new job.

        Args:
          name: job name stored in the MapreduceSpec.
          mapper_spec: mapper specification; its input reader (and output
            writer, if any) are validated before anything is written.
          mapreduce_params: parameters stored in the MapreduceSpec.
          base_path: URL prefix of the handlers. It is concatenated with
            "/kickoffjob_callback", so the None default is not usable here
            and callers must pass a string.
          queue_name: task queue to use. Falls back to the
            HTTP_X_APPENGINE_QUEUENAME environment value, then "default".
            Names starting with "_" are replaced by "default".
          eta: passed through to the kickoff task.
          countdown: passed through to the kickoff task.
          hooks_class_name: hooks class name stored in the MapreduceSpec.
          _app: optional app id, stored on the state and sent as the "app"
            task parameter.
          transactional: if true, the caller is assumed to already run in a
            transaction and no MapreduceState is created here; otherwise
            the state write and task enqueue run in a new transaction.
          parent_entity: parent for the kickoff task; only allowed together
            with transactional.

        Returns:
          The id of the newly created mapreduce job.

        Raises:
          Exception: if parent_entity is supplied for a non-transactional
            start.
        """
        # Chained "or" also covers an environment value that is set but
        # empty, which used to raise IndexError on queue_name[0].
        queue_name = (queue_name
                      or os.environ.get("HTTP_X_APPENGINE_QUEUENAME")
                      or "default")
        if queue_name.startswith("_"):
            # NOTE(review): names with a leading underscore look like
            # special/internal queues -- confirm; user tasks go to "default".
            queue_name = "default"

        if not transactional and parent_entity:
            raise Exception("Parent shouldn't be specfied "
                            "for non-transactional starts.")

        mapper_input_reader_class = mapper_spec.input_reader_class()
        mapper_input_reader_class.validate(mapper_spec)

        mapper_output_writer_class = mapper_spec.output_writer_class()
        if mapper_output_writer_class:
            mapper_output_writer_class.validate(mapper_spec)

        mapreduce_id = model.MapreduceState.new_mapreduce_id()
        mapreduce_spec = model.MapreduceSpec(name, mapreduce_id,
                                             mapper_spec.to_json(),
                                             mapreduce_params,
                                             hooks_class_name)

        # Fetch the handler while a context for this spec is installed.
        # NOTE(review): presumably done so a bad handler spec fails here
        # rather than in a worker task -- confirm.
        ctx = context.Context(mapreduce_spec, None)
        context.Context._set(ctx)
        try:
            mapper_spec.get_handler()
        finally:
            context.Context._set(None)

        kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
        if _app:
            kickoff_params["app"] = _app
        kickoff_worker_task = util.HugeTask(url=base_path +
                                            "/kickoffjob_callback",
                                            params=kickoff_params,
                                            eta=eta,
                                            countdown=countdown)

        hooks = mapreduce_spec.get_hooks()
        config = util.create_datastore_write_config(mapreduce_spec)

        def start_mapreduce():
            """Store the initial state (if needed) and enqueue the kickoff."""
            parent = parent_entity
            if not transactional:
                # Store the initial job state; it becomes the task's parent.
                state = model.MapreduceState.create_new(
                    mapreduce_spec.mapreduce_id)
                state.mapreduce_spec = mapreduce_spec
                state.active = True
                state.active_shards = mapper_spec.shard_count
                if _app:
                    state.app_id = _app
                state.put(config=config)
                parent = state

            if hooks is not None:
                try:
                    hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
                except NotImplementedError:
                    # The hooks do not override enqueueing; fall through to
                    # the default enqueue below.
                    pass
                else:
                    return
            kickoff_worker_task.add(queue_name,
                                    transactional=True,
                                    parent=parent)

        if transactional:
            start_mapreduce()
        else:
            db.run_in_transaction(start_mapreduce)

        return mapreduce_id
Esempio n. 8
0
    def handle(self):
        """Handle request.

        Drops the task when the shard state is missing or inactive, honours
        an ABORT command, and otherwise iterates the input reader (subject
        to quota when "enable_quota" is on), finalizes the output writer
        once the shard is inactive, and writes the shard state back inside
        a transaction.

        NOTE(review): the outer try below has no except/finally clause in
        this excerpt; the method appears to be cut off after tx().
        """
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # Without a stored state there is nothing to work on.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        if not shard_state.active:
            logging.error(
                "Shard is not active. Looks like spurious task execution.")
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)

            # Record the abort on the shard and on the job control, then stop.
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        # Quota is on unless the mapper params switch it off.
        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        context.Context._set(ctx)
        try:
            # Process anything only when quota is off or check() passes.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                try:
                    # Consume quota for the first item before iterating.
                    if not quota_consumer or quota_consumer.consume():
                        for entity in input_reader:
                            # Remember the last item seen: the key repr for
                            # db models, otherwise a repr cut to 100 chars.
                            if isinstance(entity, db.Model):
                                shard_state.last_work_item = repr(entity.key())
                            else:
                                shard_state.last_work_item = repr(entity)[:100]

                            # A falsy process_data result stops the scan.
                            scan_aborted = not self.process_data(
                                entity, input_reader, ctx, tstate)

                            # Stop as well when quota for the next item
                            # cannot be consumed.
                            if (quota_consumer and not scan_aborted
                                    and not quota_consumer.consume()):
                                scan_aborted = True
                            if scan_aborted:
                                break
                    else:
                        scan_aborted = True

                    if not scan_aborted:
                        logging.info(
                            "Processing done for shard %d of job '%s'",
                            shard_state.shard_number, shard_state.mapreduce_id)

                        # Hand back one unit of quota before marking success.
                        if quota_consumer:
                            quota_consumer.put(1)
                        shard_state.active = False
                        shard_state.result_status = model.ShardState.RESULT_SUCCESS

                    operation.counters.Increment(
                        context.COUNTER_MAPPER_WALLTIME_MS,
                        int((time.time() - self._start_time) * 1000))(ctx)

                    ctx.flush()
                except errors.FailJobError, e:
                    # FailJobError marks the shard as failed instead of
                    # propagating.
                    logging.error("Job failed: %s", e)
                    scan_aborted = True
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_FAILED

            if not shard_state.active:
                # The shard has stopped: finalize the writer, if any.
                if tstate.output_writer:
                    tstate.output_writer.finalize(ctx,
                                                  shard_state.shard_number)

            config = util.create_datastore_write_config(spec)

            @db.transactional(retries=5)
            def tx():
                # Re-read the stored state inside the transaction. If it is
                # no longer active (or the test fault is injected), mark the
                # local state inactive and skip the write; otherwise copy
                # the local state over and store it.
                fresh_shard_state = db.get(
                    model.ShardState.get_key_by_shard_id(shard_id))
                if (not fresh_shard_state.active
                        or "worker_active_state_collision"
                        in _TEST_INJECTED_FAULTS):
                    shard_state.active = False
                    logging.error(
                        "Spurious task execution. Aborting the shard.")
                    return
                fresh_shard_state.copy_from(shard_state)
                fresh_shard_state.put(config=config)

            tx()
Esempio n. 9
0
  def handle(self):
    """Handle request.

    Drops the task when the shard state is missing or inactive, honours an
    ABORT command, and otherwise iterates the input reader (subject to quota
    when "enable_quota" is on), recording wall time and flushing the context.
    RetrySliceError is re-raised while the task retry count is within
    _RETRY_SLICE_ERROR_MAX_RETRIES; beyond that, and for FailJobError, the
    shard is marked failed.

    NOTE(review): the outer try below has no except/finally clause in this
    excerpt; the method appears to be cut off after the last handler.
    """
    tstate = model.TransientShardState.from_request(self.request)
    spec = tstate.mapreduce_spec
    self._start_time = self._time()
    shard_id = tstate.shard_id

    shard_state, control = db.get([
        model.ShardState.get_key_by_shard_id(shard_id),
        model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
    ])
    if not shard_state:
      # Without a stored state there is nothing to work on.

      logging.error("State not found for shard ID %r; shutting down",
                    shard_id)
      return

    if not shard_state.active:
      logging.error("Shard is not active. Looks like spurious task execution.")
      return

    ctx = context.Context(spec, shard_state,
                          task_retry_count=self.task_retry_count())

    if control and control.command == model.MapreduceControl.ABORT:
      logging.info("Abort command received by shard %d of job '%s'",
                   shard_state.shard_number, shard_state.mapreduce_id)

      # Record the abort on the shard and on the job control, then stop.
      shard_state.active = False
      shard_state.result_status = model.ShardState.RESULT_ABORTED
      shard_state.put(config=util.create_datastore_write_config(spec))
      model.MapreduceControl.abort(spec.mapreduce_id)
      return

    input_reader = tstate.input_reader

    # Quota is on unless the mapper params switch it off.
    if spec.mapper.params.get("enable_quota", True):
      quota_consumer = quota.QuotaConsumer(
          quota.QuotaManager(memcache.Client()),
          shard_id,
          _QUOTA_BATCH_SIZE)
    else:
      quota_consumer = None

    # Both policies return False for every key, turning off ndb's in-context
    # cache and memcache use for this request.
    if ndb is not None:
      ndb_ctx = ndb.get_context()
      ndb_ctx.set_cache_policy(lambda key: False)
      ndb_ctx.set_memcache_policy(lambda key: False)

    context.Context._set(ctx)
    try:

      # Process anything only when quota is off or check() passes.
      if not quota_consumer or quota_consumer.check():
        scan_aborted = False
        entity = None

        try:

          # Consume quota for the first item before iterating.
          if not quota_consumer or quota_consumer.consume():
            for entity in input_reader:
              # Remember the last item seen: the key repr for db models,
              # otherwise a repr cut to 100 characters.
              if isinstance(entity, db.Model):
                shard_state.last_work_item = repr(entity.key())
              else:
                shard_state.last_work_item = repr(entity)[:100]

              # A falsy process_data result stops the scan.
              scan_aborted = not self.process_data(
                  entity, input_reader, ctx, tstate)

              # Stop as well when quota for the next item cannot be consumed.
              if (quota_consumer and not scan_aborted and
                  not quota_consumer.consume()):
                scan_aborted = True
              if scan_aborted:
                break
          else:
            scan_aborted = True

          if not scan_aborted:
            logging.info("Processing done for shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)

            # Hand back one unit of quota before marking success.
            if quota_consumer:
              quota_consumer.put(1)
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_SUCCESS

          operation.counters.Increment(
              context.COUNTER_MAPPER_WALLTIME_MS,
              int((time.time() - self._start_time)*1000))(ctx)

          ctx.flush()
        except errors.RetrySliceError, e:
          logging.error("Slice error: %s", e)
          # The retry count is read from the request environment; a missing
          # or empty value counts as 0.
          retry_count = int(
              os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
          # Re-raise while within the retry limit.
          if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES:
            raise
          logging.error("Too many retries: %d, failing the job", retry_count)
          scan_aborted = True
          shard_state.active = False
          shard_state.result_status = model.ShardState.RESULT_FAILED
        except errors.FailJobError, e:
          # FailJobError marks the shard as failed instead of propagating.
          logging.error("Job failed: %s", e)
          scan_aborted = True
          shard_state.active = False
          shard_state.result_status = model.ShardState.RESULT_FAILED
    def handle(self):
        """Run one slice of mapper work for the shard named in the request.

        Drops the task when the shard state is missing, honours an ABORT
        command, and otherwise iterates the input reader (subject to quota
        when "enable_quota" is on), flushes the context and reschedules the
        shard if it is still active.
        """
        spec = model.MapreduceSpec.from_json_str(
            self.request.get("mapreduce_spec"))
        self._start_time = self._time()
        shard_id = self.shard_id()

        logging.debug("post: shard=%s slice=%s headers=%s", shard_id,
                      self.slice_id(), self.request.headers)

        state_key = model.ShardState.get_key_by_shard_id(shard_id)
        control_key = model.MapreduceControl.get_key_by_job_id(
            spec.mapreduce_id)
        shard_state, control = db.get([state_key, control_key])
        if not shard_state:
            # Without a stored state there is nothing to work on.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        abort_requested = (
            control and control.command == model.MapreduceControl.ABORT)
        if abort_requested:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = self.input_reader(spec.mapper)

        # Quota is on unless the mapper params switch it off.
        quota_consumer = None
        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())
        context.Context._set(ctx)

        try:
            if not quota_consumer or quota_consumer.check():
                if quota_consumer and not quota_consumer.consume():
                    # No quota for even the first item: scan not completed.
                    completed = False
                else:
                    completed = True
                    for item in input_reader:
                        if isinstance(item, db.Model):
                            shard_state.last_work_item = repr(item.key())
                        else:
                            shard_state.last_work_item = repr(item)[:100]

                        proceed = bool(self.process_entity(item, ctx))
                        # Consume quota for the next item only when
                        # processing asked to continue.
                        if quota_consumer and proceed:
                            proceed = quota_consumer.consume()
                        if not proceed:
                            completed = False
                            break

                if completed:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)
                    # Hand back one unit of quota before marking success.
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            ctx.flush()
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        if shard_state.active:
            self.reschedule(spec, input_reader)