Example 1
    def testCreateWritersWithRetries(self):
        mapreduce_state = self.create_mapreduce_state(
            output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        # Create the writer for the 1st attempt
        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number,
                                        shard_state.retries + 1)
        new_filename = writer._get_filename_for_test()
        writer.begin_slice(None)
        writer.write("initData")
        writer.end_slice(None)

        orig_json = writer.to_json()

        writer = self.WRITER_CLS.from_json(orig_json)
        writer.begin_slice(None)
        writer.write(
            "badData")  # we fail here so this data should be discarded

        # Recreate the same writer (simulates a slice retry).
        writer = self.WRITER_CLS.from_json(orig_json)
        writer.begin_slice(None)
        writer.write("goodData")
        writer.end_slice(None)
        writer = self._serialize_and_deserialize(writer)
        writer.finalize(ctx, shard_state)

        # Verify the badData is not in the final file
        self.assertEqual("initDatagoodData",
                         cloudstorage.open(new_filename).read())
Example 2
  def testCreateWritersWithRetries(self):
    mapreduce_state = self.create_mapreduce_state(
        output_params=
        {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
    shard_state = self.create_shard_state(0)
    ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
    context.Context._set(ctx)

    # Create the writer for the 1st attempt
    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard_state.shard_number,
                                    shard_state.retries + 1)
    filename = writer._filename
    writer.write("badData")

    # Test re-creating the writer for a retry
    shard_state.reset_for_retry()
    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard_state.shard_number,
                                    shard_state.retries + 1)
    new_filename = writer._filename
    good_data = "goodData"
    writer.write(good_data)
    writer.finalize(None, shard_state)

    # Verify the retry has a different filename
    self.assertNotEqual(filename, new_filename)

    # Verify the badData is not in the final file
    self.assertEqual(good_data, cloudstorage.open(new_filename).read())
Example 3
  def testWriterSerialization(self):
    mapreduce_state = self.create_mapreduce_state(
        output_params=
        {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
    shard_state = self.create_shard_state(0)
    ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
    context.Context._set(ctx)

    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard_state.shard_number, 0)
    # data explicitly contains binary data
    data = "\"fake\"\tdatathatishardtoencode"
    writer.write(data)

    # Serialize/deserialize writer after some data written
    writer = self.WRITER_CLS.from_json(writer.to_json())
    writer.write(data)

    # Serialize/deserialize writer after more data written
    writer = self.WRITER_CLS.from_json(writer.to_json())
    writer.finalize(None, shard_state)

    # Serialize/deserialize writer after finalization
    writer = self.WRITER_CLS.from_json(writer.to_json())
    self.assertRaises(IOError, writer.write, data)

    filename = self.WRITER_CLS._get_filename(shard_state)

    self.assertNotEquals(None, filename)
    self.assertEqual(data + data, cloudstorage.open(filename).read())
Example 4
    def testSmoke(self):
        tmp_files = set()
        final_files = set()
        for shard_num in range(self.NUM_SHARDS):
            shard = self.create_shard_state(shard_num)
            writer = self.WRITER_CLS.create(self.mr_state.mapreduce_spec,
                                            shard.shard_number, 0)
            # Verify files are created under tmp dir.
            tmp_file = writer._streaming_buffer.name
            self.assertTrue(self.WRITER_CLS._MR_TMP in tmp_file)
            tmp_files.add(tmp_file)
            cxt = context.Context(self.mr_state.mapreduce_spec, shard)
            writer.finalize(cxt, shard)
            # Verify the integrity of writer state.
            self.assertEqual(
                writer._streaming_buffer.name,
                (shard.writer_state[self.WRITER_CLS._SEG_PREFIX] +
                 str(shard.writer_state[self.WRITER_CLS._LAST_SEG_INDEX])))
            final_file = shard.writer_state["filename"]
            self.assertFalse(self.WRITER_CLS._MR_TMP in final_file)
            final_files.add(final_file)

        # Verify all filenames are different.
        self.assertEqual(self.NUM_SHARDS, len(tmp_files))
        self.assertEqual(self.NUM_SHARDS, len(final_files))
Example 5
    def setUp(self):
        """Sets up the test harness."""
        testutil.setup_for_testing()
        self.mapper = offline_jobs.CountSubscribers()
        self.callback = 'http://foo.callback-example.com/my-callback-url'
        self.topic = 'http://example.com/my-topic-url'
        self.token = 'token'
        self.secret = 'my secrat'
        # These are intentionally not raw strings, since they are passed
        # through escaped in the mapreduce.yaml.
        self.topic_pattern = '^http://example\\.com/.*$'
        self.callback_pattern = (
            'http(?:s)?://(?:[^\\.]+\\.)*([^\\./]+\.[^\\./]+)(?:/.*)?')

        class FakeMapper(object):
            params = {
                'topic_pattern': self.topic_pattern,
                'callback_pattern': self.callback_pattern,
            }

        class FakeSpec(object):
            mapreduce_id = '1234'
            mapper = FakeMapper()

        self.context = context.Context(FakeSpec(), None)
        context.Context._set(self.context)
Example 6
    def testRecoverSomethingWrittenInFailedInstance(self):
        mr_spec = self.mr_state.mapreduce_spec
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mr_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mr_spec, 0, 0)
        writer.write("123")
        writer = self.WRITER_CLS.from_json(writer.to_json())
        writer.write("4")

        new_writer = writer._recover(mr_spec, 0, 0)
        # Old instance is finalized and valid offset saved.
        old_stat = cloudstorage.stat(writer._streaming_buffer.name)
        self.assertEqual(len("123"),
                         int(old_stat.metadata[self.WRITER_CLS._VALID_LENGTH]))
        # New instance is created with an incremented seg index.
        self.assertEqual(writer._seg_index + 1, new_writer._seg_index)

        # Verify filenames.
        self.assertTrue(
            writer._streaming_buffer.name.endswith(str(writer._seg_index)))
        self.assertTrue(
            new_writer._streaming_buffer.name.endswith(
                str(new_writer._seg_index)))
Example 7
  def testGetSetContext(self):
    """Test module's get_context and _set functions."""
    ctx = context.Context(None, None)
    self.assertFalse(context.get())
    context.Context._set(ctx)
    self.assertEquals(ctx, context.get())
    context.Context._set(None)
    self.assertEquals(None, context.get())
Example 8
    def testRemoveGarbage(self):
        """Make sure abandoned files get removed."""
        writer_spec = {
            self.WRITER_CLS.BUCKET_NAME_PARAM: "unused",
            self.WRITER_CLS.TMP_BUCKET_NAME_PARAM: "test"
        }
        mapreduce_state = self.create_mapreduce_state(
            output_params=writer_spec)
        shard_state = self.create_shard_state(1)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number, 0)
        writer.begin_slice(None)

        # our shard
        our_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-1-very-random"
        f = cloudstorage.open(our_file, "w")
        f.write("foo?")
        f.close()

        # not our shard
        their_file = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-3-very-random"
        f = cloudstorage.open(their_file, "w")
        f.write("bar?")
        f.close()

        # unrelated file
        real_file = "/test/this_things_should_survive"
        f = cloudstorage.open(real_file, "w")
        f.write("yes, foobar!")
        f.close()

        # Make sure bogus file still exists
        names = [l.filename for l in cloudstorage.listbucket("/test")]
        self.assertTrue(our_file in names)
        self.assertTrue(their_file in names)
        self.assertTrue(real_file in names)

        # slice end should clean up the garbage
        writer = self._serialize_and_deserialize(writer)

        names = [l.filename for l in cloudstorage.listbucket("/test")]
        self.assertFalse(our_file in names)
        self.assertTrue(their_file in names)
        self.assertTrue(real_file in names)

        # finalize shouldn't change anything
        writer.finalize(ctx, shard_state)
        names = [l.filename for l in cloudstorage.listbucket("/test")]
        self.assertFalse(our_file in names)
        self.assertTrue(their_file in names)
        self.assertTrue(real_file in names)
Example 9
    def testRemovingIgnoredNonExistent(self):
        writer_spec = {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"}
        mapreduce_state = self.create_mapreduce_state(
            output_params=writer_spec)
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number, 0)
        writer._remove_tmpfile(None, writer_spec)  # no exceptions
        writer._remove_tmpfile("/test/i_dont_exist", writer_spec)
Example 10
    def testRecoverNothingWrittenInFailedInstance(self):
        mr_spec = self.mr_state.mapreduce_spec
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mr_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mr_spec, 0, 0)
        self.assertEqual(0, writer._seg_index)
        new_writer = writer._recover(mr_spec, 0, 0)
        # Old instance is not finalized.
        self.assertFalse(writer._streaming_buffer.closed)
        # seg index is not incremented.
        self.assertEqual(0, new_writer._seg_index)
Example 11
  def setupWriter(self):
    """Create a Google Cloud Storage LevelDB record output writer.

    Returns:
      a model.MapreduceSpec.
    """
    self.mapreduce_state = self.create_mapreduce_state()
    self.shard_state = self.create_shard_state(0)
    self.writer = self.WRITER_CLS.create(self.mapreduce_state.mapreduce_spec,
                                         self.shard_state.shard_number,
                                         self.shard_state.retries + 1)
    self.ctx = context.Context(self.mapreduce_state.mapreduce_spec,
                               self.shard_state)
    context.Context._set(self.ctx)
Example 12
  def testMutationPoolSize(self):
    ctx = context.Context(None, None)
    self.assertEquals(context.MAX_ENTITY_COUNT,
                      ctx.mutation_pool.max_entity_count)
    self.assertEquals(context.MAX_POOL_SIZE,
                      ctx.mutation_pool.max_pool_size)

    ctx = context.Context(None, None, task_retry_count=0)
    self.assertEquals(context.MAX_ENTITY_COUNT,
                      ctx.mutation_pool.max_entity_count)
    self.assertEquals(context.MAX_POOL_SIZE,
                      ctx.mutation_pool.max_pool_size)

    ctx = context.Context(None, None, task_retry_count=1)
    self.assertEquals(context.MAX_ENTITY_COUNT / 2,
                      ctx.mutation_pool.max_entity_count)
    self.assertEquals(context.MAX_POOL_SIZE / 2,
                      ctx.mutation_pool.max_pool_size)

    ctx = context.Context(None, None, task_retry_count=4)
    self.assertEquals(context.MAX_ENTITY_COUNT / 16,
                      ctx.mutation_pool.max_entity_count)
    self.assertEquals(context.MAX_POOL_SIZE / 16,
                      ctx.mutation_pool.max_pool_size)
Example 13
    def testFinalizeChecksForErrors(self):
        """Make sure finalize fails when a slice is still in progress."""
        mapreduce_state = self.create_mapreduce_state(
            output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number, 0)
        writer.begin_slice(None)
        writer.write("foobar")
        # We wrote something, finalize must fail (sanity check).
        self.assertRaises(errors.FailJobError, writer.finalize, ctx,
                          shard_state)
Example 14
    def testSerialization(self):
        mr_spec = self.mr_state.mapreduce_spec
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mr_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mr_spec, 0, 0)
        writer._seg_index = 1
        writer.write("abcde")

        writer = self.WRITER_CLS.from_json_str(writer.to_json_str())
        # _seg_index doesn't change.
        self.assertEqual(1, writer._seg_index)
        # _seg_valid_length is updated to what was in the buffer.
        self.assertEqual(len("abcde"), writer._seg_valid_length)
Example 15
    def testTmpfileName(self):
        writer_spec = {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"}
        mapreduce_state = self.create_mapreduce_state(
            output_params=writer_spec)
        shard_state = self.create_shard_state(19)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number, 0)
        writer.begin_slice(None)

        prefix = "/test/gae_mr_tmp/DummyMapReduceJobId-tmp-19-"
        tmpfile_name = writer.status.tmpfile.name
        self.assertTrue(tmpfile_name.startswith(prefix),
                        "Test file name is: %s" % tmpfile_name)
Example 16
    def testTmpDefaultsToMain(self):
        writer_spec = {
            self.WRITER_CLS.BUCKET_NAME_PARAM: "bucket",
            self.WRITER_CLS._ACCOUNT_ID_PARAM: "account"
        }
        mapreduce_state = self.create_mapreduce_state(
            output_params=writer_spec)
        shard_state = self.create_shard_state(1)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number, 0)

        self.assertEquals("bucket", writer._get_tmp_gcs_bucket(writer_spec))
        self.assertEquals("account", writer._get_tmp_account_id(writer_spec))
Example 17
  def testWriterCounters(self):
    mapreduce_state = self.create_mapreduce_state(
        output_params=
        {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
    shard_state = self.create_shard_state(0)
    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard_state.shard_number, 0)
    ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
    context.Context._set(ctx)

    # Write a large amount of data to ensure measurable time passes during the write.
    data = "d" * 1024 * 1024 * 10
    writer.write(data)
    self.assertEqual(len(data), shard_state.counters_map.get(
        output_writers.COUNTER_IO_WRITE_BYTES))
    self.assertTrue(shard_state.counters_map.get(
        output_writers.COUNTER_IO_WRITE_MSEC) > 0)
Example 18
  def testWriter(self):
    mapreduce_state = self.create_mapreduce_state(
        output_params=
        {self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
    shard_state = self.create_shard_state(0)
    ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
    context.Context._set(ctx)

    writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                    shard_state.shard_number, 0)
    data = "fakedata"
    writer.write(data)
    writer.finalize(None, shard_state)
    filename = self.WRITER_CLS._get_filename(shard_state)

    self.assertNotEquals(None, filename)
    self.assertEqual(data, cloudstorage.open(filename).read())
Example 19
    def testCreateWriters(self):
        mapreduce_state = self.create_mapreduce_state(
            output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
        for shard_num in range(self.NUM_SHARDS):
            shard = self.create_shard_state(shard_num)
            writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                            shard.shard_number, 0)
            cxt = context.Context(mapreduce_state.mapreduce_spec, shard)
            shard.result_status = model.ShardState.RESULT_SUCCESS
            writer.finalize(cxt, shard)
            shard.put()
        filenames = self.WRITER_CLS.get_filenames(mapreduce_state)
        # Verify we have the correct number of filenames
        self.assertEqual(self.NUM_SHARDS, len(filenames))

        # Verify each has a unique filename
        self.assertEqual(self.NUM_SHARDS, len(set(filenames)))
Example 20
    def testIncrement(self):
        """Test applying Increment operation."""
        m = mox.Mox()

        ctx = context.Context(None, None)
        ctx.counters = m.CreateMock(context.Counters)

        operation = op.counters.Increment("test", 12)

        # Record calls
        ctx.counters.increment("test", 12)

        m.ReplayAll()
        try:  # test, verify
            operation(ctx)
            m.VerifyAll()
        finally:
            m.UnsetStubs()
Example 21
  def testDelete(self):
    """Test applying Delete operation."""
    m = mox.Mox()

    ctx = context.Context(None, None)
    ctx.mutation_pool = m.CreateMock(context.MutationPool)

    entity = TestEntity()
    operation = op.db.Delete(entity)

    # Record calls
    ctx.mutation_pool.delete(entity)

    m.ReplayAll()
    try:  # test, verify
      operation(ctx)
      m.VerifyAll()
    finally:
      m.UnsetStubs()
Example 22
    def testArbitraryPool(self):
        """Test arbitrary pool registration."""
        m = mox.Mox()

        ctx = context.Context(None, None)
        self.assertFalse(ctx.get_pool("test"))
        pool = m.CreateMockAnything()
        ctx.register_pool("test", pool)
        self.assertEquals(pool, ctx.get_pool("test"))

        # Record calls
        pool.flush()

        m.ReplayAll()
        try:  # test, verify
            ctx.flush()
            m.VerifyAll()
        finally:
            m.UnsetStubs()
Example 23
    def testWriterMetadata(self):
        test_acl = "test-acl"
        test_content_type = "test-mime"
        mapreduce_state = self.create_mapreduce_state(
            output_params={
                self.WRITER_CLS.BUCKET_NAME_PARAM: "test",
                self.WRITER_CLS.ACL_PARAM: test_acl,
                self.WRITER_CLS.CONTENT_TYPE_PARAM: test_content_type
            })
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number, 0)
        writer = self.WRITER_CLS.from_json(writer.to_json())
        writer.finalize(ctx, shard_state)

        filename = self.WRITER_CLS._get_filename(shard_state)

        file_stat = cloudstorage.stat(filename)
        self.assertEqual(test_content_type, file_stat.content_type)
Example 24
  def setUp(self):
    """Sets up the test harness."""
    testutil.setup_for_testing()
    self.mapper = offline_jobs.SubscriptionReconfirmMapper()
    self.callback = 'http://example.com/my-callback-url'
    self.topic = 'http://example.com/my-topic-url'
    self.token = 'token'
    self.secret = 'my secrat'

    self.now = datetime.datetime.utcnow()
    self.threshold_seconds = 1000
    self.threshold_timestamp = (
        time.mktime(self.now.utctimetuple()) + self.threshold_seconds)
    self.getnow = lambda: self.now

    class FakeMapper(object):
      params = {'threshold_timestamp': str(self.threshold_timestamp)}
    class FakeSpec(object):
      mapreduce_id = '1234'
      mapper = FakeMapper()
    self.context = context.Context(FakeSpec(), None)
    context.Context._set(self.context)
Example 25
    def testTemporaryFilesGetCleanedUp(self):
        mapreduce_state = self.create_mapreduce_state(
            output_params={self.WRITER_CLS.BUCKET_NAME_PARAM: "test"})
        shard_state = self.create_shard_state(0)
        ctx = context.Context(mapreduce_state.mapreduce_spec, shard_state)
        context.Context._set(ctx)

        writer = self.WRITER_CLS.create(mapreduce_state.mapreduce_spec,
                                        shard_state.shard_number, 0)
        writer.begin_slice(None)
        writer.write("foo")
        writer = self.WRITER_CLS.from_json(writer.to_json())
        writer.write("bar")
        writer = self.WRITER_CLS.from_json(writer.to_json())
        writer.write("foo again")
        writer = self.WRITER_CLS.from_json(writer.to_json())
        writer.finalize(ctx, shard_state)

        names = [l.filename for l in cloudstorage.listbucket("/test")]
        self.assertEquals(
            ["/test/DummyMapReduceJobName/DummyMapReduceJobId/output-0"],
            names)
Example 26
    def post(self):
        """Handle post request."""
        spec = model.MapreduceSpec.from_json_str(
            self.request.get("mapreduce_spec"))
        self._start_time = self._time()
        shard_id = self.shard_id()

        # TODO(user): Make this prettier
        logging.debug("post: shard=%s slice=%s headers=%s", shard_id,
                      self.slice_id(), self.request.META)

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.save()
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = self.input_reader(spec.mapper)

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(cache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        ctx = context.Context(spec, shard_state)
        context.Context._set(ctx)

        try:
            # consume quota ahead, because we do not want to run a datastore
            # query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                # We shouldn't fetch an entity from the reader if there's not enough
                # quota to process it. Perform all quota checks proactively.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        scan_aborted = not self.process_entity(entity, ctx)

                        # Check if we've got enough quota for the next entity.
                        if (quota_consumer and not scan_aborted
                                and not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)
                    # We consumed extra quota item at the end of for loop.
                    # Just be nice here and give it back :)
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            # TODO(user): Mike said we don't want this to happen in case of an
            # exception while scanning. Figure out when it's appropriate to skip.
            ctx.flush()
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        # Rescheduling work should always be the last statement. It shouldn't happen
        # if there were any exceptions in code before it.
        if shard_state.active:
            self.reschedule(spec, input_reader)
Example 27
    def handle(self):
        """Handle request."""
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        context.Context._set(ctx)
        try:
            # consume quota ahead, because we do not want to run a datastore
            # query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                # We shouldn't fetch an entity from the reader if there's not enough
                # quota to process it. Perform all quota checks proactively.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        scan_aborted = not self.process_data(
                            entity, input_reader, ctx, tstate)

                        # Check if we've got enough quota for the next entity.
                        if (quota_consumer and not scan_aborted
                                and not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)
                    # We consumed extra quota item at the end of for loop.
                    # Just be nice here and give it back :)
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            operation.counters.Increment(
                "mapper-walltime-msec",
                int((time.time() - self._start_time) * 1000))(ctx)

            # TODO(user): Mike said we don't want this to happen in case of an
            # exception while scanning. Figure out when it's appropriate to skip.
            ctx.flush()

            if not shard_state.active:
                # shard is going to stop. Finalize output writer if any.
                if tstate.output_writer:
                    tstate.output_writer.finalize(ctx,
                                                  shard_state.shard_number)
            shard_state.put(config=util.create_datastore_write_config(spec))
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        # Rescheduling work should always be the last statement. It shouldn't happen
        # if there were any exceptions in code before it.
        if shard_state.active:
            self.reschedule(shard_state, tstate)
        gc.collect()
Example 28
    def _start_map(cls,
                   name,
                   mapper_spec,
                   mapreduce_params,
                   base_path=None,
                   queue_name=None,
                   eta=None,
                   countdown=None,
                   hooks_class_name=None,
                   _app=None,
                   transactional=False,
                   parent_entity=None):
        queue_name = queue_name or os.environ.get("HTTP_X_APPENGINE_QUEUENAME",
                                                  "default")
        if queue_name[0] == "_":
            # We are currently in some special queue. E.g. __cron.
            queue_name = "default"

        if not transactional and parent_entity:
            raise Exception("Parent shouldn't be specified "
                            "for non-transactional starts.")

        # Check that reader can be instantiated and is configured correctly
        mapper_input_reader_class = mapper_spec.input_reader_class()
        mapper_input_reader_class.validate(mapper_spec)

        mapper_output_writer_class = mapper_spec.output_writer_class()
        if mapper_output_writer_class:
            mapper_output_writer_class.validate(mapper_spec)

        mapreduce_id = model.MapreduceState.new_mapreduce_id()
        mapreduce_spec = model.MapreduceSpec(name, mapreduce_id,
                                             mapper_spec.to_json(),
                                             mapreduce_params,
                                             hooks_class_name)

        # Check that handler can be instantiated.
        ctx = context.Context(mapreduce_spec, None)
        context.Context._set(ctx)
        try:
            mapper_spec.get_handler()
        finally:
            context.Context._set(None)

        kickoff_params = {"mapreduce_spec": mapreduce_spec.to_json_str()}
        if _app:
            kickoff_params["app"] = _app
        kickoff_worker_task = util.HugeTask(url=base_path +
                                            "/kickoffjob_callback",
                                            params=kickoff_params,
                                            eta=eta,
                                            countdown=countdown)

        hooks = mapreduce_spec.get_hooks()
        config = util.create_datastore_write_config(mapreduce_spec)

        def start_mapreduce():
            parent = parent_entity
            if not transactional:
                # Save state in datastore so that UI can see it.
                # We can't save state in foreign transaction, but conventional UI
                # doesn't ask for transactional starts anyway.
                state = model.MapreduceState.create_new(
                    mapreduce_spec.mapreduce_id)
                state.mapreduce_spec = mapreduce_spec
                state.active = True
                state.active_shards = mapper_spec.shard_count
                if _app:
                    state.app_id = _app
                state.put(config=config)
                parent = state

            if hooks is not None:
                try:
                    hooks.enqueue_kickoff_task(kickoff_worker_task, queue_name)
                except NotImplementedError:
                    # Use the default task addition implementation.
                    pass
                else:
                    return
            kickoff_worker_task.add(queue_name,
                                    transactional=True,
                                    parent=parent)

        if transactional:
            start_mapreduce()
        else:
            db.run_in_transaction(start_mapreduce)

        return mapreduce_id
Example 29
    def handle(self):
        """Handle request."""
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        if not shard_state.active:
            logging.error(
                "Shard is not active. Looks like spurious task execution.")
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            # NOTE: When aborting, specifically do not finalize the output writer
            # because it might be in a bad state.
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        # Tell NDB to never cache anything in memcache or in-process. This ensures
        # that entities fetched from Datastore input_readers via NDB will not bloat
        # up the request memory size and Datastore Puts will avoid doing calls
        # to memcache. Without this you get soft memory limit exits, which hurts
        # overall throughput.
        if ndb is not None:
            ndb_ctx = ndb.get_context()
            ndb_ctx.set_cache_policy(lambda key: False)
            ndb_ctx.set_memcache_policy(lambda key: False)

        context.Context._set(ctx)
        try:
            # consume quota ahead, because we do not want to run a datastore
            # query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                try:
                    # We shouldn't fetch an entity from the reader if there's not enough
                    # quota to process it. Perform all quota checks proactively.
                    if not quota_consumer or quota_consumer.consume():
                        for entity in input_reader:
                            if isinstance(entity, db.Model):
                                shard_state.last_work_item = repr(entity.key())
                            else:
                                shard_state.last_work_item = repr(entity)[:100]

                            scan_aborted = not self.process_data(
                                entity, input_reader, ctx, tstate)

                            # Check if we've got enough quota for the next entity.
                            if (quota_consumer and not scan_aborted
                                    and not quota_consumer.consume()):
                                scan_aborted = True
                            if scan_aborted:
                                break
                    else:
                        scan_aborted = True

                    if not scan_aborted:
                        logging.info(
                            "Processing done for shard %d of job '%s'",
                            shard_state.shard_number, shard_state.mapreduce_id)
                        # We consumed extra quota item at the end of for loop.
                        # Just be nice here and give it back :)
                        if quota_consumer:
                            quota_consumer.put(1)
                        shard_state.active = False
                        shard_state.result_status = model.ShardState.RESULT_SUCCESS

                    operation.counters.Increment(
                        context.COUNTER_MAPPER_WALLTIME_MS,
                        int((time.time() - self._start_time) * 1000))(ctx)

                    # TODO(user): Mike said we don't want this to happen in case of an
                    # exception while scanning. Figure out when it's appropriate to skip.
                    ctx.flush()
                except errors.RetrySliceError, e:
                    logging.error("Slice error: %s", e)
                    retry_count = int(
                        os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
                    if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES:
                        raise
                    logging.error("Too many retries: %d, failing the job",
                                  retry_count)
                    scan_aborted = True
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_FAILED
                except errors.FailJobError, e:
                    logging.error("Job failed: %s", e)
                    scan_aborted = True
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_FAILED
Example 30
    def handle(self):
        """Handle request."""
        tstate = model.TransientShardState.from_request(self.request)
        spec = tstate.mapreduce_spec
        self._start_time = self._time()
        shard_id = tstate.shard_id

        shard_state, control = db.get([
            model.ShardState.get_key_by_shard_id(shard_id),
            model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
        ])
        if not shard_state:
            # We're letting this task die. It's up to the controller code to
            # reinitialize and restart the task.
            logging.error("State not found for shard ID %r; shutting down",
                          shard_id)
            return

        if not shard_state.active:
            logging.error(
                "Shard is not active. Looks like spurious task execution.")
            return

        ctx = context.Context(spec,
                              shard_state,
                              task_retry_count=self.task_retry_count())

        if control and control.command == model.MapreduceControl.ABORT:
            logging.info("Abort command received by shard %d of job '%s'",
                         shard_state.shard_number, shard_state.mapreduce_id)
            if tstate.output_writer:
                tstate.output_writer.finalize(ctx, shard_state.shard_number)
            # We received a command to abort. We don't care if we override
            # some data.
            shard_state.active = False
            shard_state.result_status = model.ShardState.RESULT_ABORTED
            shard_state.put(config=util.create_datastore_write_config(spec))
            model.MapreduceControl.abort(spec.mapreduce_id)
            return

        input_reader = tstate.input_reader

        if spec.mapper.params.get("enable_quota", True):
            quota_consumer = quota.QuotaConsumer(
                quota.QuotaManager(memcache.Client()), shard_id,
                _QUOTA_BATCH_SIZE)
        else:
            quota_consumer = None

        context.Context._set(ctx)
        try:
            # consume quota ahead, because we do not want to run a datastore
            # query if there's not enough quota for the shard.
            if not quota_consumer or quota_consumer.check():
                scan_aborted = False
                entity = None

                # We shouldn't fetch an entity from the reader if there's not enough
                # quota to process it. Perform all quota checks proactively.
                if not quota_consumer or quota_consumer.consume():
                    for entity in input_reader:
                        if isinstance(entity, db.Model):
                            shard_state.last_work_item = repr(entity.key())
                        else:
                            shard_state.last_work_item = repr(entity)[:100]

                        scan_aborted = not self.process_data(
                            entity, input_reader, ctx, tstate)

                        # Check if we've got enough quota for the next entity.
                        if (quota_consumer and not scan_aborted
                                and not quota_consumer.consume()):
                            scan_aborted = True
                        if scan_aborted:
                            break
                else:
                    scan_aborted = True

                if not scan_aborted:
                    logging.info("Processing done for shard %d of job '%s'",
                                 shard_state.shard_number,
                                 shard_state.mapreduce_id)
                    # We consumed extra quota item at the end of for loop.
                    # Just be nice here and give it back :)
                    if quota_consumer:
                        quota_consumer.put(1)
                    shard_state.active = False
                    shard_state.result_status = model.ShardState.RESULT_SUCCESS

            operation.counters.Increment(
                context.COUNTER_MAPPER_WALLTIME_MS,
                int((time.time() - self._start_time) * 1000))(ctx)

            # TODO(user): Mike said we don't want this to happen in case of an
            # exception while scanning. Figure out when it's appropriate to skip.
            ctx.flush()

            if not shard_state.active:
                # shard is going to stop. Finalize output writer if any.
                if tstate.output_writer:
                    tstate.output_writer.finalize(ctx,
                                                  shard_state.shard_number)

            config = util.create_datastore_write_config(spec)
            # We don't want shard state to override active state, since that
            # may stall job execution (see issue 116). Do a transactional
            # verification for status.
            # TODO(user): this might still result in some data inconsistency
            # which can be avoided. It doesn't seem to be worth it now, because
            # various crashes might result in all sorts of data inconsistencies
            # anyway.
            @db.transactional(retries=5)
            def tx():
                fresh_shard_state = db.get(
                    model.ShardState.get_key_by_shard_id(shard_id))
                if (not fresh_shard_state.active
                        or "worker_active_state_collision"
                        in _TEST_INJECTED_FAULTS):
                    shard_state.active = False
                    logging.error(
                        "Spurious task execution. Aborting the shard.")
                    return
                fresh_shard_state.copy_from(shard_state)
                fresh_shard_state.put(config=config)

            tx()
        finally:
            context.Context._set(None)
            if quota_consumer:
                quota_consumer.dispose()

        # Rescheduling work should always be the last statement. It shouldn't happen
        # if there were any exceptions in code before it.
        if shard_state.active:
            self.reschedule(shard_state, tstate)
        gc.collect()