Example #1
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob", [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
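
Both tests above repeat the same setup: encode each (key, value) pair as a file_service_pb.KeyValue proto, append the encoded records to a blobstore file through the Files API, finalize the file, and then swap the writable name for the blob-backed readable name. A small helper distilled from that pattern might look like the sketch below (the helper name is ours; the calls are exactly the ones used in the tests):

def write_key_value_records(pairs):
    """Write (key, value) pairs as KeyValue records; return a readable file name."""
    writable_name = files.blobstore.create()
    with files.open(writable_name, "a") as f:
        with records.RecordsWriter(f) as w:
            for key, value in pairs:
                proto = file_service_pb.KeyValue()
                proto.set_key(key)
                proto.set_value(value)
                w.write(proto.Encode())
    files.finalize(writable_name)
    # Only the blob-backed name can be opened for reading after finalization.
    return files.blobstore.get_file_name(
        files.blobstore.get_blob_key(writable_name))
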
Example #3
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
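
Reading results back is the mirror image of the setup: open each returned file name for reading, iterate the framed records with records.RecordsReader, and parse every record into a KeyValue proto again, as the output loop above does. A helper in the same spirit (name ours) could be:

def read_key_value_records(file_name):
    """Yield (key, value) tuples from a file of encoded KeyValue records."""
    with files.open(file_name, "r") as f:
        for binary_record in records.RecordsReader(f):
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            yield (proto.key(), proto.value())
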
Example #6
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(
                files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ('1', ['a'], True),
                ('1', ['a'], True),
                ('1', ['a'], False),
                ('2', ['b'], True),
                ('2', ['b'], True),
                ('2', ['b'], False),
                ('3', ['c'], True),
                ('3', ['c'], True),
                ('3', ['c'], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
Example #7
    def testNoCombiner(self):
        """Test running with low values count but without combiner."""
        # Even though this test doesn't have a combiner specified, it's still
        # interesting to run: it forces MergePipeline to produce partial
        # key values, and we verify that they are combined correctly in the reader.

        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_combiner_map",
            __name__ + ".test_combiner_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
            mapper_params={"entity_kind": __name__ + ".TestEntity"},
            shards=4,
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEquals(1, len(p.outputs.default.value))
        output_file = p.outputs.default.value[0]

        file_content = []
        with files.open(output_file, "r") as f:
            file_content = sorted(f.read(10000000).strip().split("\n"))

        self.assertEquals(["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"], file_content)
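
test_combiner_map and test_combiner_reduce are defined elsewhere in the test module. Implementations consistent with the expected output above (keys '0'..'3', each paired with the doubled sum of the entity values in that residue class) could plausibly look like the following; treat this as a reconstruction under that assumption, not as the original code:

def test_combiner_map(entity):
    """Plausible mapper: bucket each TestEntity by its value modulo 4."""
    value = int(entity.data)
    yield (str(value % 4), str(value))


def test_combiner_reduce(key, values):
    """Plausible reducer: emit one '(key, sum)' line per key."""
    yield str((key, sum(int(v) for v in values))) + "\n"
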
Example #8
    def testLotsOfValuesForSingleKey(self):
        TestEntity(data=str(1)).put()
        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".map_yield_lots_of_values",
            __name__ + ".reduce_length",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".BlobstoreRecordsOutputWriter",
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        output_data = []
        for output_file in p.outputs.default.value:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        expected_data = ["('1', 50000)"]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)
  def testHugeTaskUseDatastore(self):
    """Test map job with huge parameter values."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file,
            # the parameter can't be compressed and wouldn't fit into
            # taskqueue payload
            "huge_parameter": random_string(900000)
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
    self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
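
random_string is another module-level helper that this snippet assumes; anything that produces the requested number of effectively incompressible characters works, for example (an assumption, not the original helper):

import random
import string


def random_string(length):
    """Return `length` random characters; random data defeats payload compression."""
    return "".join(random.choice(string.ascii_letters + string.digits)
                   for _ in range(length))
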
Example #10
  def testRecordsReader(self):
    """End-to-end test for records reader."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
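
TestHandler is the mapper that the job registers by name, and the assertion only checks that it saw all 100 records. A minimal handler consistent with that (keeping the processed_entites spelling the assertions use) might be:

class TestHandler(object):
    """Collects every record it is called with so tests can count invocations."""

    processed_entites = []  # spelling kept to match the assertions above

    def __call__(self, record):
        TestHandler.processed_entites.append(record)
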
Example #11
  def flush(self):
    """Flush pool contents."""
    # Write data to in-memory buffer first.
    buf = _StringWriter()
    with records.RecordsWriter(buf) as w:
      for record in self._buffer:
        w.write(record)

    str_buf = buf.to_string()
    if not self._exclusive and len(str_buf) > _FILES_API_MAX_SIZE:
      # Shouldn't really happen because of flush size.
      raise errors.Error(
          "Buffer too big. Can't write more than %s bytes in one request: "
          "risk of writes interleaving. Got: %s" %
          (_FILES_API_MAX_SIZE, len(str_buf)))

    # Write data to file.
    start_time = time.time()
    with files.open(self._filename, "a", exclusive_lock=self._exclusive) as f:
      f.write(str_buf)
      if self._ctx:
        operation.counters.Increment(
            COUNTER_IO_WRITE_BYTES, len(str_buf))(self._ctx)
    if self._ctx:
      operation.counters.Increment(
          COUNTER_IO_WRITE_MSEC,
          int((time.time() - start_time) * 1000))(self._ctx)

    # reset buffer
    self._buffer = []
    self._size = 0
    gc.collect()
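
_StringWriter is the small in-memory buffer that flush() frames records into before issuing a single Files API write. Its definition is not shown here; something with just write() and to_string(), as below, is all RecordsWriter needs (an assumption about its shape, not the original class):

import StringIO


class _StringWriter(object):
    """Minimal file-like buffer: collects write() calls, returns them as one string."""

    def __init__(self):
        self._buf = StringIO.StringIO()

    def write(self, data):
        self._buf.write(data)

    def to_string(self):
        return self._buf.getvalue()
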
Example #12
  def testLotsOfValuesForSingleKey(self):
    TestEntity(data=str(1)).put()
    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".map_yield_lots_of_values",
        __name__ + ".reduce_length",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=
            output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
        mapper_params= {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    output_data = []
    for output_file in p.outputs.default.value:
      with files.open(output_file, "r") as f:
        for record in records.RecordsReader(f):
          output_data.append(record)

    expected_data = ["('1', 50000)"]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)
  def testSingleShard(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(1, len(filenames))
    blob_name = filenames[0]
    self.assertTrue(blob_name.startswith("/blobstore/"))
    self.assertFalse(blob_name.startswith("/blobstore/writable:"))

    with files.open(blob_name, "r") as f:
      data = f.read(10000000)
      self.assertEquals(1000, len(data.strip().split("\n")))
  def testDedicatedParams(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            "output_writer": {
                "filesystem": "gs",
                "gs_bucket_name": "bucket",
            },
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=FILE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.FileOutputWriter.get_filenames(mapreduce_state)
    self.assertEqual(1, len(filenames))
    self.assertTrue(filenames[0].startswith("/gs/bucket/"))

    with files.open(filenames[0], "r") as f:
      data = f.read(10000000)
      self.assertEquals(1000, len(data.strip().split("\n")))
Example #16
 def testAppendAndFlush(self):
     self.pool.append("a")
     self.assertEquals("", self.file_service.get_content("tempfile"))
     self.pool.append("b")
     self.assertEquals("", self.file_service.get_content("tempfile"))
     self.pool.flush()
     self.assertEquals(["a", "b"], list(records.RecordsReader(files.open("tempfile", "r"))))
Example #17
  def flush(self):
    """Flush pool contents."""
    # Write data to in-memory buffer first.
    buf = _StringWriter()
    with records.RecordsWriter(buf) as w:
      for record in self._buffer:
        w.write(record)

    str_buf = buf.to_string()
    if not self._exclusive and len(str_buf) > _FILES_API_MAX_SIZE:
      # Shouldn't really happen because of flush size.
      raise errors.Error(
          "Buffer too big. Can't write more than %s bytes in one request: "
          "risk of writes interleaving. Got: %s" %
          (_FILES_API_MAX_SIZE, len(str_buf)))

    # Write data to file.
    start_time = time.time()
    with files.open(self._filename, "a", exclusive_lock=self._exclusive) as f:
      f.write(str_buf)
      if self._ctx:
        operation.counters.Increment(
            COUNTER_IO_WRITE_BYTES, len(str_buf))(self._ctx)
    if self._ctx:
      operation.counters.Increment(
          COUNTER_IO_WRITE_MSEC,
          int((time.time() - start_time) * 1000))(self._ctx)

    # reset buffer
    self._buffer = []
    self._size = 0
    gc.collect()
    def testHugeTaskUseDatastore(self):
        """Test map job with huge parameter values."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader",
            {
                "file": input_file,
                # the parameter can't be compressed and wouldn't fit into
                # taskqueue payload
                "huge_parameter": random_string(900000)
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
        self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
  def testMultipleRequests(self):
    """Tests restoring the reader state across multiple requests."""
    input_file = files.blobstore.create()

    # Create a file with two records.
    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["a", "b"])
        proto.set_partial(True)
        w.write(proto.Encode())

        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["c", "d"])
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    # Now read the records in two attempts, serializing and recreating the
    # input reader as if it's a separate request.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    it = iter(reader)
    self.assertEquals(input_readers.ALLOW_CHECKPOINT, it.next())

    reader_state = reader.to_json()
    other_reader = mapreduce_pipeline._ReducerReader.from_json(reader_state)
    it = iter(other_reader)
    self.assertEquals(("key2", ["a", "b", "c", "d"]), it.next())
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [("1", "a"), ("2", "b"), ("3", "c")]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ("1", ["a"], True),
                ("1", ["a"], True),
                ("1", ["a"], False),
                ("2", ["b"], True),
                ("2", ["b"], True),
                ("2", ["b"], False),
                ("3", ["c"], True),
                ("3", ["c"], True),
                ("3", ["c"], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
Example #21
 def flush(self):
     """Flush pool contents."""
     for filename, data in self._append_buffer.iteritems():
         with files.open(filename, 'a') as f:
             if len(data) > self._max_size:
                  raise errors.Error("Bad data: %s" % len(data))
             f.write(data)
     self._append_buffer = {}
     self._size = 0
Example #22
 def createMockDataLine(self, data):
     file_name = "myblob_01"
     file_path = files.blobstore.create("text/plain", file_name)
     with files.open(file_path, 'a') as fp:
         fp.write(data)
     files.finalize(file_path)
     blob_key = files.blobstore.get_blob_key(file_path)
     file_name = files.blobstore.get_file_name(blob_key)
     return file_name
Example #23
 def createMockDataLine(self, data):
   file_name = "myblob_01"
   file_path = files.blobstore.create("text/plain", file_name)
   with files.open(file_path, 'a') as fp:
     fp.write(data)
   files.finalize(file_path)
   blob_key = files.blobstore.get_blob_key(file_path)
   file_name = files.blobstore.get_file_name(blob_key)
   return file_name
Example #24
 def flush(self):
   """Flush pool contents."""
   for filename, data in self._append_buffer.iteritems():
     with files.open(filename, 'a') as f:
       if len(data) > self._max_size:
          raise errors.Error("Bad data: %s" % len(data))
       f.write(data)
   self._append_buffer = {}
   self._size = 0
 def testAppendAndFlush(self):
     self.pool.append("a")
     self.assertEquals("", self.file_service.get_content("tempfile"))
     self.pool.append("b")
     self.assertEquals("", self.file_service.get_content("tempfile"))
     self.pool.flush()
     self.assertEquals(
         ["a", "b"], list(records.RecordsReader(files.open("tempfile",
                                                           "r"))))
    def testCleanup_ListOfLists(self):
        """Tests cleaning up a list of file lists."""
        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run map
        p = mapper_pipeline.MapperPipeline(
            "test",
            handler_spec=__name__ + ".test_map",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".KeyValueBlobstoreOutputWriter",
            params={
                "entity_kind": __name__ + ".TestEntity",
            },
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

        # Can open files
        file_list = finished_map.outputs.default.value
        self.assertTrue(len(file_list) > 0)
        for name in file_list:
            files.open(name, "r").read(0)

        grouped_list = [file_list]

        # Cleanup
        cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
        cleanup.start()
        test_support.execute_until_empty(self.taskqueue)

        # Cannot open files
        for name in file_list:
            self.assertRaises(files.Error, files.open, name, "r")
Example #27
  def testCleanup_ListOfLists(self):
    """Tests cleaning up a list of file lists."""
    # Prepare test data
    entity_count = 200

    for i in range(entity_count):
      TestEntity(data=str(i)).put()
      TestEntity(data=str(i)).put()

    # Run map
    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=
            output_writers.__name__ + ".KeyValueBlobstoreOutputWriter",
        params={
            "entity_kind": __name__ + ".TestEntity",
            },
        )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

    # Can open files
    file_list = finished_map.outputs.default.value
    self.assertTrue(len(file_list) > 0)
    for name in file_list:
      files.open(name, "r").read(0)

    grouped_list = [file_list]

    # Cleanup
    cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
    cleanup.start()
    test_support.execute_until_empty(self.taskqueue)

    # Cannot open files
    for name in file_list:
      self.assertRaises(files.Error, files.open, name, "r")
  def testMapReduce(self):
    # Prepare test data
    word_count = 2

    file_path = "/gs/foo/bar"
    self.createGSData(file_path, "foo bar foo bar foo bar foo")
    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".test_map",
        "googlestorage.shuffler.ShufflePipeline",
        __name__ + ".test_reduce",
        input_reader_spec="googlestorage.input_readers.GoogleStorageLineInputReader",
        output_writer_spec=
            "googlestorage.output_writers.GoogleStorageOutputWriter",
        mapper_params={"file_paths": file_path,
            "gs_bucket_name": "temp_test",
            "gs_acl": "public-read"},
        shuffler_params={"gs_bucket_name": "temp_test", 
                         "mime_type": "text/plain",
                         "gs_acl": "public-read"},
        reducer_params={"gs_bucket_name": "output_test",
                         "mime_type": "text/plain",
                         "gs_acl": "public-read",},
        shards=2)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    
    for output_file in p.outputs.default.value:
      with files.open(output_file, "r") as fp:
        buf = fp.read(1000000)
    filestream = StringIO(buf)
    output_data = filestream.read()
    outputList = output_data[:-1].split('\n')
    
    expected_data = ["foo: 4", "bar: 3"]
    expected_data.sort()
    outputList.sort()
    self.assertEquals(expected_data, outputList)

    # Verify that mapreduce doesn't leave intermediate files behind.
    blobInfos = blobstore.BlobInfo.all().fetch(limit=1000)
    for blobinfo in blobInfos:
      self.assertTrue(
          re.match(r"test-reduce-.*-output-\d+", blobinfo.filename),
          "Bad filename: %s" % blobinfo.filename)
Example #29
    def createInvalidMockData(self):
        blob_keys = []
        url = "invalidScheme://test_url.com"
        file_path = files.blobstore.create("text/plain", url)
        with files.open(file_path, 'a') as fp:
            fp.write(url)
        files.finalize(file_path)
        blob_key = files.blobstore.get_blob_key(file_path)
        file_name = files.blobstore.get_file_name(blob_key)
        blob_keys.append(str(file_name))

        return blob_keys
Example #30
  def createInvalidMockData(self):
    blob_keys = []
    url = "invalidScheme://test_url.com"
    file_path = files.blobstore.create("text/plain", url)
    with files.open(file_path, 'a') as fp:
      fp.write(url)
    files.finalize(file_path)
    blob_key = files.blobstore.get_blob_key(file_path)
    file_name = files.blobstore.get_file_name(blob_key)
    blob_keys.append(str(file_name))

    return blob_keys
 def createGSData(self, file_count, data):
   file_paths = []
   for file_number in range(file_count):
     file_path = "/gs/foo/bar%d" % file_number
      write_path = files.gs.create(file_path, mime_type='text/plain',
                                   acl='public-read')
     with files.open(write_path, 'a') as fp:
       fp.write(data)
     files.finalize(write_path)
     
     file_paths.append(file_path)
   return file_paths
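
A typical use of this helper inside a test is to seed a couple of Google Storage objects and then read one back by the same /gs/... path, which the Files API allows once the file is finalized. A short usage sketch under that assumption (and assuming the GS test stub these examples run against):

# Usage sketch inside a test method.
file_paths = self.createGSData(2, "foo bar foo")
with files.open(file_paths[0], "r") as fp:
    self.assertEquals("foo bar foo", fp.read(1000000))
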
Example #32
 def createMockData(self, url_count, shard):
   blob_keys = []
   for num in range(shard):
     file_name = "myblob_%d" % num
     urls = "\n".join(["http://test_url_%d.com" % i for i in range(url_count)])
     file_path = files.blobstore.create("text/plain", file_name)
     with files.open(file_path, 'a') as fp:
       fp.write(urls)
     files.finalize(file_path)
     blob_key = files.blobstore.get_blob_key(file_path)
     file_name = files.blobstore.get_file_name(blob_key)
     blob_keys.append(str(file_name))
   return blob_keys
Example #33
 def createMockData(self, url_count, shard):
     blob_keys = []
     for num in range(shard):
         file_name = "myblob_%d" % num
         urls = "\n".join(
             ["http://test_url_%d.com" % i for i in range(url_count)])
         file_path = files.blobstore.create("text/plain", file_name)
         with files.open(file_path, 'a') as fp:
             fp.write(urls)
         files.finalize(file_path)
         blob_key = files.blobstore.get_blob_key(file_path)
         file_name = files.blobstore.get_file_name(blob_key)
         blob_keys.append(str(file_name))
     return blob_keys
Example #34
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
Example #35
  def testMapReduce(self):
    # Prepare test data
    entity_count = 200

    for i in range(entity_count):
      TestEntity(data=str(i)).put()
      TestEntity(data=str(i)).put()

    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".test_map",
        __name__ + ".test_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=
            output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    output_data = []
    for output_file in p.outputs.default.value:
      with files.open(output_file, "r") as f:
        for record in records.RecordsReader(f):
          output_data.append(record)

    expected_data = [
        str((str(d), ["", ""])) for d in range(entity_count)]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)

    # Verify that mapreduce doesn't leave intermediate files behind.
    blobInfos = blobstore.BlobInfo.all().fetch(limit=1000)
    for blobinfo in blobInfos:
      self.assertTrue(
          re.match(r"test-reduce-.*-output-\d+", blobinfo.filename),
          "Bad filename: %s" % blobinfo.filename)
Example #37
 def flush(self):
   """Flush pool contents."""
   start_time = time.time()
   for filename, data in self._append_buffer.iteritems():
     with files.open(filename, "a") as f:
       if len(data) > self._flush_size:
         raise errors.Error("Bad data: %s" % len(data))
       if self._ctx:
         operation.counters.Increment(
             COUNTER_IO_WRITE_BYTES, len(data))(self._ctx)
       f.write(data)
   if self._ctx:
     operation.counters.Increment(
         COUNTER_IO_WRITE_MSEC,
         int((time.time() - start_time) * 1000))(self._ctx)
   self._append_buffer = {}
   self._size = 0
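
Only flush() is shown for this pool; the append side would accumulate data per filename and flush once the pool exceeds its limit. A sketch of such a counterpart, matching the buffer layout flush() iterates over (an assumption, not the original method):

  def append(self, filename, data):
    """Buffer `data` for `filename`, flushing first if the pool would grow too large."""
    if self._size + len(data) > self._flush_size:
      self.flush()
    self._append_buffer[filename] = self._append_buffer.get(filename, "") + data
    self._size += len(data)
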
Example #38
  def createMockData(self, data):
    """Create mock data for FetchContentPipeline"""
    input_file = files.blobstore.create()
    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        key = str(data[0])
        value = str(data[1])
        proto = file_service_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))
    
    return input_file
Example #39
 def flush(self):
   """Flush pool contents."""
   start_time = time.time()
   for filename, data in self._append_buffer.iteritems():
     with files.open(filename, "a") as f:
       if len(data) > self._flush_size:
          raise errors.Error("Bad data: %s" % len(data))
       if self._ctx:
         operation.counters.Increment(
             COUNTER_IO_WRITE_BYTES, len(data))(self._ctx)
       f.write(data)
   if self._ctx:
     operation.counters.Increment(
         COUNTER_IO_WRITE_MSEC,
         int((time.time() - start_time) * 1000))(self._ctx)
   self._append_buffer = {}
   self._size = 0
Example #40
    def createMockData(self, data):
        """Create mock data for FetchContentPipeline"""
        input_file = files.blobstore.create()
        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                key = str(data[0])
                value = str(data[1])
                proto = file_service_pb.KeyValue()
                proto.set_key(key)
                proto.set_value(value)
                w.write(proto.Encode())

        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        return input_file
Example #41
 def flush(self):
     """Flush pool contents."""
     start_time = time.time()
     for filename, data in self._append_buffer.iteritems():
         with files.open(filename, "a") as f:
             if len(data) > _FILES_API_MAX_SIZE:
                 raise errors.Error("Bad data of length: %s" % len(data))
             if self._ctx:
                 operation.counters.Increment(COUNTER_IO_WRITE_BYTES,
                                              len(data))(self._ctx)
             f.write(data)
     if self._ctx:
         operation.counters.Increment(
             COUNTER_IO_WRITE_MSEC, int(
                 (time.time() - start_time) * 1000))(self._ctx)
     self._append_buffer = {}
     self._size = 0
    def testCombiner(self):
        """Test running with low values count but with combiner."""
        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_combiner_map",
            __name__ + ".test_combiner_reduce",
            combiner_spec=__name__ + ".TestCombiner",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".BlobstoreOutputWriter",
            mapper_params={
                "entity_kind": __name__ + ".TestEntity",
            },
            shards=4)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEquals(1, len(p.outputs.default.value))
        output_file = p.outputs.default.value[0]

        file_content = []
        with files.open(output_file, "r") as f:
            file_content = sorted(f.read(10000000).strip().split("\n"))
        self.assertEquals(
            ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
            file_content)

        self.assertTrue(TestCombiner.invocations)

        for invocation in TestCombiner.invocations:
            key = invocation[0]
            values = invocation[1]
            combiner_values = invocation[2]
            self.assertTrue(key)
            self.assertTrue(values)
            self.assertEquals(1, len(values))
            self.assertTrue(int(values[0]) % 4 == int(key))
Example #43
    def testCombiner(self):
        """Test running with low values count but with combiner."""
        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_combiner_map",
            __name__ + ".test_combiner_reduce",
            combiner_spec=__name__ + ".TestCombiner",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
            mapper_params={"entity_kind": __name__ + ".TestEntity"},
            shards=4,
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEquals(1, len(p.outputs.default.value))
        output_file = p.outputs.default.value[0]

        file_content = []
        with files.open(output_file, "r") as f:
            file_content = sorted(f.read(10000000).strip().split("\n"))
        self.assertEquals(["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"], file_content)

        self.assertTrue(TestCombiner.invocations)

        for invocation in TestCombiner.invocations:
            key = invocation[0]
            values = invocation[1]
            combiner_values = invocation[2]
            self.assertTrue(key)
            self.assertTrue(values)
            self.assertEquals(1, len(values))
            self.assertTrue(int(values[0]) % 4 == int(key))
Example #44
 def requestCrawlerOutput(self):
     logging.debug("Request Crawler Output")

     file_content = 'DEFAULT_CONTENT'
     url = "http://web.ist.utl.pt/ist163512/crawler.txt"
     result = urllib2.urlopen(url)
     file_content = result.read()

     file_name = files.blobstore.create(mime_type='application/octet-stream')
     self.crawler_file_name = file_name

     with files.open(file_name, 'a') as f:
         f.write("%s" % file_content)

     files.finalize(file_name)
     logging.debug("File saved successfully")
     key = files.blobstore.get_blob_key(file_name)
     return key
  def testMultipleShards(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_sharding": "input",
            "filesystem": "gs",
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(4, len(filenames))

    file_lengths = []
    for filename in filenames:
      self.assertTrue(filename.startswith("/blobstore/"))
      self.assertFalse(filename.startswith("/blobstore/writable:"))

      with files.open(filename, "r") as f:
        data = f.read(10000000)
        file_lengths.append(len(data.strip().split("\n")))

    # These exact counts look arbitrary, but they depend only on our sharding,
    # which is deterministic, so they are reproducible.
    expected_lengths = [199, 210, 275, 316]
    self.assertEqual(1000, sum(expected_lengths))
    self.assertEquals(expected_lengths, file_lengths)
  def testMultipleShards(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_sharding": "input",
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(4, len(filenames))

    file_lengths = []
    for filename in filenames:
      self.assertTrue(filename.startswith("/blobstore/"))
      self.assertFalse(filename.startswith("/blobstore/writable:"))

      with files.open(filename, "r") as f:
        data = f.read(10000000)
        file_lengths.append(len(data.strip().split("\n")))

    # These exact counts look arbitrary, but they depend only on our sharding,
    # which is deterministic, so they are reproducible.
    expected_lengths = [199, 210, 275, 316]
    self.assertEqual(1000, sum(expected_lengths))
    self.assertEquals(expected_lengths, file_lengths)
    def testRecordsReader(self):
        """End-to-end test for records reader."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader", {"file": input_file},
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
Example #48
  def get(self):

    filekey = self.request.get("filekey")
    
    #key = ManageCrawlerOutput.requestCrawlerOutput(self)
#    str_key = str(key)

    self.response.headers['Content-Type'] = 'text/plain'
    
    query = db.GqlQuery("SELECT * FROM WebSiteInfo")
    
    file_name = files.blobstore.create(mime_type='application/octet-stream')

    with files.open(file_name, 'a') as f:              
        for q in query:
            w = q
            title = str(w.title)
            title = re.sub(r" ", "", title)
            #title.replace(" ","-")
            self.response.out.write("parsed title %s\n" % title)
            f.write("%s %s\n" % (title,w.siteLinks))

    files.finalize(file_name)
    logging.debug("File saved successfully")
    
    key = files.blobstore.get_blob_key(file_name)

#    info = blobstore.get(key)
#    reader = info.open()
#    file_content = reader.read(501900)
#    self.response.out.write("\n\n")
#    self.response.out.write("%s" % file_content)


    pipeline = PageRankPipeline(filekey, str(key))      
    pipeline.start()
    
    self.redirect(pipeline.base_path + "/status?root=" + pipeline.pipeline_id)
    def testNoCombiner(self):
        """Test running with low values count but without combiner."""
        # Even though this test doesn't have a combiner specified, it's still
        # interesting to run: it forces MergePipeline to produce partial
        # key values, and we verify that they are combined correctly in the reader.

        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_combiner_map",
            __name__ + ".test_combiner_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".BlobstoreOutputWriter",
            mapper_params={
                "entity_kind": __name__ + ".TestEntity",
            },
            shards=4)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEquals(1, len(p.outputs.default.value))
        output_file = p.outputs.default.value[0]

        file_content = []
        with files.open(output_file, "r") as f:
            file_content = sorted(f.read(10000000).strip().split("\n"))

        self.assertEquals(
            ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
            file_content)
  def next(self):
    """Returns the next input from as an (offset, line) tuple."""
    self._has_iterated = True

    if not self._filestream:
      with files.open(self._file_path, 'r') as fp:
        value = fp.read()
      self._filestream = StringIO(value)
      if self._start_position:
        self._filestream.seek(self._start_position)
        self._filestream.readline()
    
    start_position = self._filestream.tell()

    if start_position > self._end_position:
      self.stopIteration()
      
    line = self._filestream.readline()
    
    if not line:
      self.stopIteration()
      
    return start_position, line.rstrip("\n")
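
next() leans on a stopIteration() helper and on the reader being its own iterator; the missing pieces, with names taken from the call sites above and bodies assumed, would look roughly like:

  def __iter__(self):
    """The reader acts as its own iterator."""
    return self

  def stopIteration(self):
    """Close the in-memory stream and signal the end of this shard."""
    if self._filestream:
      self._filestream.close()
      self._filestream = None
    raise StopIteration()
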
  def testSingleRequest(self):
    """Tests when a key can be handled during a single request."""
    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        # First record is full
        proto = file_service_pb.KeyValues()
        proto.set_key("key1")
        proto.value_list().extend(["a", "b"])
        w.write(proto.Encode())
        # Second record is partial
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["a", "b"])
        proto.set_partial(True)
        w.write(proto.Encode())
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["c", "d"])
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    self.assertEquals(
        [("key1", ["a", "b"]),
         input_readers.ALLOW_CHECKPOINT,
         ("key2", ["a", "b", "c", "d"])],
        list(reader))

    # now test state serialization
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    self.assertEquals(
        {"position": 0,
         "current_values": "Ti4=",
         "current_key": "Ti4=",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    self.assertEquals(
        {"position": 19,
         "current_values": "Ti4=",
         "current_key": "Ti4=",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    self.assertEquals(
        {"position": 40,
         "current_values": "KGxwMApTJ2EnCnAxCmFTJ2InCnAyCmEu",
         "current_key": "UydrZXkyJwpwMAou",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    self.assertEquals(
        {"position": 59,
         "current_values": "Ti4=",
         "current_key": "Ti4=",
         "filenames": [input_file]},
        reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass

    # now do test deserialization at every moment.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass
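
The opaque current_key and current_values strings in the expected reader state above are just base64-wrapped pickles: "Ti4=" decodes to pickle.dumps(None), and the longer strings decode to the pickled 'key2' and ['a', 'b']. A quick standard-library check:

import base64
import pickle

print pickle.loads(base64.b64decode("Ti4="))                              # None
print pickle.loads(base64.b64decode("UydrZXkyJwpwMAou"))                  # 'key2'
print pickle.loads(base64.b64decode("KGxwMApTJ2EnCnAxCmFTJ2InCnAyCmEu"))  # ['a', 'b']
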
Example #52
  def testReadPartial(self):
    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        # First record is full
        proto = file_service_pb.KeyValues()
        proto.set_key("key1")
        proto.value_list().extend(["a", "b"])
        w.write(proto.Encode())
        # Second record is partial
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["a", "b"])
        proto.set_partial(True)
        w.write(proto.Encode())
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["c", "d"])
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    self.assertEquals(
        [("key1", ["a", "b"]),
         input_readers.ALLOW_CHECKPOINT,
         ("key2", ["a", "b", "c", "d"])],
        list(reader))

    # now test state serialization
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    self.assertEquals(
        {"position": 0,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    self.assertEquals(
        {"position": 19,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    self.assertEquals(
        {"position": 40,
         "current_values": ["a", "b"],
         "current_key": "key2",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    self.assertEquals(
        {"position": 59,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass

    # now do test deserialization at every moment.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass
 def createGSData(self, file_path, data):
   write_path = files.gs.create(file_path, mime_type='text/plain',
                                acl='public-read')
   with files.open(write_path, 'a') as fp:
     fp.write(data)
   files.finalize(write_path)