    def testHugeTaskUseDatastore(self):
        """Test map job with huge parameter values."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader",
            {
                "file": input_file,
                # the parameter can't be compressed and wouldn't fit into
                # taskqueue payload
                "huge_parameter": random_string(900000)
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
        self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
  def testRecordsReader(self):
    """End-to-end test for records reader."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
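
The examples in this listing all repeat the same blobstore file lifecycle: create a writable file, append records, finalize it, and then exchange the writable name for the durable blobstore file name via the blob key. Below is a minimal sketch of that round trip; the two import paths are assumptions (they vary across SDK and mapreduce-library versions), while every call is one that already appears in the examples.

from google.appengine.api import files  # assumed import path
from mapreduce.lib import records       # assumed import path


def write_records(payloads):
  """Writes each payload as one record and returns the finalized file name."""
  writable_name = files.blobstore.create()
  with files.open(writable_name, "a") as f:
    with records.RecordsWriter(f) as w:
      for payload in payloads:
        w.write(payload)
  files.finalize(writable_name)  # no further writes are possible after this
  # The writable name is only useful while writing; look up the durable
  # blobstore file name through the blob key, as every example here does.
  return files.blobstore.get_file_name(
      files.blobstore.get_blob_key(writable_name))


def read_records(file_name):
  """Reads all records back from a finalized blobstore file."""
  with files.open(file_name, "r") as f:
    return list(records.RecordsReader(f))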
Example No. 3
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
  def testMultipleRequests(self):
    """Tests restoring the reader state across multiple requests."""
    input_file = files.blobstore.create()

    # Create a file with two records.
    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["a", "b"])
        proto.set_partial(True)
        w.write(proto.Encode())

        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["c", "d"])
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    # Now read the records in two attempts, serializing and recreating the
    # input reader as if it's a separate request.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    it = iter(reader)
    self.assertEquals(input_readers.ALLOW_CHECKPOINT, it.next())

    reader_state = reader.to_json()
    other_reader = mapreduce_pipeline._ReducerReader.from_json(reader_state)
    it = iter(other_reader)  # iterate the restored reader, not the original
    self.assertEquals(("key2", ["a", "b", "c", "d"]), it.next())
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob", [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
Example No. 8
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
  def testHugeTaskUseDatastore(self):
    """Test map job with huge parameter values."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file,
            # the parameter can't be compressed and wouldn't fit into
            # taskqueue payload
            "huge_parameter": random_string(900000)
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
    self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
Example No. 10
 def createMockDataLine(self, data):
   file_name = "myblob_01"
   file_path = files.blobstore.create("text/plain", file_name)
   with files.open(file_path, 'a') as fp:
     fp.write(data)
   files.finalize(file_path)
   blob_key = files.blobstore.get_blob_key(file_path)
   file_name = files.blobstore.get_file_name(blob_key)
   return file_name
Example No. 11
 def createMockDataLine(self, data):
     file_name = "myblob_01"
     file_path = files.blobstore.create("text/plain", file_name)
     with files.open(file_path, 'a') as fp:
         fp.write(data)
     files.finalize(file_path)
     blob_key = files.blobstore.get_blob_key(file_path)
     file_name = files.blobstore.get_file_name(blob_key)
     return file_name
Example No. 12
    def finalize(self, ctx, shard_number):
        """Finalize writer shard-level state.

        Args:
          ctx: an instance of context.Context.
          shard_number: shard number as integer.
        """
        finalized_filenames = []
        for filename in self._filenames:
            files.finalize(filename)
Example No. 13
    def finalize(self, ctx, shard_number):
        """Finalize writer shard-level state.

        Args:
          ctx: an instance of context.Context.
          shard_number: shard number as integer.
        """
        finalized_filenames = []
        for filename in self._filenames:
            files.finalize(filename)
 def createGSData(self, file_count, data):
   file_paths = []
   for file_number in range(file_count):
     file_path = "/gs/foo/bar%d" % file_number
      write_path = files.gs.create(
          file_path, mime_type='text/plain', acl='public-read')
     with files.open(write_path, 'a') as fp:
       fp.write(data)
     files.finalize(write_path)
     
     file_paths.append(file_path)
   return file_paths
Example No. 15
    def createInvalidMockData(self):
        blob_keys = []
        url = "invalidScheme://test_url.com"
        file_path = files.blobstore.create("text/plain", url)
        with files.open(file_path, 'a') as fp:
            fp.write(url)
        files.finalize(file_path)
        blob_key = files.blobstore.get_blob_key(file_path)
        file_name = files.blobstore.get_file_name(blob_key)
        blob_keys.append(str(file_name))

        return blob_keys
Example No. 16
  def createInvalidMockData(self):
    blob_keys = []
    url = "invalidScheme://test_url.com"
    file_path = files.blobstore.create("text/plain", url)
    with files.open(file_path, 'a') as fp:
      fp.write(url)
    files.finalize(file_path)
    blob_key = files.blobstore.get_blob_key(file_path)
    file_name = files.blobstore.get_file_name(blob_key)
    blob_keys.append(str(file_name))

    return blob_keys
Example No. 17
 def createMockData(self, url_count, shard):
   blob_keys = []
   for num in range(shard):
     file_name = "myblob_%d" % num
     urls = "\n".join(["http://test_url_%d.com" % i for i in range(url_count)])
     file_path = files.blobstore.create("text/plain", file_name)
     with files.open(file_path, 'a') as fp:
       fp.write(urls)
     files.finalize(file_path)
     blob_key = files.blobstore.get_blob_key(file_path)
     file_name = files.blobstore.get_file_name(blob_key)
     blob_keys.append(str(file_name))
   return blob_keys
Example No. 18
    def finalize_job(cls, mapreduce_state):
        """Finalize job-level writer state.

        Args:
          mapreduce_state: an instance of model.MapreduceState describing
            current job.
        """
        state = BlobstoreOutputWriter._State.from_json(
            mapreduce_state.writer_state)
        files.finalize(state.filename)
        state.filename = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(state.filename))
        mapreduce_state.writer_state = state.to_json()
Example No. 19
  def finalize_job(cls, mapreduce_state):
    """Finalize job-level writer state.

    Args:
      mapreduce_state: an instance of model.MapreduceState describing current
      job.
    """
    state = BlobstoreOutputWriter._State.from_json(
        mapreduce_state.writer_state)
    files.finalize(state.filename)
    state.filename = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(state.filename))
    mapreduce_state.writer_state = state.to_json()
Example No. 20
 def createMockData(self, url_count, shard):
     blob_keys = []
     for num in range(shard):
         file_name = "myblob_%d" % num
         urls = "\n".join(
             ["http://test_url_%d.com" % i for i in range(url_count)])
         file_path = files.blobstore.create("text/plain", file_name)
         with files.open(file_path, 'a') as fp:
             fp.write(urls)
         files.finalize(file_path)
         blob_key = files.blobstore.get_blob_key(file_path)
         file_name = files.blobstore.get_file_name(blob_key)
         blob_keys.append(str(file_name))
     return blob_keys
Example No. 21
  def finalize_job(cls, mapreduce_state):
    """Finalize job-level writer state.

    Args:
      mapreduce_state: an instance of model.MapreduceState describing current
        job. State can be modified during finalization.
    """
    finalized_filenames = []
    for filename in mapreduce_state.writer_state["filenames"]:
      files.finalize(filename)
      finalized_filenames.append(
          files.blobstore.get_file_name(
              files.blobstore.get_blob_key(filename)))
    mapreduce_state.writer_state = {"filenames": finalized_filenames}
Example No. 22
  def finalize(self, ctx, shard_number):
    """Finalize writer shard-level state.

    Args:
      ctx: an instance of context.Context.
      shard_number: shard number as integer.
    """
    mapreduce_spec = ctx.mapreduce_spec
    output_sharding = _get_output_sharding(mapper_spec=mapreduce_spec.mapper)
    if output_sharding == self.OUTPUT_SHARDING_INPUT_SHARDS:
      # Finalize our file because we're responsible for it.
      # Do it here and not in finalize_job to spread out finalization
      # into multiple tasks.
      files.finalize(self._filename)
Example No. 23
  def finalize_job(cls, mapreduce_state):
    """Finalize job-level writer state.

    Args:
      mapreduce_state: an instance of model.MapreduceState describing current
        job. State can be modified during finalization.
    """
    finalized_filenames = []
    for filename in mapreduce_state.writer_state["filenames"]:
      files.finalize(filename)
      finalized_filenames.append(
          files.blobstore.get_file_name(
              files.blobstore.get_blob_key(filename)))
    mapreduce_state.writer_state = {"filenames": finalized_filenames}
Example No. 24
  def callback(self, **kwargs):
    if "error" in kwargs:
      self.abort("Error from shuffle service: %s" % kwargs["error"])
      return

    output_files = self.outputs._output_files.value
    for filename in output_files:
      files.finalize(filename)

    finalized_file_names = []
    for filename in output_files:
      finalized_file_names.append(
          files.blobstore.get_file_name(
              files.blobstore.get_blob_key(filename)))
    self.complete(finalized_file_names)
Example No. 25
    def callback(self, **kwargs):
        if "error" in kwargs:
            self.abort("Error from shuffle service: %s" % kwargs["error"])
            return

        output_files = self.outputs._output_files.value
        for filename in output_files:
            files.finalize(filename)

        finalized_file_names = []
        for filename in output_files:
            finalized_file_names.append(
                files.blobstore.get_file_name(
                    files.blobstore.get_blob_key(filename)))
        self.complete(finalized_file_names)
Example No. 26
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(
                files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ('1', ['a'], True),
                ('1', ['a'], True),
                ('1', ['a'], False),
                ('2', ['b'], True),
                ('2', ['b'], True),
                ('2', ['b'], False),
                ('3', ['c'], True),
                ('3', ['c'], True),
                ('3', ['c'], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [("1", "a"), ("2", "b"), ("3", "c")]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ("1", ["a"], True),
                ("1", ["a"], True),
                ("1", ["a"], False),
                ("2", ["b"], True),
                ("2", ["b"], True),
                ("2", ["b"], False),
                ("3", ["c"], True),
                ("3", ["c"], True),
                ("3", ["c"], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
Example No. 28
  def createMockData(self, data):
    """Create mock data for FetchContentPipeline"""
    input_file = files.blobstore.create()
    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        key = str(data[0])
        value = str(data[1])
        proto = file_service_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))
    
    return input_file
Example No. 29
    def createMockData(self, data):
        """Create mock data for FetchContentPipeline"""
        input_file = files.blobstore.create()
        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                key = str(data[0])
                value = str(data[1])
                proto = file_service_pb.KeyValue()
                proto.set_key(key)
                proto.set_value(value)
                w.write(proto.Encode())

        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        return input_file
Example No. 30
def _sort_records(records):
    """Map function sorting records.

    Converts records to KeyValue protos, sorts them by key and writes them
    into new blobstore file. Creates _OutputFile entity to record resulting
    file name.

    Args:
      records: list of records which are serialized KeyValue protos.
    """
    ctx = context.get()
    l = len(records)
    proto_records = [None] * l

    # TODO(user): demote these log statements.
    logging.info("parsing")
    for i in range(l):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(records[i])
        proto_records[i] = proto

    logging.info("sorting")
    proto_records.sort(cmp=_compare_keys)

    logging.info("writing")
    blob_file_name = (ctx.mapreduce_spec.name + "-" +
                      ctx.mapreduce_id + "-output")
    output_path = files.blobstore.create(
        _blobinfo_uploaded_filename=blob_file_name)
    with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
        for proto in proto_records:
            pool.append(proto.Encode())

    logging.info("finalizing")
    files.finalize(output_path)
    output_path = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(output_path))

    entity = _OutputFile(key_name=output_path,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
Example No. 31
def _sort_records(records):
    """Map function sorting records.

    Converts records to KeyValue protos, sorts them by key and writes them
    into new blobstore file. Creates _OutputFile entity to record resulting
    file name.

    Args:
      records: list of records which are serialized KeyValue protos.
    """
    ctx = context.get()
    l = len(records)
    proto_records = [None] * l

    # TODO(user): demote these log statements.
    logging.info("parsing")
    for i in range(l):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(records[i])
        proto_records[i] = proto

    logging.info("sorting")
    proto_records.sort(cmp=_compare_keys)

    logging.info("writing")
    blob_file_name = (ctx.mapreduce_spec.name + "-" + ctx.mapreduce_id +
                      "-output")
    output_path = files.blobstore.create(
        _blobinfo_uploaded_filename=blob_file_name)
    with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
        for proto in proto_records:
            pool.append(proto.Encode())

    logging.info("finalizing")
    files.finalize(output_path)
    output_path = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(output_path))

    entity = _OutputFile(key_name=output_path,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
Example No. 32
 def requestCrawlerOutput(self):
     logging.debug("Request Crawler Output")
     
     url = "http://web.ist.utl.pt/ist163512/crawler.txt"
     result = urllib2.urlopen(url)
     file_content = result.read()
     
     file_name = files.blobstore.create(mime_type='application/octet-stream')
     self.crawler_file_name = file_name
     
     with files.open(file_name, 'a') as f:
         f.write("%s" % file_content)
         
     files.finalize(file_name)
     logging.debug("File saved successfully")
     key = files.blobstore.get_blob_key(file_name)
     return key
Example No. 33
    def finalize_job(cls, mapreduce_state):
        """Finalize job-level writer state.

        Args:
          mapreduce_state: an instance of model.MapreduceState describing
            current job.
        """
        state = cls._State.from_json(mapreduce_state.writer_state)

        output_sharding = _get_output_sharding(mapreduce_state=mapreduce_state)

        finalized_filenames = []
        for filename in state.filenames:
            if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
                files.finalize(filename)
            finalized_filenames.append(
                files.blobstore.get_file_name(
                    files.blobstore.get_blob_key(filename)))

        state.filenames = finalized_filenames
        mapreduce_state.writer_state = state.to_json()
Example No. 34
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
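
_compare_keys is called above (key_records.sort(cmp=_compare_keys)) but not included in these excerpts. Assuming it simply orders the (key, serialized record) tuples by key with a Python 2 cmp-style comparator, it could look like the sketch below; this is an illustration of the assumed contract, not the library's actual code. (The older _sort_records variant sorts KeyValue protos directly, so its comparator would compare proto.key() values instead.)

def _compare_keys(key_record1, key_record2):
  # Hypothetical sketch: order (key, serialized KeyValue record) tuples by
  # their key, matching the contract assumed by
  # key_records.sort(cmp=_compare_keys) above.
  return cmp(key_record1[0], key_record2[0])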
Example No. 35
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
Example No. 36
  def get(self):

    filekey = self.request.get("filekey")
    
    #key = ManageCrawlerOutput.requestCrawlerOutput(self)
#    str_key = str(key)

    self.response.headers['Content-Type'] = 'text/plain'
    
    query = db.GqlQuery("SELECT * FROM WebSiteInfo")
    
    file_name = files.blobstore.create(mime_type='application/octet-stream')

    with files.open(file_name, 'a') as f:
        for q in query:
            w = q
            title = str(w.title)
            title = re.sub(r" ", "", title)
            #title.replace(" ","-")
            self.response.out.write("parsed title %s\n" % title)
            f.write("%s %s\n" % (title,w.siteLinks))

    files.finalize(file_name)
    logging.debug("File saved successfully")
    
    key = files.blobstore.get_blob_key(file_name)

#    info = blobstore.get(key)
#    reader = info.open()
#    file_content = reader.read(501900)
#    self.response.out.write("\n\n")
#    self.response.out.write("%s" % file_content)


    pipeline = PageRankPipeline(filekey, str(key))      
    pipeline.start()
    
    self.redirect(pipeline.base_path + "/status?root=" + pipeline.pipeline_id)
Example No. 37
  def finalize_job(cls, mapreduce_state):
    """Finalize job-level writer state.

    Args:
      mapreduce_state: an instance of model.MapreduceState describing current
      job.
    """
    state = cls._State.from_json(mapreduce_state.writer_state)
    output_sharding = cls._get_output_sharding(mapreduce_state=mapreduce_state)
    filesystem = cls._get_filesystem(mapreduce_state.mapreduce_spec.mapper)
    finalized_filenames = []
    for create_filename, request_filename in itertools.izip(
        state.filenames, state.request_filenames):
      if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
        files.finalize(create_filename)
      finalized_filenames.append(cls._get_finalized_filename(filesystem,
                                                             create_filename,
                                                             request_filename))

    state.filenames = finalized_filenames
    state.request_filenames = []
    mapreduce_state.writer_state = state.to_json()
Example No. 38
  def finalize_job(cls, mapreduce_state):
    """Finalize job-level writer state.

    Args:
      mapreduce_state: an instance of model.MapreduceState describing current
      job.
    """
    state = cls._State.from_json(
        mapreduce_state.writer_state)

    output_sharding = _get_output_sharding(mapreduce_state=mapreduce_state)

    finalized_filenames = []
    for filename in state.filenames:
      if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
        files.finalize(filename)
      finalized_filenames.append(
          files.blobstore.get_file_name(
              files.blobstore.get_blob_key(filename)))

    state.filenames = finalized_filenames
    mapreduce_state.writer_state = state.to_json()
    def testRecordsReader(self):
        """End-to-end test for records reader."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader", {"file": input_file},
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
Example No. 40
    def finalize_job(cls, mapreduce_state):
        """Finalize job-level writer state.

        Args:
          mapreduce_state: an instance of model.MapreduceState describing
            current job.
        """
        state = cls._State.from_json(mapreduce_state.writer_state)
        output_sharding = cls._get_output_sharding(
            mapreduce_state=mapreduce_state)
        filesystem = cls._get_filesystem(mapreduce_state.mapreduce_spec.mapper)
        finalized_filenames = []
        for create_filename, request_filename in itertools.izip(
                state.filenames, state.request_filenames):
            if output_sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS:
                files.finalize(create_filename)
            finalized_filenames.append(
                cls._get_finalized_filename(filesystem, create_filename,
                                            request_filename))

        state.filenames = finalized_filenames
        state.request_filenames = []
        mapreduce_state.writer_state = state.to_json()
Example No. 41
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
 def createGSData(self, file_path, data):
   write_path = files.gs.create(
       file_path, mime_type='text/plain', acl='public-read')
   with files.open(write_path, 'a') as fp:
     fp.write(data)
   files.finalize(write_path)
Example No. 43
  def testReadPartial(self):
    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        # First record is full
        proto = file_service_pb.KeyValues()
        proto.set_key("key1")
        proto.value_list().extend(["a", "b"])
        w.write(proto.Encode())
        # Second record is partial
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["a", "b"])
        proto.set_partial(True)
        w.write(proto.Encode())
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["c", "d"])
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    self.assertEquals(
        [("key1", ["a", "b"]),
         input_readers.ALLOW_CHECKPOINT,
         ("key2", ["a", "b", "c", "d"])],
        list(reader))

    # now test state serialization
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    self.assertEquals(
        {"position": 0,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    self.assertEquals(
        {"position": 19,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    self.assertEquals(
        {"position": 40,
         "current_values": ["a", "b"],
         "current_key": "key2",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    self.assertEquals(
        {"position": 59,
         "current_values": None,
         "current_key": None,
         "filenames": [input_file]},
        reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass

    # now do test deserialization at every moment.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass
  def testSingleRequest(self):
    """Tests when a key can be handled during a single request."""
    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        # First record is full
        proto = file_service_pb.KeyValues()
        proto.set_key("key1")
        proto.value_list().extend(["a", "b"])
        w.write(proto.Encode())
        # Second record is partial
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["a", "b"])
        proto.set_partial(True)
        w.write(proto.Encode())
        proto = file_service_pb.KeyValues()
        proto.set_key("key2")
        proto.value_list().extend(["c", "d"])
        w.write(proto.Encode())

    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    self.assertEquals(
        [("key1", ["a", "b"]),
         input_readers.ALLOW_CHECKPOINT,
         ("key2", ["a", "b", "c", "d"])],
        list(reader))

    # now test state serialization
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    self.assertEquals(
        {"position": 0,
         "current_values": "Ti4=",
         "current_key": "Ti4=",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    self.assertEquals(
        {"position": 19,
         "current_values": "Ti4=",
         "current_key": "Ti4=",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    self.assertEquals(
        {"position": 40,
         "current_values": "KGxwMApTJ2EnCnAxCmFTJ2InCnAyCmEu",
         "current_key": "UydrZXkyJwpwMAou",
         "filenames": [input_file]},
        reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    self.assertEquals(
        {"position": 59,
         "current_values": "Ti4=",
         "current_key": "Ti4=",
         "filenames": [input_file]},
        reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass

    # now do test deserialization at every moment.
    reader = mapreduce_pipeline._ReducerReader([input_file], 0)
    i = reader.__iter__()
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key1", ["a", "b"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
    reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())

    try:
      i.next()
      self.fail("Exception expected")
    except StopIteration:
      # expected
      pass
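
The opaque current_key and current_values strings in the serialized reader state above ("Ti4=", "UydrZXkyJwpwMAou", ...) appear to be base64-encoded Python 2 protocol-0 pickles of the current key and value list. A quick check, assuming a Python 2 interpreter:

import base64
import pickle

# "Ti4=" decodes to 'N.', the protocol-0 pickle of None.
assert pickle.loads(base64.b64decode("Ti4=")) is None
assert pickle.loads(base64.b64decode("UydrZXkyJwpwMAou")) == "key2"
assert pickle.loads(base64.b64decode("KGxwMApTJ2EnCnAxCmFTJ2InCnAyCmEu")) == ["a", "b"]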