Code Example #1
  def run(self,
          job_name,
          mapper_spec,
          reducer_spec,
          input_reader_spec,
          output_writer_spec=None,
          mapper_params=None,
          reducer_params=None,
          shards=None,
          combiner_spec=None):


    if mapper_params.get("bucket_name") is None:
      try:
        mapper_params["bucket_name"] = (
            app_identity.get_default_gcs_bucket_name())
      except Exception as e:
        raise errors.Error("Unable to get the GCS default bucket name. "
                           "Check to see that GCS is properly activated. "
                           + str(e))
    if mapper_params["bucket_name"] is None:
      raise errors.Error("There is no GCS default bucket name. "
                         "Check to see that GCS is properly activated.")


    map_pipeline = yield MapPipeline(job_name,
                                     mapper_spec,
                                     input_reader_spec,
                                     params=mapper_params,
                                     shards=shards)
    shuffler_pipeline = yield ShufflePipeline(
        job_name, mapper_params, map_pipeline)
    reducer_pipeline = yield ReducePipeline(
        job_name,
        reducer_spec,
        output_writer_spec,
        reducer_params,
        mapper_params["bucket_name"],
        shuffler_pipeline,
        combiner_spec=combiner_spec)
    with pipeline.After(reducer_pipeline):
      all_temp_files = yield pipeline_common.Extend(
          map_pipeline, shuffler_pipeline)
      yield CleanupPipeline(all_temp_files)

    yield _ReturnPipeline(map_pipeline.result_status,
                          reducer_pipeline.result_status,
                          reducer_pipeline)
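
The yield statements above come from the App Engine Pipeline library: a pipeline's run() method is a generator that yields child pipelines, and the framework wires each child's output slots into the arguments of later children, honoring any ordering declared with pipeline.After. Below is a minimal, self-contained sketch of that composition pattern. The SquarePipeline and SumOfSquaresPipeline names are made up for illustration, and the import path may differ depending on how the Pipeline library is vendored into your project.

import pipeline
from pipeline import common as pipeline_common


class SquarePipeline(pipeline.Pipeline):
    """Hypothetical child pipeline: squares one number."""

    def run(self, number):
        return number * number


class SumOfSquaresPipeline(pipeline.Pipeline):
    """Hypothetical parent pipeline composing children, in the same style as
    MapreducePipeline.run composes MapPipeline, ShufflePipeline and ReducePipeline."""

    def run(self, *numbers):
        squares = []
        for n in numbers:
            # Each yield returns a future-like handle, not a concrete value;
            # the framework resolves it once the child pipeline completes.
            squares.append((yield SquarePipeline(n)))
        # common.Sum consumes the futures after all children have finished.
        yield pipeline_common.Sum(*squares)
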
Code Example #2
    def flush(self):
        """Flush pool contents."""

        buf = _StringWriter()
        with records.RecordsWriter(buf) as w:
            for record in self._buffer:
                w.write(record)

        str_buf = buf.to_string()
        if not self._exclusive and len(str_buf) > _FILES_API_MAX_SIZE:
            # A non-exclusive writer must keep each write within a single
            # request, otherwise concurrent writers could interleave data.
            raise errors.Error(
                "Buffer too big. Can't write more than %s bytes in one request: "
                "risk of writes interleaving. Got: %s" %
                (_FILES_API_MAX_SIZE, len(str_buf)))

        start_time = time.time()
        with files.open(self._filename, "a",
                        exclusive_lock=self._exclusive) as f:
            f.write(str_buf)
            if self._ctx:
                operation.counters.Increment(COUNTER_IO_WRITE_BYTES,
                                             len(str_buf))(self._ctx)
        if self._ctx:
            operation.counters.Increment(
                COUNTER_IO_WRITE_MSEC, int(
                    (time.time() - start_time) * 1000))(self._ctx)

        self._buffer = []
        self._size = 0
        gc.collect()
Code Example #3
    def flush(self):
        """Flush pool contents."""

        buf = cStringIO.StringIO()
        with records.RecordsWriter(buf) as w:
            for record in self._buffer:
                w.write(record)
            w._pad_block()
        str_buf = buf.getvalue()
        buf.close()

        if not self._exclusive and len(str_buf) > _FILE_POOL_MAX_SIZE:
            # A non-exclusive writer must keep each write within a single
            # request, otherwise concurrent writers could interleave data.
            raise errors.Error(
                "Buffer too big. Can't write more than %s bytes in one request: "
                "risk of writes interleaving. Got: %s" %
                (_FILE_POOL_MAX_SIZE, len(str_buf)))

        start_time = time.time()
        self._write(str_buf)
        if self._ctx:
            operation.counters.Increment(COUNTER_IO_WRITE_BYTES,
                                         len(str_buf))(self._ctx)
            operation.counters.Increment(
                COUNTER_IO_WRITE_MSEC, int(
                    (time.time() - start_time) * 1000))(self._ctx)

        self._buffer = []
        self._size = 0
        gc.collect()
Code Example #4
    def append(self, filename, data):
        """Append data to a file."""
        if self._size + len(data) > self._max_size:
            self.flush()

        if len(data) > self._max_size:
            raise errors.Error(
                "Can't write more than %s bytes in one request: "
                "risk of writes interleaving." % self._max_size)
        else:
            self.__append(filename, data)

        if self._size > self._max_size:
            self.flush()
Code Example #5
    def append(self, data):
        """Append data to a file."""
        data_length = len(data)
        if self._size + data_length > self._flush_size:
            self.flush()

        if not self._exclusive and data_length > _FILES_API_MAX_SIZE:
            raise errors.Error("Too big input %s (%s)." %
                               (data_length, _FILES_API_MAX_SIZE))
        else:
            self._buffer.append(data)
            self._size += data_length

        if self._size > self._flush_size:
            self.flush()
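
The append/flush pairs in these examples all implement the same buffering pattern: append() accumulates records in memory, flushing first if the new record would push the pool past its threshold, rejects any single record larger than one request allows, and flush() writes the accumulated bytes in one request and resets the pool. The following is a stripped-down, self-contained sketch of that pattern; the _BufferedPool name and the writer callable are hypothetical stand-ins, not part of the library's API.

class _BufferedPool(object):
    """Hypothetical minimal pool illustrating the append/flush pattern above."""

    def __init__(self, writer, flush_size=128 * 1024, max_record_size=32 * 1024):
        self._writer = writer            # callable that takes one byte string
        self._flush_size = flush_size    # flush once the pool exceeds this
        self._max_record_size = max_record_size
        self._buffer = []
        self._size = 0

    def append(self, data):
        # Flush current contents first if adding this record would exceed
        # the threshold, so a single write stays within the request limit.
        if self._size + len(data) > self._flush_size:
            self.flush()
        if len(data) > self._max_record_size:
            raise ValueError("Record of %d bytes exceeds the per-request limit"
                             % len(data))
        self._buffer.append(data)
        self._size += len(data)
        if self._size > self._flush_size:
            self.flush()

    def flush(self):
        # Write everything buffered so far in a single request, then reset.
        if self._buffer:
            self._writer("".join(self._buffer))
        self._buffer = []
        self._size = 0
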
Code Example #6
    def flush(self):
        """Flush pool contents."""
        start_time = time.time()
        for filename, data in self._append_buffer.iteritems():
            with files.open(filename, "a") as f:
                if len(data) > _FILES_API_MAX_SIZE:
                    raise errors.Error("Bad data of length: %s" % len(data))
                if self._ctx:
                    operation.counters.Increment(COUNTER_IO_WRITE_BYTES,
                                                 len(data))(self._ctx)
                f.write(data)
        if self._ctx:
            operation.counters.Increment(
                COUNTER_IO_WRITE_MSEC, int(
                    (time.time() - start_time) * 1000))(self._ctx)
        self._append_buffer = {}
        self._size = 0
Code Example #7
  def _get_output_sharding(cls, mapreduce_state=None, mapper_spec=None):
    """Get output sharding parameter value from mapreduce state or mapper spec.

    At least one of the parameters should not be None.

    Args:
      mapreduce_state: mapreduce state as model.MapreduceState.
      mapper_spec: mapper specification as model.MapperSpec.

    Returns:
      The output sharding parameter value as a lowercase string.

    Raises:
      errors.Error: if neither mapreduce_state nor mapper_spec is given.
    """
    if mapper_spec:
      return _get_params(mapper_spec).get(
          FileOutputWriterBase.OUTPUT_SHARDING_PARAM,
          FileOutputWriterBase.OUTPUT_SHARDING_NONE).lower()
    if mapreduce_state:
      mapper_spec = mapreduce_state.mapreduce_spec.mapper
      return cls._get_output_sharding(mapper_spec=mapper_spec)
    raise errors.Error("Neither mapreduce_state nor mapper_spec specified.")
Code Example #8
    def run(self,
            job_name,
            mapper_spec,
            reducer_spec,
            input_reader_spec,
            output_writer_spec=None,
            mapper_params=None,
            reducer_params=None,
            shards=None,
            combiner_spec=None):

        if mapper_params.get("bucket_name") is None:
            try:
                mapper_params["bucket_name"] = (
                    app_identity.get_default_gcs_bucket_name())
            except Exception as e:
                raise errors.Error(
                    "Unable to get the GCS default bucket name. "
                    "Check to see that GCS is properly activated. " + str(e))
Code Example #9
    def append(self, filename, data):
        """Append data to a file.

        Args:
          filename: the name of the file as string.
          data: data as string.
        """
        if self._size + len(data) > self._flush_size:
            self.flush()

        if len(data) > _FILES_API_MAX_SIZE:
            raise errors.Error(
                "Can't write more than %s bytes in one request: "
                "risk of writes interleaving." % _FILES_API_MAX_SIZE)
        else:
            self.__append(filename, data)

        if self._size > self._flush_size:
            self.flush()
Code Example #10
class MapreducePipeline(pipeline_base._OutputSlotsMixin,
                        pipeline_base.PipelineBase):
    """Pipeline to execute MapReduce jobs.

  The Shuffle stage uses Google Cloud Storage (GCS). For newly created projects,
  GCS is activated automatically. To activate GCS follow these instructions:
  https://cloud.google.com/storage/docs/signup#activate

  Args:
    job_name: job name as string.
    mapper_spec: specification of mapper to use.
    reducer_spec: specification of reducer to use.
    input_reader_spec: specification of input reader to read data from.
    output_writer_spec: specification of output writer to save reduce output to.
    mapper_params: parameters to use for mapper phase.
    reducer_params: parameters to use for reduce phase.
    shards: number of shards to use as int.
    combiner_spec: Optional. Specification of a combine function. If not
      supplied, no combine step will take place. The combine function takes a
      key, list of values and list of previously combined results. It yields
      combined values that might be processed by another combiner call, but will
      eventually end up in reducer. The combiner output key is assumed to be the
      same as the input key.

  Returns:
    result_status: one of model.MapreduceState._RESULTS. Check this to see
      if the job is successful.
    default: a list of filenames if the mapreduce was successful and
      was outputting files. An empty list otherwise.
  """
    def run(self,
            job_name,
            mapper_spec,
            reducer_spec,
            input_reader_spec,
            output_writer_spec=None,
            mapper_params=None,
            reducer_params=None,
            shards=None,
            combiner_spec=None):

        if mapper_params.get("bucket_name") is None:
            try:
                mapper_params["bucket_name"] = (
                    app_identity.get_default_gcs_bucket_name())
            except Exception as e:
                raise errors.Error(
                    "Unable to get the GCS default bucket name. "
                    "Check to see that GCS is properly activated. " + str(e))
        if mapper_params["bucket_name"] is None:
            raise errors.Error("There is no GCS default bucket name. "
                               "Check to see that GCS is properly activated.")

        map_pipeline = yield MapPipeline(job_name,
                                         mapper_spec,
                                         input_reader_spec,
                                         params=mapper_params,
                                         shards=shards)
        shuffler_pipeline = yield ShufflePipeline(job_name, mapper_params,
                                                  map_pipeline)
        reducer_pipeline = yield ReducePipeline(job_name,
                                                reducer_spec,
                                                output_writer_spec,
                                                reducer_params,
                                                shuffler_pipeline,
                                                combiner_spec=combiner_spec)
        with pipeline.After(reducer_pipeline):
            all_temp_files = yield pipeline_common.Extend(
                map_pipeline, shuffler_pipeline)
            yield CleanupPipeline(all_temp_files)

        yield _ReturnPipeline(map_pipeline.result_status,
                              reducer_pipeline.result_status, reducer_pipeline)
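
The class docstring above spells out the constructor arguments and the combiner contract (a key, a list of values, and a list of previously combined results), but the snippet does not include a caller. Below is a hedged usage sketch: the handler module "main", the dotted reader/writer paths, and the parameter values are placeholders for illustration rather than names required by the library, and the combiner assumes the mapper emitted integer counts.

def word_count_combine(key, values, previously_combined_values):
    # Follows the combiner contract described in the docstring: yield partially
    # combined values for this key; they may be combined again before reaching
    # the reducer. Assumes values are integer counts (an illustrative choice).
    yield sum(values) + sum(previously_combined_values)


# Hypothetical caller; spec strings and parameters below are placeholders.
mr_pipeline = MapreducePipeline(
    "word_count",
    "main.word_count_map",
    "main.word_count_reduce",
    "mapreduce.input_readers.GoogleCloudStorageInputReader",
    output_writer_spec="mapreduce.output_writers.GoogleCloudStorageOutputWriter",
    mapper_params={"bucket_name": "my-bucket"},
    reducer_params={"output_writer": {"bucket_name": "my-bucket"}},
    shards=16,
    combiner_spec="main.word_count_combine")
mr_pipeline.start()
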