Example #1
    def next(self):
        """Returns a handler to the next file.

    Nonexistent files will be logged and skipped. The file might have been
    removed after input splitting.

    Returns:
      The next input from this input reader in the form of a cloudstorage
      ReadBuffer that supports a File-like interface (read, readline, seek,
      tell, and close). An error may be raised if the file cannot be opened.

    Raises:
      StopIteration: The list of files has been exhausted.
    """
        options = {}
        if self._buffer_size:
            options["read_buffer_size"] = self._buffer_size
        if self._account_id:
            options["_account_id"] = self._account_id
        while True:
            filename = self._next_file()
            if filename is None:
                raise StopIteration()
            if self._path_filter and not self._path_filter.accept(self._slice_ctx, filename):
                continue
            try:
                start_time = time.time()
                handle = cloudstorage.open(filename, **options)
                self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC,
                                     int((time.time() - start_time) * 1000))
                self._slice_ctx.incr(self.COUNTER_FILE_READ)
                return handle
            except cloudstorage.NotFoundError:
                logging.warning("File %s may have been removed. Skipping file.", filename)
                self._slice_ctx.incr(self.COUNTER_FILE_MISSING)
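
The interesting part of next() is the open-or-skip loop around cloudstorage.open. Below is a minimal sketch of that pattern on its own, assuming the GoogleAppEngineCloudStorageClient package; the helper name open_first_available is an illustration, not part of the library.

import logging

import cloudstorage


def open_first_available(filenames, read_buffer_size=None):
    """Return a read handle for the first filename that still exists, or None.

    Missing files are logged and skipped, mirroring the reader above.
    """
    options = {}
    if read_buffer_size:
        options["read_buffer_size"] = read_buffer_size
    for filename in filenames:
        try:
            return cloudstorage.open(filename, **options)
        except cloudstorage.NotFoundError:
            logging.warning("File %s may have been removed. Skipping.", filename)
    return None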
Example #2
  def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
    """Inherit docs."""
    mapper_spec = mr_spec.mapper
    params = output_writers._get_params(mapper_spec)
    bucket_name = params.get(cls.BUCKET_NAME_PARAM)
    shards = mapper_spec.shard_count

    filehandles = []
    filename = (mr_spec.name + "/" + mr_spec.mapreduce_id +
                "/shard-" + str(shard_number) + "-bucket-")
    for i in range(shards):
      full_filename = "/%s/%s%d" % (bucket_name, filename, i)
      filehandles.append(cloudstorage.open(full_filename, mode="w"))
    return cls(filehandles)
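
For reference, create() builds one GCS object per hash bucket using a fixed name layout: job name, mapreduce id, then "shard-<n>-bucket-<i>". A plain-Python sketch of that layout; the argument values in the comment are placeholders.

def pool_filenames(bucket_name, job_name, mapreduce_id, shard_number, shards):
    """Reproduce the object-name layout used by create() above."""
    prefix = "%s/%s/shard-%d-bucket-" % (job_name, mapreduce_id, shard_number)
    return ["/%s/%s%d" % (bucket_name, prefix, i) for i in range(shards)]

# pool_filenames("my-bucket", "wordcount", "158", 2, 3) ->
# ['/my-bucket/wordcount/158/shard-2-bucket-0',
#  '/my-bucket/wordcount/158/shard-2-bucket-1',
#  '/my-bucket/wordcount/158/shard-2-bucket-2']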
Example #3
    def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
        """Inherit docs."""
        mapper_spec = mr_spec.mapper
        params = output_writers._get_params(mapper_spec)
        bucket_name = params.get(cls.BUCKET_NAME_PARAM)
        shards = mapper_spec.shard_count

        filehandles = []
        filename = (mr_spec.name + "/" + mr_spec.mapreduce_id + "/shard-" +
                    str(shard_number) + "-bucket-")
        for i in range(shards):
            full_filename = "/%s/%s%d" % (bucket_name, filename, i)
            filehandles.append(cloudstorage.open(full_filename, mode="w"))
        return cls(filehandles)
Example #4
  def _open_file(cls, writer_spec, filename_suffix, use_tmp_bucket=False):
    """Opens a new gcs file for writing."""
    if use_tmp_bucket:
      bucket = cls._get_tmp_gcs_bucket(writer_spec)
      account_id = cls._get_tmp_account_id(writer_spec)
    else:
      bucket = cls._get_gcs_bucket(writer_spec)
      account_id = cls._get_account_id(writer_spec)

    filename = "/%s/%s" % (bucket, filename_suffix)

    content_type = writer_spec.get(cls.CONTENT_TYPE_PARAM, None)

    options = {}
    if cls.ACL_PARAM in writer_spec:
      options["x-goog-acl"] = writer_spec.get(cls.ACL_PARAM)

    return cloudstorage.open(filename, mode="w", content_type=content_type,
                             options=options, _account_id=account_id)
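
Outside the writer class this reduces to one cloudstorage.open call in write mode with an optional content type and ACL. A hedged usage sketch, assuming the GoogleAppEngineCloudStorageClient package; the bucket, object name, and ACL value are placeholders.

import cloudstorage


def write_text_object(bucket, object_name, data):
    """Write a small text object, configured roughly as _open_file() does."""
    filename = "/%s/%s" % (bucket, object_name)
    handle = cloudstorage.open(filename,
                               mode="w",
                               content_type="text/plain",
                               options={"x-goog-acl": "public-read"})
    try:
        handle.write(data)
    finally:
        handle.close()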
Example #5
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key, and writes them
  into a new GCS file. Creates an _OutputFile entity to record the resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
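
The sort step itself needs nothing from GCS: pair each serialized record with its parsed key, sort on the key, and keep the original bytes. A self-contained sketch; parse_key stands in for KeyValue.ParseFromString and is an assumption, not part of the library.

def sort_serialized_records(records, parse_key):
    """Return the serialized records ordered by the key parse_key extracts."""
    key_records = [(parse_key(record), record) for record in records]
    key_records.sort(key=lambda key_record: key_record[0])
    return [record for _, record in key_records]

# Toy "key\0value" encoding, just to exercise the helper:
# sort_serialized_records([b"banana\x00r1", b"apple\x00r2"],
#                         lambda raw: raw.split(b"\x00", 1)[0])
# -> [b'apple\x00r2', b'banana\x00r1']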
Example #6
def _sort_records_map(records):
    """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key, and writes them
  into a new GCS file. Creates an _OutputFile entity to record the resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
    ctx = context.get()
    l = len(records)
    key_records = [None] * l

    logging.debug("Parsing")
    for i in range(l):
        proto = kv_pb.KeyValue()
        proto.ParseFromString(records[i])
        key_records[i] = (proto.key(), records[i])

    logging.debug("Sorting")
    # sort(cmp=...) is Python 2 only; functools.cmp_to_key adapts the comparator.
    key_records.sort(key=functools.cmp_to_key(_compare_keys))

    logging.debug("Writing")
    mapper_spec = ctx.mapreduce_spec.mapper
    params = input_readers._get_params(mapper_spec)
    bucket_name = params.get("bucket_name")
    filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
                ctx.shard_id + "-" + str(int(time.time())))
    full_filename = "/%s/%s" % (bucket_name, filename)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for key_record in key_records:
            pool.append(key_record[1])

    logging.debug("Finalizing")
    filehandle.close()

    entity = _OutputFile(key_name=full_filename,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
Example #7
    def _next_seg(self):
        """Get next seg."""
        if self._seg:
            self._seg.close()
        self._seg_index += 1
        if self._seg_index > self._last_seg_index:
            self._seg = None
            return

        filename = self._seg_prefix + str(self._seg_index)
        stat = cloudstorage.stat(filename)
        writer = output_writers._GoogleCloudStorageOutputWriter
        if writer._VALID_LENGTH not in stat.metadata:
            raise ValueError("Expect %s in metadata for file %s." %
                             (writer._VALID_LENGTH, filename))
        self._seg_valid_length = int(stat.metadata[writer._VALID_LENGTH])
        if self._seg_valid_length > stat.st_size:
            raise ValueError(
                "Valid length %s is too big for file %s of length %s" %
                (self._seg_valid_length, filename, stat.st_size))
        self._seg = cloudstorage.open(filename)
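
The output writer records how many leading bytes of each seg are valid, and _next_seg refuses anything past that length. A hedged sketch of reading just that valid prefix of one file; the import path assumes the App Engine mapreduce bundle layout.

import cloudstorage

from mapreduce import output_writers


def read_valid_prefix(filename):
    """Read only the bytes of a seg file that the writer marked as valid."""
    writer = output_writers._GoogleCloudStorageOutputWriter
    stat = cloudstorage.stat(filename)
    valid_length = int(stat.metadata[writer._VALID_LENGTH])
    handle = cloudstorage.open(filename)
    try:
        return handle.read(valid_length)
    finally:
        handle.close()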
Example #8
    def _open_file(cls, writer_spec, filename_suffix, use_tmp_bucket=False):
        """Opens a new gcs file for writing."""
        if use_tmp_bucket:
            bucket = cls._get_tmp_gcs_bucket(writer_spec)
            account_id = cls._get_tmp_account_id(writer_spec)
        else:
            bucket = cls._get_gcs_bucket(writer_spec)
            account_id = cls._get_account_id(writer_spec)

        filename = "/%s/%s" % (bucket, filename_suffix)

        content_type = writer_spec.get(cls.CONTENT_TYPE_PARAM, None)

        options = {}
        if cls.ACL_PARAM in writer_spec:
            options["x-goog-acl"] = writer_spec.get(cls.ACL_PARAM)

        return cloudstorage.open(filename,
                                 mode="w",
                                 content_type=content_type,
                                 options=options,
                                 _account_id=account_id)
Example #9
  def _next_seg(self):
    """Get next seg."""
    if self._seg:
      self._seg.close()
    self._seg_index += 1
    if self._seg_index > self._last_seg_index:
      self._seg = None
      return

    filename = self._seg_prefix + str(self._seg_index)
    stat = cloudstorage.stat(filename)
    writer = output_writers._GoogleCloudStorageOutputWriter
    if writer._VALID_LENGTH not in stat.metadata:
      raise ValueError(
          "Expect %s in metadata for file %s." %
          (writer._VALID_LENGTH, filename))
    self._seg_valid_length = int(stat.metadata[writer._VALID_LENGTH])
    if self._seg_valid_length > stat.st_size:
      raise ValueError(
          "Valid length %s is too big for file %s of length %s" %
          (self._seg_valid_length, filename, stat.st_size))
    self._seg = cloudstorage.open(filename)
Example #10
    def __next__(self):
        """Returns a handler to the next file.

    Nonexistent files will be logged and skipped. The file might have been
    removed after input splitting.

    Returns:
      The next input from this input reader in the form of a cloudstorage
      ReadBuffer that supports a File-like interface (read, readline, seek,
      tell, and close). An error may be raised if the file cannot be opened.

    Raises:
      StopIteration: The list of files has been exhausted.
    """
        options = {}
        if self._buffer_size:
            options["read_buffer_size"] = self._buffer_size
        if self._account_id:
            options["_account_id"] = self._account_id
        while True:
            filename = self._next_file()
            if filename is None:
                raise StopIteration()
            if (self._path_filter and
                    not self._path_filter.accept(self._slice_ctx, filename)):
                continue
            try:
                start_time = time.time()
                handle = cloudstorage.open(filename, **options)
                self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC,
                                     int((time.time() - start_time) * 1000))
                self._slice_ctx.incr(self.COUNTER_FILE_READ)
                return handle
            except cloudstorage.NotFoundError:
                logging.warning(
                    "File %s may have been removed. Skipping file.", filename)
                self._slice_ctx.incr(self.COUNTER_FILE_MISSING)
Example #11
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when Files list and offsets do not match.

    Yields:
      The result.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx._shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")

    readers = []

    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]

      reader = records.RecordsReader(
          cloudstorage.open(filename, read_buffer_size=self.GCS_BUFFER_SIZE))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    current_result = None
    current_count = 0
    current_size = 0
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        current_count += 1
        current_size += len(value)

        should_yield = False
        if current_result:
          if key != current_result[0]:

            should_yield = True
          elif (self._max_values_count != -1 and
                current_count >= self._max_values_count):

            current_result[2] = True
            should_yield = True
          elif (self._max_values_size != -1 and
                current_size >= self._max_values_size):

            current_result[2] = True
            should_yield = True

        if should_yield:

          yield current_result
        if not current_result or should_yield:
          current_result = [key, [], False]
          current_count = 0
          current_size = 0
        current_result[1].append(value)

      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()

        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)

        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    if current_result:
      yield current_result
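
Stripped of the counters and reader bookkeeping, the loop above is a k-way merge over key-sorted streams driven by a heap. A self-contained sketch of that pattern; the stream index sits second in each heap entry so equal keys never fall through to comparing iterators.

import heapq


def merge_sorted_streams(streams):
    """Yield (key, value) pairs from several key-sorted iterables in key order."""
    heap = []
    for index, stream in enumerate(streams):
        iterator = iter(stream)
        first = next(iterator, None)
        if first is not None:
            key, value = first
            heap.append((key, index, value, iterator))
    heapq.heapify(heap)
    while heap:
        key, index, value, iterator = heap[0]
        yield key, value
        entry = next(iterator, None)
        if entry is None:
            heapq.heappop(heap)  # this stream is exhausted
        else:
            heapq.heapreplace(heap, (entry[0], index, entry[1], iterator))

# list(merge_sorted_streams([[("a", 1), ("c", 3)], [("b", 2)]]))
# -> [('a', 1), ('b', 2), ('c', 3)]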
Example #12
    def __iter__(self):
        """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when Files list and offsets do not match.

    Yields:
      The result.
    """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx._shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        readers = []

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]

            reader = records.RecordsReader(
                cloudstorage.open(filename,
                                  read_buffer_size=self.GCS_BUFFER_SIZE))
            reader.seek(offset)
            readers.append((None, None, i, reader))

        current_result = None
        current_count = 0
        current_size = 0
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                current_count += 1
                current_size += len(value)

                should_yield = False
                if current_result:
                    if key != current_result[0]:

                        should_yield = True
                    elif (self._max_values_count != -1
                          and current_count >= self._max_values_count):

                        current_result[2] = True
                        should_yield = True
                    elif (self._max_values_size != -1
                          and current_size >= self._max_values_size):

                        current_result[2] = True
                        should_yield = True

                if should_yield:

                    yield current_result
                if not current_result or should_yield:
                    current_result = [key, [], False]
                    current_count = 0
                    current_size = 0
                current_result[1].append(value)

            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()

                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = kv_pb.KeyValue()
                proto.ParseFromString(binary_record)

                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

        if current_result:
            yield current_result