Example #1
  def _recover(self, mr_spec, shard_number, shard_attempt):
    next_seg_index = self._seg_index

    # Save the current seg if it actually has something.
    # Remember self._streaming_buffer is the pickled instance
    # from the previous slice.
    if self._seg_valid_length != 0:
      try:
        gcs_next_offset = self._streaming_buffer._get_offset_from_gcs() + 1
        # If GCS is ahead of us, just force close.
        if gcs_next_offset > self._streaming_buffer.tell():
          self._streaming_buffer._force_close(gcs_next_offset)
        # Otherwise flush in-memory contents too.
        else:
          self._streaming_buffer.close()
      except cloudstorage.FileClosedError:
        pass
      cloudstorage_api.copy2(
          self._streaming_buffer.name,
          self._streaming_buffer.name,
          metadata={self._VALID_LENGTH:
                    self._seg_valid_length})
      next_seg_index = self._seg_index + 1

    writer_spec = self.get_params(mr_spec.mapper, allow_old=False)
    # Create name for the new seg.
    key = self._generate_filename(
        writer_spec, mr_spec.name,
        mr_spec.mapreduce_id,
        shard_number,
        shard_attempt,
        next_seg_index)
    new_writer = self._create(writer_spec, key)
    new_writer._seg_index = next_seg_index
    return new_writer
Example #3
  def finalize(self, ctx, shard_state):
    self._streaming_buffer.close()

    if self._no_dup:
      cloudstorage_api.copy2(
          self._streaming_buffer.name,
          self._streaming_buffer.name,
          metadata={self._VALID_LENGTH: self._streaming_buffer.tell()})

      # The filename the user requested.
      mr_spec = ctx.mapreduce_spec
      writer_spec = self.get_params(mr_spec.mapper, allow_old=False)
      filename = self._generate_filename(writer_spec,
                                         mr_spec.name,
                                         mr_spec.mapreduce_id,
                                         shard_state.shard_number)
      seg_filename = self._streaming_buffer.name
      prefix, last_index = seg_filename.rsplit("-", 1)
      # This info is enough for any external process to combine
      # all segs into the final file.
      # TODO(user): Create a special input reader to combine segs.
      shard_state.writer_state = {self._SEG_PREFIX: prefix + "-",
                                  self._LAST_SEG_INDEX: int(last_index),
                                  "filename": filename}
    else:
      shard_state.writer_state = {"filename": self._streaming_buffer.name}
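The TODO in finalize above is a natural place for an illustration. Below is a hedged sketch, not part of the library, of how an external process could use writer_state to combine segs into the final file; VALID_LENGTH_KEY and combine_segs are assumptions standing in for the writer's _VALID_LENGTH constant and for whatever combiner actually runs.

import cloudstorage

# Placeholder for the metadata header the writer stamps via copy2
# (self._VALID_LENGTH); the real key name lives in the writer class.
VALID_LENGTH_KEY = 'x-goog-meta-gae-mr-valid-length'

def combine_segs(seg_prefix, last_seg_index, dest_filename):
  """Concatenate segs seg_prefix + '0' .. seg_prefix + str(last_seg_index)."""
  with cloudstorage.open(dest_filename, 'w', 'application/octet-stream') as dst:
    for index in range(last_seg_index + 1):
      seg_name = seg_prefix + str(index)
      # Only the first valid_length bytes of a seg are committed output;
      # bytes past that may belong to a slice that was retried.
      valid_length = int(cloudstorage.stat(seg_name).metadata[VALID_LENGTH_KEY])
      with cloudstorage.open(seg_name) as src:
        dst.write(src.read(valid_length))

Feeding it the seg prefix, last seg index, and filename recorded in writer_state above would then produce the single file the user originally requested.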
  def testCopy2(self):
    with cloudstorage.open(TESTFILE, 'w',
                           'text/foo', {'x-goog-meta-foo': 'foo'}) as f:
      f.write('abcde')

    dst = TESTFILE + 'copy'
    self.assertRaises(cloudstorage.NotFoundError, cloudstorage.stat, dst)
    cloudstorage_api.copy2(TESTFILE, dst)

    src_stat = cloudstorage.stat(TESTFILE)
    dst_stat = cloudstorage.stat(dst)
    self.assertEqual(src_stat.st_ctime, dst_stat.st_ctime)
    self.assertEqual(src_stat.st_size, dst_stat.st_size)
    self.assertEqual(src_stat.etag, dst_stat.etag)
    self.assertEqual(src_stat.content_type, dst_stat.content_type)
    self.assertEqual(src_stat.metadata, dst_stat.metadata)

    with cloudstorage.open(dst) as f:
      self.assertEqual('abcde', f.read())
  def testCopy2ReplacesMetadata(self):
    with cloudstorage.open(TESTFILE, 'w',
                           'text/foo', {'x-goog-meta-foo': 'foo'}) as f:
      f.write('abcde')
    src_stat = cloudstorage.stat(TESTFILE)

    cloudstorage_api.copy2(TESTFILE, TESTFILE,
                           metadata={'x-goog-meta-foo': 'bar',
                                     'content-type': 'text/bar'})

    dst_stat = cloudstorage.stat(TESTFILE)
    self.assertEqual(src_stat.st_size, dst_stat.st_size)
    self.assertEqual(src_stat.etag, dst_stat.etag)
    self.assertEqual(src_stat.st_ctime, dst_stat.st_ctime)
    self.assertEqual('text/foo', src_stat.content_type)
    self.assertEqual('text/bar', dst_stat.content_type)
    self.assertEqual('foo', src_stat.metadata['x-goog-meta-foo'])
    self.assertEqual('bar', dst_stat.metadata['x-goog-meta-foo'])

    with cloudstorage.open(TESTFILE) as f:
      self.assertEqual('abcde', f.read())
Example #9
def copy_to(path, target_path):
    # Thin wrapper: copy the object at path to target_path via copy2,
    # preserving its content type and metadata.
    return cloudstorage_api.copy2(path, target_path)
Example #10
def copy_to(path, target_path):
    # Thin wrapper: copy the object at path to target_path via copy2,
    # preserving its content type and metadata.
    return cloudstorage_api.copy2(path, target_path)
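Both Example #9 and Example #10 are thin pass-throughs to copy2. For reference, here is a minimal sketch of the two copy2 patterns the examples above exercise; the bucket, object names, and metadata values below are placeholders rather than anything from the quoted projects.

from cloudstorage import cloudstorage_api

SRC = '/my-bucket/report.txt'       # hypothetical source object
DST = '/my-bucket/report-copy.txt'  # hypothetical destination object

# Plain copy: the destination inherits the source's content type and metadata
# (this is what testCopy2 asserts).
cloudstorage_api.copy2(SRC, DST)

# Copying an object onto itself with a metadata dict replaces its metadata and
# content type without rewriting the data (this is what testCopy2ReplacesMetadata
# asserts, and what _recover and finalize rely on to record the valid length).
cloudstorage_api.copy2(SRC, SRC,
                       metadata={'x-goog-meta-reviewed': 'yes',
                                 'content-type': 'text/plain'})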