def _recover(self, mr_spec, shard_number, shard_attempt):
    """Seal the current seg if it holds data and return a writer for a new seg.

    Args:
      mr_spec: mapreduce spec; its mapper, name and mapreduce_id are used
        to derive the writer params and the new seg's filename.
      shard_number: shard number passed through to filename generation.
      shard_attempt: shard attempt passed through to filename generation.

    Returns:
      A writer built by self._create whose _seg_index is the index of the
      seg it writes to.
    """
    seg_index = self._seg_index

    # Only a seg that actually has something in it is worth saving.
    # Note self._streaming_buffer is the pickled instance carried over
    # from the previous slice.
    if self._seg_valid_length != 0:
        buf = self._streaming_buffer
        try:
            gcs_next_offset = buf._get_offset_from_gcs() + 1
            if gcs_next_offset <= buf.tell():
                # We are at or ahead of GCS: flush in-memory contents too.
                buf.close()
            else:
                # GCS is ahead of us, so just force close.
                buf._force_close(gcs_next_offset)
        except cloudstorage.FileClosedError:
            # The buffer was already closed; nothing left to close.
            pass

        # Copy the seg onto itself to stamp it with its valid length.
        cloudstorage_api.copy2(
            buf.name,
            buf.name,
            metadata={self._VALID_LENGTH: self._seg_valid_length})
        seg_index = self._seg_index + 1

    writer_spec = self.get_params(mr_spec.mapper, allow_old=False)
    # Name of the seg the replacement writer will write to.
    seg_key = self._generate_filename(
        writer_spec,
        mr_spec.name,
        mr_spec.mapreduce_id,
        shard_number,
        shard_attempt,
        seg_index)
    recovered = self._create(writer_spec, seg_key)
    recovered._seg_index = seg_index
    return recovered
def _recover(self, mr_spec, shard_number, shard_attempt):
    """Return a fresh writer on a new seg, saving the current seg first.

    When the current seg has a non-zero valid length, its streaming buffer
    is closed, the seg is copied onto itself with the valid length recorded
    in its metadata, and the new writer uses the next seg index. Otherwise
    the current seg index is reused.

    Args:
      mr_spec: mapreduce spec supplying mapper, name and mapreduce_id.
      shard_number: forwarded to self._generate_filename.
      shard_attempt: forwarded to self._generate_filename.

    Returns:
      The writer produced by self._create, with _seg_index set.
    """
    has_data = self._seg_valid_length != 0
    target_index = self._seg_index

    if has_data:
        # self._streaming_buffer is the pickled instance from the
        # previous slice.
        stream = self._streaming_buffer
        try:
            next_offset_in_gcs = stream._get_offset_from_gcs() + 1
            gcs_is_ahead = next_offset_in_gcs > stream.tell()
            if gcs_is_ahead:
                # GCS already has more than we do: force close.
                stream._force_close(next_offset_in_gcs)
            else:
                # Otherwise a regular close flushes in-memory contents.
                stream.close()
        except cloudstorage.FileClosedError:
            pass

        seg_name = stream.name
        length_metadata = {self._VALID_LENGTH: self._seg_valid_length}
        cloudstorage_api.copy2(seg_name, seg_name, metadata=length_metadata)
        target_index = self._seg_index + 1

    params = self.get_params(mr_spec.mapper, allow_old=False)
    # Create the name for the new seg.
    new_key = self._generate_filename(
        params, mr_spec.name, mr_spec.mapreduce_id,
        shard_number, shard_attempt, target_index)
    replacement = self._create(params, new_key)
    replacement._seg_index = target_index
    return replacement
def finalize(self, ctx, shard_state):
    """Close the output buffer and store the writer state on shard_state.

    Args:
      ctx: context whose mapreduce_spec is used to compute the filename
        the user requested (only read when self._no_dup is set).
      shard_state: shard state; its writer_state is overwritten and its
        shard_number is read when self._no_dup is set.
    """
    buf = self._streaming_buffer
    buf.close()

    if not self._no_dup:
        shard_state.writer_state = {"filename": buf.name}
        return

    # Copy the last seg onto itself to record its valid length.
    cloudstorage_api.copy2(
        buf.name,
        buf.name,
        metadata={self._VALID_LENGTH: buf.tell()})

    # The filename the user requested.
    spec = ctx.mapreduce_spec
    writer_spec = self.get_params(spec.mapper, allow_old=False)
    requested_name = self._generate_filename(
        writer_spec, spec.name, spec.mapreduce_id, shard_state.shard_number)

    # The seg filename ends in "-<index>"; split off that trailing index.
    seg_prefix, seg_index = buf.name.rsplit("-", 1)

    # This info is enough for any external process to combine
    # all segs into the final file.
    # TODO(user): Create a special input reader to combine segs.
    shard_state.writer_state = {
        self._SEG_PREFIX: seg_prefix + "-",
        self._LAST_SEG_INDEX: int(seg_index),
        "filename": requested_name,
    }
def finalize(self, ctx, shard_state):
    """Finish writing and publish the output location via shard_state.

    The streaming buffer is always closed. With self._no_dup set, the
    current seg is copied onto itself with its length stored as metadata,
    and writer_state records the seg prefix, the last seg index and the
    filename the user requested. Without it, writer_state only records the
    buffer's own name.

    Args:
      ctx: context providing mapreduce_spec.
      shard_state: shard state whose writer_state is assigned here.
    """
    self._streaming_buffer.close()
    seg_filename = self._streaming_buffer.name

    if self._no_dup:
        valid_length = {self._VALID_LENGTH: self._streaming_buffer.tell()}
        cloudstorage_api.copy2(seg_filename, seg_filename,
                               metadata=valid_length)

        # Work out the filename the user requested.
        mr_spec = ctx.mapreduce_spec
        params = self.get_params(mr_spec.mapper, allow_old=False)
        user_filename = self._generate_filename(
            params,
            mr_spec.name,
            mr_spec.mapreduce_id,
            shard_state.shard_number)

        head, tail = seg_filename.rsplit("-", 1)
        # This information is enough for any external process to combine
        # all segs into the final file.
        # TODO(user): Create a special input reader to combine segs.
        state = {
            self._SEG_PREFIX: head + "-",
            self._LAST_SEG_INDEX: int(tail),
            "filename": user_filename,
        }
    else:
        state = {"filename": seg_filename}

    shard_state.writer_state = state
def testCopy2(self):
    """copy2 to a new path preserves the stat fields and the content."""
    with cloudstorage.open(TESTFILE, 'w', 'text/foo',
                           {'x-goog-meta-foo': 'foo'}) as out:
        out.write('abcde')

    copy_path = TESTFILE + 'copy'
    # The destination must not exist before the copy.
    self.assertRaises(cloudstorage.NotFoundError, cloudstorage.stat, copy_path)

    cloudstorage_api.copy2(TESTFILE, copy_path)

    original = cloudstorage.stat(TESTFILE)
    duplicate = cloudstorage.stat(copy_path)
    # Each of these stat fields must be identical on source and copy.
    for field in ('st_ctime', 'st_size', 'etag', 'content_type', 'metadata'):
        self.assertEqual(getattr(original, field), getattr(duplicate, field))

    with cloudstorage.open(copy_path) as copied:
        self.assertEqual('abcde', copied.read())
def testCopy2ReplacesMetadata(self):
    """copy2 onto the same path with metadata swaps metadata, not content."""
    with cloudstorage.open(TESTFILE, 'w', 'text/foo',
                           {'x-goog-meta-foo': 'foo'}) as out:
        out.write('abcde')
    before = cloudstorage.stat(TESTFILE)

    new_metadata = {'x-goog-meta-foo': 'bar', 'content-type': 'text/bar'}
    cloudstorage_api.copy2(TESTFILE, TESTFILE, metadata=new_metadata)
    after = cloudstorage.stat(TESTFILE)

    # Size, etag and creation time are untouched by the self-copy.
    self.assertEqual(before.st_size, after.st_size)
    self.assertEqual(before.etag, after.etag)
    self.assertEqual(before.st_ctime, after.st_ctime)

    # Content type and user metadata reflect the replacement values.
    self.assertEqual('text/foo', before.content_type)
    self.assertEqual('text/bar', after.content_type)
    self.assertEqual('foo', before.metadata['x-goog-meta-foo'])
    self.assertEqual('bar', after.metadata['x-goog-meta-foo'])

    with cloudstorage.open(TESTFILE) as reread:
        self.assertEqual('abcde', reread.read())
def testCopy2ReplacesMetadata(self):
    """Self-copy with a metadata argument replaces the stored metadata.

    Checks that size, etag, creation time and file content are unchanged,
    while content type and the x-goog-meta-foo entry take the new values.
    """
    initial_options = {'x-goog-meta-foo': 'foo'}
    with cloudstorage.open(TESTFILE, 'w', 'text/foo', initial_options) as fh:
        fh.write('abcde')

    stat_before = cloudstorage.stat(TESTFILE)
    cloudstorage_api.copy2(
        TESTFILE,
        TESTFILE,
        metadata={
            'x-goog-meta-foo': 'bar',
            'content-type': 'text/bar',
        })
    stat_after = cloudstorage.stat(TESTFILE)

    self.assertEqual(stat_before.st_size, stat_after.st_size)
    self.assertEqual(stat_before.etag, stat_after.etag)
    self.assertEqual(stat_before.st_ctime, stat_after.st_ctime)
    self.assertEqual('text/foo', stat_before.content_type)
    self.assertEqual('text/bar', stat_after.content_type)
    self.assertEqual('foo', stat_before.metadata['x-goog-meta-foo'])
    self.assertEqual('bar', stat_after.metadata['x-goog-meta-foo'])

    with cloudstorage.open(TESTFILE) as fh:
        content = fh.read()
        self.assertEqual('abcde', content)
def copy_to(path, target_path):
    """Copy path to target_path via cloudstorage_api.copy2.

    Args:
      path: source path, passed as the first argument to copy2.
      target_path: destination path, passed as the second argument.

    Returns:
      Whatever cloudstorage_api.copy2 returns.
    """
    result = cloudstorage_api.copy2(path, target_path)
    return result