def finalize(self, ctx, shard_state):
    """Finalize this shard's output and record its final state.

    Closes the streaming GCS buffer and publishes, via
    shard_state.writer_state, the information a consumer needs to find the
    shard's output.

    Args:
      ctx: a context object; only ctx.mapreduce_spec is read here.
      shard_state: the shard's state object; writer_state is written and
        shard_number is read.
    """
    self._streaming_buffer.close()

    if self._no_dup:
        # TODO(user): This doesn't work properly when the filenames have
        # spaces in them. It's not being re-quoted properly. b/12066572
        # Self-copy with explicit metadata: stamps the valid-length marker
        # onto the final seg so readers know how many bytes are good.
        # NOTE(review): tell() is called after close() — presumably the
        # project's buffer keeps its offset readable after close; confirm.
        cloudstorage_api._copy2(
            self._streaming_buffer.name,
            self._streaming_buffer.name,
            metadata={self._VALID_LENGTH: self._streaming_buffer.tell()})

        # The filename user requested.
        mr_spec = ctx.mapreduce_spec
        writer_spec = _get_params(mr_spec.mapper, allow_old=False)
        filename = self._generate_filename(writer_spec,
                                           mr_spec.name,
                                           mr_spec.mapreduce_id,
                                           shard_state.shard_number)
        # Seg filenames end in "-<index>"; split off the numeric suffix so
        # consumers can enumerate every seg from prefix + index range.
        seg_filename = self._streaming_buffer.name
        prefix, last_index = seg_filename.rsplit("-", 1)
        # This info is enough for any external process to combine
        # all segs into the final file.
        # TODO(user): Create a special input reader to combine segs.
        shard_state.writer_state = {self._SEG_PREFIX: prefix + "-",
                                    self._LAST_SEG_INDEX: int(last_index),
                                    "filename": filename}
    else:
        # Duplicates allowed: the single streaming file is the output.
        shard_state.writer_state = {"filename": self._streaming_buffer.name}
def _recover(self, mr_spec, shard_number, shard_attempt): next_seg_index = self._seg_index # Save the current seg if it actually has something. # Remember self._streaming_buffer is the pickled instance # from the previous slice. if self._seg_valid_length != 0: try: gcs_next_offset = self._streaming_buffer._get_offset_from_gcs() + 1 # If GCS is ahead of us, just force close. if gcs_next_offset > self._streaming_buffer.tell(): self._streaming_buffer._force_close(gcs_next_offset) # Otherwise flush in memory contents too. else: self._streaming_buffer.close() except cloudstorage.FileClosedError: pass cloudstorage_api._copy2( self._streaming_buffer.name, self._streaming_buffer.name, metadata={self._VALID_LENGTH: self._seg_valid_length}) next_seg_index = self._seg_index + 1 writer_spec = _get_params(mr_spec.mapper, allow_old=False) # Create name for the new seg. key = self._generate_filename( writer_spec, mr_spec.name, mr_spec.mapreduce_id, shard_number, shard_attempt, next_seg_index) new_writer = self._create(writer_spec, key) new_writer._seg_index = next_seg_index return new_writer
def testCopy2(self):
    """_copy2 clones content, stat fields, and metadata to a new object."""
    with cloudstorage.open(TESTFILE, 'w', 'text/foo',
                           {'x-goog-meta-foo': 'foo'}) as f:
        f.write('abcde')

    copy_path = TESTFILE + 'copy'
    # The destination must not exist before the copy.
    self.assertRaises(cloudstorage.NotFoundError, cloudstorage.stat, copy_path)

    cloudstorage_api._copy2(TESTFILE, copy_path)

    # Every stat field of the copy matches the source, in order.
    original = cloudstorage.stat(TESTFILE)
    duplicate = cloudstorage.stat(copy_path)
    for attr in ('st_ctime', 'st_size', 'etag', 'content_type', 'metadata'):
        self.assertEqual(getattr(original, attr), getattr(duplicate, attr))

    # The copied object holds the same bytes.
    with cloudstorage.open(copy_path) as f:
        self.assertEqual('abcde', f.read())
def testCopy2ReplacesMetadata(self):
    """A metadata-carrying self-copy replaces headers but not content."""
    with cloudstorage.open(TESTFILE, 'w', 'text/foo',
                           {'x-goog-meta-foo': 'foo'}) as f:
        f.write('abcde')
    before = cloudstorage.stat(TESTFILE)

    # Copy the object onto itself with new metadata/content-type.
    cloudstorage_api._copy2(TESTFILE,
                            TESTFILE,
                            metadata={'x-goog-meta-foo': 'bar',
                                      'content-type': 'text/bar'})
    after = cloudstorage.stat(TESTFILE)

    # Size, etag and creation time survive the self-copy unchanged.
    self.assertEqual(before.st_size, after.st_size)
    self.assertEqual(before.etag, after.etag)
    self.assertEqual(before.st_ctime, after.st_ctime)
    # Content type and custom metadata were replaced.
    self.assertEqual('text/foo', before.content_type)
    self.assertEqual('text/bar', after.content_type)
    self.assertEqual('foo', before.metadata['x-goog-meta-foo'])
    self.assertEqual('bar', after.metadata['x-goog-meta-foo'])

    # The stored bytes are untouched.
    with cloudstorage.open(TESTFILE) as f:
        self.assertEqual('abcde', f.read())
def testCopy2ReplacesMetadata(self):
    # NOTE(review): this method has the same name and assertions as another
    # testCopy2ReplacesMetadata in this source; if both are defined in the
    # same TestCase, the earlier definition is shadowed and never runs.
    # Confirm whether they belong to different files and drop one if not.
    """Verify _copy2 onto the same object replaces metadata, keeps content."""
    with cloudstorage.open(TESTFILE, 'w', 'text/foo',
                           {'x-goog-meta-foo': 'foo'}) as f:
        f.write('abcde')
    src_stat = cloudstorage.stat(TESTFILE)

    # Self-copy with explicit metadata and content-type overrides.
    cloudstorage_api._copy2(TESTFILE, TESTFILE,
                            metadata={
                                'x-goog-meta-foo': 'bar',
                                'content-type': 'text/bar'
                            })
    dst_stat = cloudstorage.stat(TESTFILE)

    # Immutable stat fields are preserved by the self-copy.
    self.assertEqual(src_stat.st_size, dst_stat.st_size)
    self.assertEqual(src_stat.etag, dst_stat.etag)
    self.assertEqual(src_stat.st_ctime, dst_stat.st_ctime)
    # Content type and custom metadata reflect the new values.
    self.assertEqual('text/foo', src_stat.content_type)
    self.assertEqual('text/bar', dst_stat.content_type)
    self.assertEqual('foo', src_stat.metadata['x-goog-meta-foo'])
    self.assertEqual('bar', dst_stat.metadata['x-goog-meta-foo'])

    # File contents are unchanged.
    with cloudstorage.open(TESTFILE) as f:
        self.assertEqual('abcde', f.read())
def copy_to(path, target_path):
    """Copy the GCS object at ``path`` to ``target_path``.

    Thin wrapper over the non-public cloudstorage copy helper; returns
    whatever that helper returns.
    """
    # TODO(jeremydw): Replace
    result = cloudstorage_api._copy2(path, target_path)
    return result
def _make_api_call(bucket, file_list, destination_file, content_type, retry_params, _account_id): """ Internal Only Makes the actual calls. Currently stubbed because the dev server cloudstorage_stub.py does not handle compose requests. TODO: When the dev server gets patch please remove the stub Args: bucket: Bucket where the files are kept file_list: list of dicts with the file name (see compose argument "list_of_files" for format). destination_file: Path to the destination file. content_type: Content type for the destination file. retry_params: An api_utils.RetryParams for this call to GCS. If None, the default one is used. _account_id: Internal-use only. """ if len(file_list) == 0: raise ValueError("Unable to merge 0 files") if len(file_list) == 1: _copy2(bucket + file_list[0]["file_name"], destination_file) return ''' Needed until cloudstorage_stub.py is updated to accept compose requests TODO: When patched remove the True flow from this if. ''' if 'development' in os.environ.get('SERVER_SOFTWARE', '').lower(): ''' Below is making the call to the Development server ''' with open(destination_file, "w", content_type=content_type) as gcs_merge: for source_file in file_list: try: with open(bucket + source_file['file_name'], "r") as gcs_source: gcs_merge.write(gcs_source.read()) except cloud_errors.NotFoundError: logging.warn("File not found %s, skipping", source_file['file_name']) else: ''' Below is making the call to the Production server ''' xml = "" for item in file_list: generation = item.get("Generation", "") generation_match = item.get("IfGenerationMatch", "") if generation != "": generation = "<Generation>%s</Generation>" % generation if generation_match != "": generation_match = "<IfGenerationMatch>%s</IfGenerationMatch>" % generation_match xml += "<Component><Name>%s</Name>%s%s</Component>" % \ (item["file_name"], generation, generation_match) xml = "<ComposeRequest>%s</ComposeRequest>" % xml logging.info(xml) # pylint: disable=protected-access api = 
cloudstorage.storage_api._get_storage_api(retry_params=retry_params, account_id=_account_id) headers = {"Content-Type" : content_type} # pylint: disable=no-member status, resp_headers, content = api.put_object( cloudstorage.api_utils._quote_filename(destination_file) + "?compose", payload=xml, headers=headers) # TODO: confirm whether [200] is sufficient, or if 204 etc. might be returned? cloud_errors.check_status(status, [200], destination_file, resp_headers, body=content)