def stat(filename, retry_params=None, _account_id=None):
  """Get GCSFileStat of a Google Cloud storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Returns:
    A GCSFileStat object containing info about this file.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  status, headers, content = api.head_object(
      api_utils._quote_filename(filename))
  errors.check_status(status, [200], filename, resp_headers=headers,
                      body=content)
  file_stat = common.GCSFileStat(
      filename=filename,
      st_size=common.get_stored_content_length(headers),
      st_ctime=common.http_time_to_posix(headers.get('last-modified')),
      etag=headers.get('etag'),
      content_type=headers.get('content-type'),
      metadata=common.get_metadata(headers))

  return file_stat
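# Usage sketch (illustrative only, not part of the library): consumes the
# GCSFileStat returned by stat(). The path '/my-bucket/some/object' is a
# placeholder, and the example assumes the package is importable as
# `cloudstorage` with its errors re-exported at the package level.
def _example_describe_object(path='/my-bucket/some/object'):
  import cloudstorage
  try:
    info = cloudstorage.stat(path)
  except cloudstorage.NotFoundError:
    return None  # object is missing
  # GCSFileStat exposes the parsed response headers as attributes.
  return {'size': info.st_size,
          'created': info.st_ctime,
          'etag': info.etag,
          'content_type': info.content_type,
          'metadata': info.metadata}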
def _copy2(src, dst, metadata=None, retry_params=None):
  """Copy the file content from src to dst.

  Internal use only!

  Args:
    src: /bucket/filename
    dst: /bucket/filename
    metadata: a dict of metadata for this copy. If None, old metadata is
      copied. For example, {'x-goog-meta-foo': 'bar'}.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
  """
  common.validate_file_path(src)
  common.validate_file_path(dst)

  if metadata is None:
    metadata = {}
    copy_meta = 'COPY'
  else:
    copy_meta = 'REPLACE'
  metadata.update({'x-goog-copy-source': src,
                   'x-goog-metadata-directive': copy_meta})

  api = storage_api._get_storage_api(retry_params=retry_params)
  status, resp_headers, content = api.put_object(
      api_utils._quote_filename(dst), headers=metadata)
  errors.check_status(status, [200], src, metadata, resp_headers,
                      body=content)
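# Illustrative sketch of the two metadata directives used above (internal
# API; paths and header values are placeholders). With metadata=None the
# destination keeps the source object's metadata (COPY); passing a dict
# stores only the supplied headers (REPLACE).
def _example_copy_with_metadata():
  # Keep the source metadata as-is.
  _copy2('/my-bucket/source.txt', '/my-bucket/dest-copy.txt')
  # Replace the metadata with a single custom header.
  _copy2('/my-bucket/source.txt', '/my-bucket/dest-replace.txt',
         metadata={'x-goog-meta-foo': 'bar'})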
def open(filename,
         mode='r',
         content_type=None,
         options=None,
         read_buffer_size=storage_api.ReadBuffer.DEFAULT_BUFFER_SIZE,
         retry_params=None,
         _account_id=None):
  """Opens a Google Cloud Storage file and returns it as a File-like object.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    mode: 'r' for reading mode. 'w' for writing mode.
      In reading mode, the file must exist. In writing mode, a file will
      be created or overwritten.
    content_type: The MIME type of the file. str. Only valid in writing mode.
    options: A str->basestring dict to specify additional headers to pass to
      GCS e.g. {'x-goog-acl': 'private', 'x-goog-meta-foo': 'foo'}.
      Supported options are x-goog-acl, x-goog-meta-, cache-control,
      content-disposition, and content-encoding.
      Only valid in writing mode.
      See https://developers.google.com/storage/docs/reference-headers
      for details.
    read_buffer_size: The buffer size for read. Read keeps a buffer
      and prefetches another one. To minimize blocking for large files,
      always read by buffer size. To minimize the number of RPC requests for
      small files, set a large buffer size. Max is 30MB.
    retry_params: An instance of api_utils.RetryParams for subsequent calls
      to GCS from this file handle. If None, the default one is used.
    _account_id: Internal-use only.

  Returns:
    A reading or writing buffer that supports a File-like interface. The
    buffer must be closed after operations are done.

  Raises:
    errors.AuthorizationError: if authorization failed.
    errors.NotFoundError: if an object that's expected to exist doesn't.
    ValueError: invalid open mode or if content_type or options are specified
      in reading mode.
  """
  common.validate_file_path(filename)
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  filename = api_utils._quote_filename(filename)

  if mode == 'w':
    common.validate_options(options)
    return storage_api.StreamingBuffer(api, filename, content_type, options)
  elif mode == 'r':
    if content_type or options:
      raise ValueError('Options and content_type can only be specified '
                       'for writing mode.')
    return storage_api.ReadBuffer(api, filename,
                                  buffer_size=read_buffer_size)
  else:
    raise ValueError('Invalid mode %s.' % mode)
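# Round-trip sketch for open() (illustrative only): writes an object with a
# couple of the supported options, then reads it back. The path, ACL and
# metadata values are placeholders; the example assumes the package is
# importable as `cloudstorage` and that both buffers support the
# context-manager protocol so they are closed automatically.
def _example_write_then_read(path='/my-bucket/demo.txt'):
  import cloudstorage
  write_options = {'x-goog-acl': 'private', 'x-goog-meta-owner': 'demo'}
  with cloudstorage.open(path, 'w', content_type='text/plain',
                         options=write_options) as gcs_file:
    gcs_file.write('hello world\n')
  with cloudstorage.open(path, 'r') as gcs_file:
    return gcs_file.read()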
def delete(filename, retry_params=None, _account_id=None):
  """Delete a Google Cloud Storage file.

  Args:
    filename: A Google Cloud Storage filename of form '/bucket/filename'.
    retry_params: An api_utils.RetryParams for this call to GCS. If None,
      the default one is used.
    _account_id: Internal-use only.

  Raises:
    errors.NotFoundError: if the file doesn't exist prior to deletion.
  """
  api = storage_api._get_storage_api(retry_params=retry_params,
                                     account_id=_account_id)
  common.validate_file_path(filename)
  filename = api_utils._quote_filename(filename)
  status, resp_headers, content = api.delete_object(filename)
  errors.check_status(status, [204], filename, resp_headers=resp_headers,
                      body=content)
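# Best-effort cleanup sketch (illustrative only): delete() raises
# NotFoundError for a missing object, so callers that do not care about that
# case can swallow it. The path is a placeholder and the `cloudstorage`
# import name is an assumption.
def _example_delete_if_present(path='/my-bucket/stale-object'):
  import cloudstorage
  try:
    cloudstorage.delete(path)
  except cloudstorage.NotFoundError:
    pass  # already gone; nothing to do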
def testValidatePath(self):
  self.assertRaises(ValueError, common.validate_bucket_path, '/bucke*')
  self.assertRaises(ValueError, common.validate_file_path, None)
  self.assertRaises(ValueError, common.validate_file_path, '/bucketabcd')
  self.assertRaises(TypeError, common.validate_file_path, 1)
  common.validate_file_path('/bucket/file')
  common.validate_file_path('/bucket/dir/dir2/file')
  common.validate_file_path('/bucket/dir/dir2/file' + 'c' * 64)
def compose(list_of_files, destination_file, preserve_order=True,
            content_type=None, retry_params=None, _account_id=None):
    """Internal only!

    Should only be used when the included cloudstorage lib does not contain
    the compose functionality. Runs GCS compose on the input files, merging
    between 2 and 1024 files into one file. Automatically breaks down the
    files into batches of 32. There is an option to sort naturally.

    Args:
      list_of_files: list of dictionaries with the following format:
        {"file_name": REQUIRED name of the file to be merged. Do not include
           the bucket name,
         "Generation": OPTIONAL Used to specify what version of a file to use,
         "IfGenerationMatch": OPTIONAL Used to fail requests if versions
           don't match}
      destination_file: Path to the desired output file. Must include the
        bucket in the path.
      preserve_order: If True, the files are not sorted into natural order.
      content_type: Used to specify the Content-Type header of the output.
        If None, the type is guessed from the first file.
      retry_params: An api_utils.RetryParams for this call to GCS. If None,
        the default one is used.
      _account_id: Internal-use only.

    Raises:
      TypeError: If the dictionary for the file list is malformed.
      ValueError: If the number of files is outside the range of 2-1024.
      errors.NotFoundError: If any element in the file list is missing the
        "file_name" key.
    """

    def _alphanum_key(input_string):
        """Internal use only. Splits the file names up to allow natural sorting."""
        return [int(char) if char.isdigit() else char
                for char in re.split('([0-9]+)', input_string)]

    # pylint: disable=too-many-locals
    def _make_api_call(bucket, file_list, destination_file, content_type,
                       retry_params, _account_id):
        """Internal only. Makes the actual calls.

        Currently stubbed because the dev server cloudstorage_stub.py does not
        handle compose requests.
        TODO: When the dev server gets patched, please remove the stub.

        Args:
          bucket: Bucket where the files are kept.
          file_list: list of dicts with the file name (see compose argument
            "list_of_files" for format).
          destination_file: Path to the destination file.
          content_type: Content type for the destination file.
          retry_params: An api_utils.RetryParams for this call to GCS. If
            None, the default one is used.
          _account_id: Internal-use only.
        """
        if len(file_list) == 0:
            raise ValueError("Unable to merge 0 files")
        if len(file_list) == 1:
            _copy2(bucket + file_list[0]["file_name"], destination_file)
            return

        # Needed until cloudstorage_stub.py is updated to accept compose
        # requests. TODO: When patched, remove the True flow from this if.
        if 'development' in os.environ.get('SERVER_SOFTWARE', '').lower():
            # Below is making the call to the development server.
            with open(destination_file, "w",
                      content_type=content_type) as gcs_merge:
                for source_file in file_list:
                    try:
                        with open(bucket + source_file['file_name'],
                                  "r") as gcs_source:
                            gcs_merge.write(gcs_source.read())
                    except cloud_errors.NotFoundError:
                        logging.warn("File not found %s, skipping",
                                     source_file['file_name'])
        else:
            # Below is making the call to the production server.
            xml = ""
            for item in file_list:
                generation = item.get("Generation", "")
                generation_match = item.get("IfGenerationMatch", "")
                if generation != "":
                    generation = "<Generation>%s</Generation>" % generation
                if generation_match != "":
                    generation_match = ("<IfGenerationMatch>%s"
                                        "</IfGenerationMatch>"
                                        % generation_match)
                xml += ("<Component><Name>%s</Name>%s%s</Component>"
                        % (item["file_name"], generation, generation_match))
            xml = "<ComposeRequest>%s</ComposeRequest>" % xml
            logging.info(xml)
            # pylint: disable=protected-access
            api = cloudstorage.storage_api._get_storage_api(
                retry_params=retry_params, account_id=_account_id)
            headers = {"Content-Type": content_type}
            # pylint: disable=no-member
            status, resp_headers, content = api.put_object(
                cloudstorage.api_utils._quote_filename(destination_file) +
                "?compose",
                payload=xml,
                headers=headers)
            # TODO: confirm whether [200] is sufficient, or if 204 etc. might
            # be returned?
            cloud_errors.check_status(status, [200], destination_file,
                                      resp_headers=resp_headers, body=content)

    # Actual start of the compose call. The above is nested to prevent calls
    # to it directly.
    temp_file_suffix = "____MergeTempFile"
    # Make a copy of the list because the argument is passed by reference.
    file_list = list_of_files[:]
    if not isinstance(file_list, list):
        raise TypeError("file_list must be a list of dictionaries")
    list_len = len(file_list)
    if list_len > 1024:
        raise ValueError(
            "Compose attempted to create composite with too many (%i) "
            "components; limit is (1024)." % list_len)
    if list_len <= 1:
        raise ValueError("Compose operation requires at least two "
                         "components; %i provided." % list_len)

    common.validate_file_path(destination_file)
    bucket = "/" + destination_file.split("/")[1] + "/"
    for source_file in file_list:
        if not isinstance(source_file, dict):
            raise TypeError("Each item of file_list must be a dictionary")
        file_name = source_file.get("file_name", None)
        if file_name is None:
            raise cloud_errors.NotFoundError(
                "Each item in file_list must specify a file_name")
        if file_name.startswith(bucket):
            logging.warn("Detected bucket name at the start of the file; "
                         "must not specify the bucket when listing "
                         "file_names. May cause files to be misread")
        common.validate_file_path(bucket + source_file['file_name'])

    if content_type is None:
        if file_exists(bucket + list_of_files[0]["file_name"]):
            content_type = cloudstorage.stat(
                bucket + list_of_files[0]["file_name"]).content_type
        else:
            logging.warn("Unable to read first file to divine content type, "
                         "using text/plain")
            content_type = "text/plain"

    # Sort naturally if the flag is false.
    if not preserve_order:
        file_list.sort(key=lambda x: _alphanum_key(x['file_name']))

    # Compose can only handle 32 files at a time. Break the list into batches
    # of 32 (this will only need to happen once, since the file_list size
    # restriction is 1024 = 32 * 32).
    temp_list = []  # filenames that store the merged segments of 32
    if len(file_list) > 32:
        temp_file_counter = 0
        segments_list = [file_list[i:i + 32]
                         for i in range(0, len(file_list), 32)]
        file_list = []
        for segment in segments_list:
            temp_file_name = (destination_file + temp_file_suffix +
                              str(temp_file_counter))
            _make_api_call(bucket, segment, temp_file_name, content_type,
                           retry_params, _account_id)
            file_list.append(
                {"file_name": temp_file_name.replace(bucket, "", 1)})
            temp_file_counter += 1
            temp_list.append(temp_file_name)

    # There will always be 32 or fewer files to merge at this point.
    _make_api_call(bucket, file_list, destination_file, content_type,
                   retry_params, _account_id)

    # Grab all temp files that were created while merging the segments of 32,
    # then delete the now-unneeded temporary merge files (if any).
    temp_list = cloudstorage.listbucket(destination_file + temp_file_suffix)
    for item in temp_list:
        try:
            cloudstorage.delete(item.filename)
        except cloud_errors.NotFoundError:
            pass
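# Illustrative call to compose() above. Bucket, file names and the optional
# "Generation"/"IfGenerationMatch" values are placeholders; note that entries
# in list_of_files are bucket-relative while destination_file includes the
# bucket.
def _example_compose_parts():
    parts = [{"file_name": "logs/part-001"},
             {"file_name": "logs/part-002", "Generation": 1},
             {"file_name": "logs/part-003"}]
    compose(parts, "/my-bucket/logs/combined",
            preserve_order=True, content_type="text/plain")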
def _validate_compose_list(destination_file, file_list,
                           files_metadata=None, number_of_files=32):
  """Validates the file_list and merges the file_list, files_metadata.

  Args:
    destination_file: Path to the file
      (i.e. /destination_bucket/destination_file).
    file_list: List of files to compose, see compose for details.
    files_metadata: Meta details for each file in the file_list.
    number_of_files: Maximum number of files allowed in the list.

  Returns:
    A tuple (list_of_files, bucket):
      list_of_files: Ready to use dict version of the list.
      bucket: bucket name extracted from the file paths.
  """
  common.validate_file_path(destination_file)
  bucket = destination_file[0:(destination_file.index('/', 1) + 1)]
  try:
    if isinstance(file_list, types.StringTypes):
      raise TypeError
    list_len = len(file_list)
  except TypeError:
    raise TypeError('file_list must be a list')

  if list_len > number_of_files:
    raise ValueError('Compose attempted to create composite with too many '
                     '(%i) components; limit is (%i).'
                     % (list_len, number_of_files))
  if list_len <= 1:
    raise ValueError('Compose operation requires at '
                     'least two components; %i provided.' % list_len)

  if files_metadata is None:
    files_metadata = []
  elif len(files_metadata) > list_len:
    raise ValueError('files_metadata contains more entries (%i) '
                     'than file_list (%i)'
                     % (len(files_metadata), list_len))

  list_of_files = []
  for source_file, meta_data in itertools.izip_longest(file_list,
                                                       files_metadata):
    if not isinstance(source_file, basestring):
      raise TypeError('Each item of file_list must be a string')
    if source_file.startswith('/'):
      log.warn('Detected a "/" at the start of the file; '
               'unless the file name contains a "/" it '
               'may cause files to be misread')
    if source_file.startswith(bucket):
      log.warn('Detected bucket name at the start of the file; '
               'must not specify the bucket when listing file_names. '
               'May cause files to be misread')
    common.validate_file_path(bucket + source_file)

    list_entry = {}
    if meta_data is not None:
      list_entry.update(meta_data)
    list_entry['Name'] = source_file
    list_of_files.append(list_entry)

  return list_of_files, bucket
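# Worked example for the validator above (hypothetical inputs): file names
# are bucket-relative, the metadata list may be shorter than the file list,
# and the returned entries are the merged dicts keyed by "Name".
def _example_validate():
  files = ['part-001', 'part-002']
  metadata = [{'Generation': 1}]  # fewer entries than files is allowed
  list_of_files, bucket = _validate_compose_list(
      '/my-bucket/combined', files, files_metadata=metadata)
  # list_of_files == [{'Generation': 1, 'Name': 'part-001'},
  #                   {'Name': 'part-002'}]
  # bucket == '/my-bucket/'
  return list_of_files, bucket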