def list(prefix=None, folderize=False, marker=None, max_keys=1000):
    """Retrieve obj_paths of all objects stored on PiCloud.
    Returns a list of keys in lexicographic order.

    * ``prefix``: Return only keys beginning with prefix.
    * ``folderize``: Treat listing as directory based; compact keys containing "/"
        into a single folder (a key is a folder if and only if it ends in "/").
        A folder can then be inspected by setting prefix equal to the folder name.
    * ``marker``: Return only keys where key > marker.
    * ``max_keys``: Maximum number of keys that can be returned (max 1000).
        Fewer may be returned.

    The returned list has an attribute, *truncated*, that indicates whether the
    listing is truncated. To see the next results, make a subsequent list query
    with marker set to list[-1].

    Use *iterlist* to avoid truncation.
    """

    conn = _getcloudnetconnection()

    if max_keys > 1000:
        max_keys = 1000

    resp = conn.send_request(_bucket_list_query,
                             {'prefix': prefix,
                              'delimiter': '/' if folderize else None,
                              'marker': marker,
                              'max_keys': max_keys})
    files = TruncatableList(resp['files'])
    truncated = resp['truncated']

    files.truncated = truncated
    return files
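# Illustrative usage sketch (not part of the original module): walk an
# arbitrarily large bucket listing by following the *truncated* flag with
# ``marker``, as the docstring above describes.  The helper name is hypothetical.
def _example_iter_all_keys(prefix=None):
    """Yield every key under ``prefix``, issuing follow-up queries as needed."""
    keys = list(prefix=prefix)
    for key in keys:
        yield key
    while keys.truncated:
        # resume the listing just after the last key already seen
        keys = list(prefix=prefix, marker=keys[-1])
        for key in keys:
            yield key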
def get_md5(name, log_missing_file_error=True):
    """Return the md5 checksum of the file named ``name`` stored on PiCloud."""
    conn = _getcloudnetconnection()
    resp = conn.send_request(_file_md5_query, {'name': name},
                             log_cloud_excp=log_missing_file_error)
    md5sum = resp['md5sum']
    return md5sum
def exists(name):
    """Check if a file named ``name`` is stored on PiCloud."""
    conn = _getcloudnetconnection()
    resp = conn.send_request(_file_exists_query, {'name': name})
    exists = resp['exists']
    return exists
def delete(name):
    """Deletes the file named ``name`` from PiCloud."""
    conn = _getcloudnetconnection()
    resp = conn.send_request(_file_delete_query, {'name': name})
    deleted = resp['deleted']
    return deleted
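# Illustrative usage sketch (not part of the original module): delete a PiCloud
# file only if it exists and still matches a locally computed checksum.
# ``local_md5`` is supplied by the caller; the helper name is hypothetical.
def _example_delete_if_unchanged(name, local_md5):
    """Delete ``name`` only when it exists and its md5 equals ``local_md5``."""
    if not exists(name):
        return False
    if get_md5(name) != local_md5:
        return False
    return delete(name)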
def getf(obj_path, prefix=None, start_byte=0, end_byte=None):
    """Retrieve the object referenced by ``effective_obj_path`` from PiCloud.

    Return value is a CloudBucketObject (file-like object) that can be read()
    to retrieve the object's contents.

    An optional byte range can be specified with ``start_byte`` and ``end_byte``,
    in which case only the data between ``start_byte`` and ``end_byte`` is
    returned and made accessible to the CloudBucketObject. The returned
    CloudBucketObject.tell() will be initialized to ``start_byte``.

    An ``end_byte`` of None or one exceeding the file size is interpreted as a
    request to retrieve through the end of the file.
    """

    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    conn = _getcloudnetconnection()
    resp = conn.send_request(_bucket_get_query, {'name': full_obj_path})

    ticket = resp['ticket']
    params = resp['params']
    file_size = params['size']

    if not start_byte:
        start_byte = 0

    if file_size and (not end_byte or end_byte > file_size):
        end_byte = file_size

    cloud_file = CloudBucketObject(params['action'], ticket, file_size,
                                   start_byte, end_byte)

    return cloud_file
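# Illustrative usage sketch (not part of the original module): fetch only the
# first kilobyte of a large bucket object by passing a byte range, per the
# docstring above.  The helper name and byte count are arbitrary.
def _example_read_header_bytes(obj_path, prefix=None, num_bytes=1024):
    """Return roughly the first ``num_bytes`` bytes of the bucket object."""
    obj = getf(obj_path, prefix=prefix, start_byte=0, end_byte=num_bytes)
    return obj.read()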
def __connect(self):
    """Connect to S3."""
    if self.__http_response:
        self.__http_response.close()

    self.__ticket['Range'] = 'bytes=%s-%s' % (self.__pos, self.__end_byte)
    conn = _getcloudnetconnection()
    self.__http_response = _aws_retryable_post(conn, self.__action, None,
                                               self.__ticket)
def _file_info(name):
    """Get information about ``name``."""
    conn = _getcloudnetconnection()
    resp = conn.send_request(_file_exists_query, {'name': name})
    return resp
def public_url_folder():
    """Return the HTTP path that begins all your public bucket URLs.

    e.g. object 'foo' (if is_public) will be found at public_url_folder() + foo
    """
    conn = _getcloudnetconnection()
    resp = conn.send_request(_bucket_public_url_folder_query, {})
    return S3_URL + resp['url']
def exists(obj_path, prefix=None):
    """Return a boolean indicating if the PiCloud bucket object named
    ``effective_obj_path`` exists.
    """
    conn = _getcloudnetconnection()
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_exists_query, {'name': full_obj_path})
    exists = resp['exists']
    return exists
def list():
    """List all files stored on PiCloud."""
    conn = _getcloudnetconnection()
    resp = conn.send_request(_file_list_query, {})
    files = resp['files']
    return files
def create_key(username, password):
    """Creates a new api_key.

    *username* and *password* should be your PiCloud login information.
    """
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_create, {}, auth=(username, password))
    return resp['key']
def deactivate_key(username, password, api_key):
    """Deactivates the specified *api_key*.

    *username* and *password* should be your PiCloud login information.
    """
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_deactivate % api_key, {},
                             auth=(username, password))
    return True
def release(request_id):
    """Release the realtime core request associated with *request_id*.

    Request must have been satisfied to terminate.
    """
    try:
        int(request_id)
    except ValueError:
        raise TypeError('release_rt_cores requires a numeric request_id')

    conn = cloud._getcloudnetconnection()
    conn.send_request(_release_query, {'rid': str(request_id)})
def _send_request(request_url, data, jsonize_values=True):
    """Makes a cloud request and returns the results.

    * request_url: where the request should be sent
    * data: dictionary of post values relevant to the request
    * jsonize_values: if True (default), the values of the *data* dictionary
        are jsonized before the request is made.
    """
    if jsonize_values:
        data = _jsonize_values(data)
    conn = cloud._getcloudnetconnection()
    return conn.send_request(request_url, data)
def get_key(username, password, api_key):
    """Returns information including api_secretkey, active status, and note
    for the specified *api_key*.

    *username* and *password* should be your PiCloud login information.
    """
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_get % api_key, {}, auth=(username, password))
    return resp['key']
def list_keys(username, password, active_only=False):
    """Returns a list of all api keys. If *active_only* is True, only active
    keys are returned.

    *username* and *password* should be your PiCloud login information.
    """
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_list, {},
                             get_values={'active_only': active_only},
                             auth=(username, password))
    return resp['api_keys']
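# Illustrative usage sketch (not part of the original module): rotate
# credentials by creating a fresh api_key and deactivating every other active
# key.  This assumes list_keys() returns plain api_key identifiers; the helper
# name is hypothetical.
def _example_rotate_api_keys(username, password):
    """Create a new key, then deactivate all previously active keys."""
    new_key = create_key(username, password)
    for api_key in list_keys(username, password, active_only=True):
        if api_key != new_key:  # keep the key we just created
            deactivate_key(username, password, api_key)
    return new_key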
def _get_md5(obj_path, prefix=None, log_missing_file_error=True):
    conn = _getcloudnetconnection()
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_md5_query, {'name': full_obj_path},
                             log_cloud_excp=log_missing_file_error)
    md5sum = resp['md5sum']
    if '-' in md5sum:  # multipart; can't rely on md5
        return None
    return md5sum
def get_key_by_key(api_key, api_secretkey):
    """Similar to *get_key*, but access information via api_key credentials
    (api_key and api_secretkey).
    """
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_get % api_key, {},
                             auth=(api_key, api_secretkey))
    return resp['key']
def _putf(f, obj_path, prefix=None, content_type=None, content_encoding=None):
    """Helper for putf. Accepts arbitrary content_type and content_encoding."""

    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    fsize = 0  # file size; may not be computable

    if isinstance(f, basestring):
        from cStringIO import StringIO
        f = StringIO(f)

    conn = _getcloudnetconnection()

    try:
        hex_md5, content_md5, fsize = _compute_md5(f)
    except IOError:
        raise IOError('File object is not seekable. Cannot transmit')

    if fsize > 5000000000:
        raise ValueError('Cannot store bucket objects larger than 5GB on cloud.bucket')
    if fsize == 0:
        raise ValueError('Cannot store empty bucket objects')

    try:
        cloudLog.debug('bucket object obj_path in client: %s' % full_obj_path)

        # get a file ticket
        resp = conn.send_request(_bucket_new_query,
                                 {'name': full_obj_path,
                                  'content-type': content_type,
                                  'content-encoding': content_encoding,
                                  'hex-md5': hex_md5})

        ticket = resp['ticket']
        params = resp['params']
        url = params['action']

        # update ticket
        ticket['file'] = f
        if content_md5:
            ticket['Content-MD5'] = content_md5

        resp = _aws_retryable_post(conn, url, ticket)
        resp.read()
    finally:
        f.close()
def putf(f, name):
    """Similar to put. putf, however, accepts a file object (file, StringIO,
    etc.) ``f`` instead of a file_path.

    .. note::
        ``f`` is not rewound. f.read() from the current position will be placed
        on PiCloud.

    .. warning::
        If the file object does not correspond to an actual file on disk, it
        will be read entirely into memory before being transferred to PiCloud.
    """

    if '../..' in name:
        raise ValueError('"../.." cannot be in name')

    fsize = 0  # file size; may not be computable

    if isinstance(f, basestring):
        fsize = len(f)
        from cStringIO import StringIO
        f = StringIO(f)
    else:
        try:
            fsize = os.fstat(f.fileno()).st_size
        except (AttributeError, OSError):
            pass

    if fsize > 5000000000:
        raise ValueError('Cannot store files larger than 5GB on cloud.files')

    conn = _getcloudnetconnection()

    try:
        # get a file ticket
        resp = conn.send_request(_file_new_query, {'name': name})
        ticket = resp['ticket']
        params = resp['params']
        url = params['action']

        # set file in ticket
        ticket['file'] = f

        # post file using information in ticket
        ticket['key'] = str(ticket['key'])
        resp = _aws_retryable_post(conn, url, ticket)
        resp.read()
    finally:
        f.close()
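# Illustrative usage sketch (not part of the original module): store an
# in-memory string on PiCloud.  putf also accepts a plain string directly; an
# explicit StringIO makes the "read from the current position" behaviour
# visible.  The helper name is hypothetical.
def _example_put_string(name, text):
    """Store ``text`` on PiCloud under ``name``."""
    from cStringIO import StringIO
    putf(StringIO(text), name)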
def is_public(obj_path, prefix=None):
    """Determine if the PiCloud bucket object ``effective_obj_path`` is
    publicly accessible by a URL.

    Return the public URL if it is; otherwise False.
    """
    conn = _getcloudnetconnection()
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_is_public_query, {'name': full_obj_path})
    if resp['status']:
        public_url = S3_URL + resp['url']
        return public_url
    else:
        return resp['status']
def change_max_duration(request_id, new_max_duration=None):
    try:
        int(request_id)
    except ValueError:
        raise TypeError('change_max_duration requires a numeric request_id')

    if new_max_duration is not None:
        if not isinstance(new_max_duration, (int, long)):
            raise TypeError('Optional parameter new_max_duration should be an integer value > 0')
        if new_max_duration <= 0:
            raise TypeError('Optional parameter new_max_duration should be an integer value > 0')

    conn = cloud._getcloudnetconnection()
    conn.send_request(_change_max_duration_query,
                      {'rid': str(request_id), 'cap_duration': new_max_duration})
def info(obj_path, prefix=None):
    """Return information about the PiCloud bucket object ``effective_obj_path``.

    Information includes size, created time, last modified time, md5sum,
    public URL (if any), and any headers set with ``make_public``.
    """
    conn = _getcloudnetconnection()
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_info_query, {'name': full_obj_path})
    del resp['data']
    if 'url' in resp:
        resp['url'] = S3_URL + resp['url']
    return resp
def make_public(obj_path, prefix=None, headers={}, reset_headers=False):
    """Makes the PiCloud bucket object ``effective_obj_path`` publicly
    accessible by a URL. Returns the public URL.

    Additionally, you can control the HTTP headers that will be in the response
    to a request for the URL with the ``headers`` dictionary.

    Possible standard HTTP headers are:

    * content-type
    * content-encoding
    * content-disposition
    * cache-control

    All other headers are considered custom and will have x-amz-meta- prepended
    to them.

    Example::

        make_public('foo', headers={'content-type': 'text/x-python',
                                    'purpose': 'basic_script'})

    might return \https://s3.amazonaws.com/pi-user-buckets/ddasy/foo

    The headers in the response to a request for
    \https://s3.amazonaws.com/pi-user-buckets/ddasy/foo will include:

    * content-type: text/x-python
    * x-amz-meta-purpose: basic_script

    Clear all custom headers, other than content-type and content-encoding, by
    setting ``reset_headers`` to True.

    .. note::
        Default content-type and content-encoding are inferred during the
        original cloud.bucket.put(..) call from the ``file_path`` and
        ``obj_path``.
    """
    conn = _getcloudnetconnection()
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    post_values = {'name': full_obj_path,
                   'reset_headers': reset_headers}
    for key, val in headers.items():
        try:
            post_values['bh_' + key] = val.decode('ascii').encode('ascii')
        except (UnicodeDecodeError, UnicodeEncodeError):
            raise TypeError('header values must be ASCII strings')

    resp = conn.send_request(_bucket_make_public_query, post_values)
    public_url = S3_URL + resp['url']
    return public_url
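# Illustrative usage sketch (not part of the original module): publish a script
# with an explicit content-type plus one custom header, then confirm the result
# via info().  Object and header names here are only examples; 'purpose' will
# be exposed as x-amz-meta-purpose, per the docstring above.
def _example_publish_script(obj_path):
    """Make ``obj_path`` public and return (public_url, object metadata)."""
    url = make_public(obj_path,
                      headers={'content-type': 'text/x-python',
                               'purpose': 'basic_script'})
    return url, info(obj_path)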
def request(type, cores, max_duration=None):
    """Request a number of *cores* of a certain compute resource *type*.

    Returns a dictionary describing the newly created realtime request, with
    the same format as the requests returned by list_rt_cores.

    If specified, the request will terminate after being active for
    *max_duration* hours.
    """
    if max_duration is not None:
        if not isinstance(max_duration, (int, long)):
            raise TypeError('Optional parameter max_duration should be an integer value > 0')
        if max_duration <= 0:
            raise TypeError('Optional parameter max_duration should be an integer value > 0')

    conn = cloud._getcloudnetconnection()

    return fix_time_element(conn.send_request(_request_query,
                                              {'cores': cores,
                                               'type': type,
                                               'cap_duration': max_duration if max_duration else 0}),
                            'start_time')
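# Illustrative usage sketch (not part of the original module): request realtime
# cores and poll list() until the request is satisfied (start_time becomes
# non-None).  It assumes the returned dictionary carries the 'request_id' and
# 'start_time' keys described in the list() docstring below; the polling
# interval and helper name are arbitrary.
def _example_request_and_wait(type, cores, poll_interval=10):
    """Block until the realtime request is active, then return its descriptor."""
    import time
    req = request(type, cores)
    while not req['start_time']:
        time.sleep(poll_interval)
        req = list(req['request_id'])[0]
    return req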
def list(request_id=""): """Returns a list of dictionaries describing realtime core requests. If *request_id* is specified, only show realtime core request with that request_id The keys within each returned dictionary are: * request_id: numeric ID associated with the request * type: Type of computation resource this request grants * cores: Number of (type) cores this request grants * start_time: Time when real time request was satisfied; None if still pending""" if request_id != "": try: int(request_id) except ValueError: raise TypeError('Optional parameter to list_rt_cores must be a numeric request_id') conn = cloud._getcloudnetconnection() rt_list = conn.send_request(_list_query, {'rid': str(request_id)}) return [fix_time_element(rt,'start_time') for rt in rt_list['requests']]
def getf(name, start_byte=0, end_byte=None):
    """Retrieves the file named ``name`` from PiCloud.

    Return value is a CloudFile (file-like object) that can be read() to
    retrieve the file's contents.

    A range can be specified through *start_byte* and *end_byte*, in which case
    only the data between those two offsets will be accessible in the CloudFile.
    If start_byte is set, the returned CloudFile.tell() will be start_byte.

    An end_byte of None or one exceeding the file size is interpreted as end of
    file.
    """
    conn = _getcloudnetconnection()
    resp = conn.send_request(_file_get_query, {'name': name})

    ticket = resp['ticket']
    params = resp['params']
    file_size = params['size']

    if not start_byte:
        start_byte = 0

    if file_size and (not end_byte or end_byte > file_size):
        end_byte = file_size

    if not isinstance(start_byte, (int, long)):
        raise TypeError('start_byte must be an integer')
    if end_byte and not isinstance(end_byte, (int, long)):
        raise TypeError('end_byte must be an integer')

    if end_byte:
        ticket['Range'] = 'bytes=%s-%s' % (start_byte, end_byte)

    resp = _aws_retryable_post(conn, params['action'], None, ticket)

    cloud_file = CloudFile(resp, file_size, start_byte, end_byte)

    return cloud_file
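# Illustrative usage sketch (not part of the original module): stream a PiCloud
# file to local disk in fixed-size pieces instead of reading it into memory at
# once.  It assumes the file-like CloudFile supports read(size); the helper
# name and chunk size are arbitrary.
def _example_download(name, local_path, chunk_size=64 * 1024):
    """Copy the PiCloud file ``name`` to ``local_path``."""
    cloud_file = getf(name)
    with open(local_path, 'wb') as out:
        while True:
            chunk = cloud_file.read(chunk_size)
            if not chunk:
                break
            out.write(chunk)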
def remove(obj_paths, prefix=None):
    """Removes object(s) named ``effective_obj_paths`` from the PiCloud bucket.

    obj_paths can be a single object path or a list of object paths.
    """
    conn = _getcloudnetconnection()
    if not hasattr(obj_paths, '__iter__'):
        obj_paths = [obj_paths]

    obj_paths_iter = obj_paths.__iter__()
    removed = False

    # issue removal requests in batches of at most 1000 names
    while True:
        paths_to_remove = builtin_list(islice(obj_paths_iter, 1000))
        if not paths_to_remove:
            break

        full_obj_paths = [_get_effective_obj_path(obj_path, prefix)
                          for obj_path in paths_to_remove]
        resp = conn.send_request(_bucket_remove_query, {'name': full_obj_paths})
        removed = resp['removed']

    return removed
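# Illustrative usage sketch (not part of the original module): delete every
# object under a bucket prefix by combining list() with remove().  Destructive;
# the helper name is hypothetical.
def _example_remove_prefix(prefix):
    """Remove every bucket object whose key starts with ``prefix``."""
    while True:
        keys = list(prefix=prefix)
        if not keys:
            return
        remove(keys)
        if not keys.truncated:
            return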
def map(name, mapper, chunk_size=None, record_reader=None, combiner=None,
        reducer=None, **kwargs):
    """With map, you can process a file stored in cloud.files in parallel.

    The parallelism is achieved by dividing the file specified by *name* into
    chunks of size *chunk_size* (bytes). Each chunk is assigned a sub job. The
    sub job in turn processes just that chunk, allowing for the entire file to
    be processed by as many cores in parallel as there are chunks. We will call
    this type of sub job a "mapper sub job".

    If chunk_size is None, it will be automatically set to 1/10th of the size
    of the file.

    Map will return a single job identifier (jid). The sub jobs that comprise
    it do not have identifiers and, therefore, cannot be accessed directly.
    cloud.info(jid), however, will show you information for relevant sub jobs.

    By default, each chunk is split into records (of 0 or more characters)
    using newlines as delimiters. If *record_reader* is specified as a string,
    each chunk is split into records using that as the delimiter. In the event
    a record spans across two chunks, it is guaranteed that a mapper will only
    be called once on the full record. In other words, we've made sure it
    works correctly.

    *mapper* is a function that takes a single argument, a record, and should
    return an iterable of values (a generator). In the simplest case, it can
    return a generator that yields only one value. Example::

        def mapper(record):
            yield record

    When no *combiner* or *reducer* is specified, the return value of the
    cloud.files.map job will be roughly equivalent to::

        map(mapper, record_reader(file_contents))

    A *reducer* is a function that takes in an iterable of values and returns
    an iterable of values. The iterable parameter iterates through all the
    values returned by all the mapper(record) calls. When the reducer is
    specified, *reducer* will result in the creation of one additional sub
    job. The reducer sub job grabs the results of each mapper sub job
    (iterators), combines them into a single iterator, and then passes that
    iterator into your *reducer* function. The return value of the
    cloud.files.map job will be the iterator returned by the *reducer*.

    A *combiner*, like a *reducer*, takes in an iterable of values and returns
    an iterable of values. The difference is that the *combiner* is run in each
    mapper sub job, and each one only takes in values that were produced from
    the associated chunk. If a *reducer* is also specified, then the reducer
    sub job grabs the results of each *combiner* run in each mapper sub job.
    See the sketch after this function for a worked example.

    Example for counting the number of words in a document::

        def wordcount_mapper(record):
            yield len(record.split(' '))

        def wordcount_reducer(wordcounts):
            yield sum(wordcounts)

        jid = cloud.files.map('example_document', wordcount_mapper,
                              reducer=wordcount_reducer)

    Result::

        cloud.result(jid)
        >> [# of words]

    For advanced users, *record_reader* can also be specified as a function
    that takes in a file-like object (has methods read(), tell(), and seek())
    and the end_byte for the current chunk. The *record_reader* should return
    an iterable of records. See default_record_reader for an example.

    Additional information exists on our blog and online documentation.

    Reserved special *kwargs* (see docs for details):

    * _cores: Set number of cores your job will utilize.
        See http://blog.picloud.com/2012/08/31/introducing-multicore-support/
        In addition to having access to more cores, you will have
        _cores*RAM[_type] where _type is the _type you select.
        Possible values depend on what _type you choose:

        * 'c1': 1
        * 'c2': 1, 2, 4, 8
        * 'f2': 1, 2, 4, 8, 16
        * 'm1': 1, 2
        * 's1': 1
    * _depends_on: An iterable of jids that represents all jobs that must
        complete successfully before any jobs created by this map function
        may be run.
    * _depends_on_errors: A string specifying how an error with a jid listed
        in _depends_on should be handled:

        * 'abort': Set this job to 'stalled' (Default)
        * 'ignore': Treat an error as satisfying the dependency
    * _env: A string specifying a custom environment you wish to run your jobs
        within. See the environments overview at
        http://blog.picloud.com/2011/09/26/introducing-environments-run-anything-on-picloud/
    * _fast_serialization: This keyword can be used to speed up serialization,
        at the cost of some functionality. This affects the serialization of
        both the map arguments and return values. The map function will always
        be serialized by the enhanced serializer, with debugging features.
        Possible values are:

        0. default -- use cloud module's enhanced serialization and debugging info
        1. no debug -- Disable all debugging features for arguments
        2. use cPickle -- Use python's fast serializer, possibly causing PicklingErrors
    * _kill_process: Terminate the Python interpreter *func* runs in after
        *func* completes, preventing the interpreter from being used by
        subsequent jobs. See Technical Overview for more info.
    * _label: A user-defined string label that is attached to the created jobs.
        Labels can be used to filter when viewing jobs interactively (i.e. on
        the PiCloud website).
    * _max_runtime: Specify the maximum amount of time (in integer minutes) a
        job can run. If the job runs beyond this time, it will be killed.
    * _priority: A positive integer denoting the job's priority. PiCloud tries
        to run jobs with lower priority numbers before jobs with higher
        priority numbers.
    * _profile: Set this to True to enable profiling of your code. Profiling
        information is valuable for debugging, but may slow down your job.
    * _restartable: In the very rare event of hardware failure, this flag
        indicates that the job can be restarted if the failure happened in the
        middle of the job. By default, this is true. This should be unset if
        the job has external state (e.g. it modifies a database entry).
    * _type: Select the type of compute resources to use. PiCloud supports
        four types, specified as strings:

        * 'c1': 1 compute unit, 300 MB ram, low I/O (default)
        * 'c2': 2.5 compute units, 800 MB ram, medium I/O
        * 'm1': 3.25 compute units, 8 GB ram, high I/O
        * 's1': variable compute units (2 cu max), 300 MB ram, low I/O, 1 IP per core

        See http://www.picloud.com/pricing/ for pricing information.
    """

    cloud_obj = _getcloud()
    params = cloud_obj._getJobParameters(mapper, kwargs)  # takes care of kwargs

    file_details = _file_info(name)
    if not file_details['exists']:
        raise ValueError('file does not exist on the cloud, or is not yet ready to be accessed')
    file_size = int(file_details['size'])

    params['file_name'] = name

    # chunk_size
    if chunk_size is not None:
        if not isinstance(chunk_size, (int, long)):
            raise Exception('the chunk_size should be a non zero integer value')
        if chunk_size == 0:
            raise Exception('the chunk_size should be a non zero integer value')
        params['chunk_size'] = chunk_size

    # mapper
    _validate_arguments(mapper, 'mapper')

    # record_reader
    if not record_reader:
        record_reader = default_record_reader('\n')
    elif isinstance(record_reader, basestring):
        record_reader = default_record_reader(record_reader)
    else:
        _validate_rr_arguments(record_reader, 'record_reader')

    # combiner
    if not combiner:
        def combiner(it):
            for x in it:
                yield x
    else:
        _validate_arguments(combiner, 'combiner')

    func_to_be_sent = _mapper_combiner_wrapper(mapper, name, file_size,
                                               record_reader, combiner)

    sfunc, sarg, logprefix, logcnt = cloud_obj.adapter.cloud_serialize(
        func_to_be_sent, params['fast_serialization'], [], logprefix='mapreduce.')

    data = Packer()
    data.add(sfunc)
    params['data'] = data.finish()

    # validate reducer & serialize reducer
    if reducer:
        _validate_arguments(reducer, 'reducer')
        reducer = _reducer_wrapper(reducer)
        s_reducer, red_sarg, red_logprefix, red_logcnt = cloud_obj.adapter.cloud_serialize(
            reducer, params['fast_serialization'], [], logprefix='mapreduce.reducer.')
        data_red = Packer()
        data_red.add(s_reducer)
        params['data_red'] = data_red.finish()

    conn = _getcloudnetconnection()

    conn._update_params(params)

    cloud_obj.adapter.dep_snapshot()

    resp = conn.send_request(_filemap_job_query, params)

    return resp['jids']
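# Illustrative usage sketch (not part of the original module): the word-count
# example from the docstring above, extended with a combiner so that each
# mapper sub job pre-sums the counts from its own chunk before the reducer
# runs.  'example_document' is a placeholder file name.
def _example_wordcount_with_combiner():
    def wordcount_mapper(record):
        yield len(record.split(' '))

    def wordcount_combiner(counts):
        # runs inside each mapper sub job, over that chunk's values only
        yield sum(counts)

    def wordcount_reducer(counts):
        # runs once, over the combined output of every combiner
        yield sum(counts)

    return map('example_document', wordcount_mapper,
               combiner=wordcount_combiner, reducer=wordcount_reducer)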
def sync(source, dest, delete=False):
    """Syncs data between a cloud volume and the local filesystem.

    Either *source* or *dest* should specify a cloud volume path, but not
    both. A cloud volume path is of the format:

        volume_name:[path-within-volume]

    where path-within-volume cannot be an absolute path (there is no concept
    of the root of the filesystem in a volume: all path specifications are
    relative to the top level of the volume). Note that the colon is what
    indicates this is a volume path specification. Local paths should point to
    a local directory or file. If the local path is a directory, whether the
    directory itself or the contents of the directory are synced depends on
    the presence of a trailing slash. A trailing slash indicates that the
    contents should be synced, while its absence would lead to the directory
    itself being synced to the volume. *source* can be a list of paths, all of
    which should either be local paths, or volume paths in the same cloud
    volume.

    Example::

        sync('~/dataset1', 'myvolume1:')

    will ensure that a directory named 'dataset1' exists at the top level of
    the cloud volume 'myvolume1', and contains all the contents of 'dataset1'.
    On the other hand::

        sync('~/dataset1/', 'myvolume1:')

    will copy all the contents of 'dataset1' to the top level of 'myvolume1'.
    This behavior mirrors the file-copying tool 'rsync'.

    If *delete* is True, files that exist in *dest* but not in *source* will
    be deleted. By default, such files will not be removed.
    """
    conn = cloud._getcloudnetconnection()
    retry_attempts = conn.retry_attempts
    dest_is_local = common.is_local_path(dest)
    l_paths, r_paths = (dest, source) if dest_is_local else (source, dest)
    local_paths = common.parse_local_paths(l_paths)
    vol_name, vol_paths = common.parse_remote_paths(r_paths)
    for vol_path in vol_paths:
        if os.path.isabs(vol_path):
            raise cloud.CloudException('Volume path cannot be absolute')

    # acquire syncslot and syncserver info to complete the real remote paths
    success = release = False
    exit_code = -1
    syncserver, syncslot = _acquire_syncslot(vol_name)

    try:
        cloudLog.debug('Acquired syncslot %s on server %s', syncslot, syncserver)
        r_base = '%s@%s:volume/' % (syncslot, syncserver)
        r_paths = ' '.join(['%s%s' % (r_base, v_path) for v_path in vol_paths])
        l_paths = ' '.join(local_paths)
        sync_args = (r_paths, l_paths) if dest_is_local else (l_paths, r_paths)

        for attempt in xrange(retry_attempts):
            exit_code, stdout, stderr = common.rsync_session(*sync_args, delete=delete)
            if not exit_code:
                break
            cloudLog.error('sync attempt failed:\n%s', stderr)
            print_stdout(str(stderr))
            print_stdout('Retrying volume sync...')
        else:
            raise Exception('sync failed multiple attempts... '
                            'Please contact PiCloud support')
    except KeyboardInterrupt:
        cloudLog.error('Sync interrupted by keyboard')
        print 'Sync interrupted by keyboard'
    except Exception as e:
        cloudLog.error('Sync errored with:\n%s', e)
        print e
    finally:
        print_stdout('Cleanup...')
        success = not exit_code
        release = success and not dest_is_local
        _send_vol_request('sync_terminate', {'name': vol_name,
                                             'syncslot': syncslot,
                                             'syncserver': syncserver,
                                             'release': release})

    if release:
        print_stdout('Ensuring redundancy...')
        _wait_for_release(vol_name)

    if success:
        print_stdout('Sync successfully completed.')
    else:
        raise cloud.CloudException('Volume sync failed with error code %s. '
                                   'See cloud.log' % exit_code)
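# Illustrative usage sketch (not part of the original module): push a local
# dataset into a cloud volume, then later mirror the volume back into a local
# directory.  Volume and directory names are placeholders, and pulling from a
# bare 'volume_name:' path as the source is assumed to be symmetric with the
# documented push direction.
def _example_roundtrip_sync():
    # push the *contents* of ~/dataset1 to the top level of 'myvolume1'
    # (trailing slash, per the docstring above)
    sync('~/dataset1/', 'myvolume1:')
    # mirror the volume back into a local directory, pruning files that were
    # deleted remotely
    sync('myvolume1:', '~/dataset1_copy/', delete=True)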