Example #1
def list(prefix=None, folderize=False, marker=None, max_keys=1000):
    """
    Retrieve obj_paths of all objects stored on PiCloud. Returns a list of keys in 
    lexicographic order. 
        
    * ``prefix``: Return only keys beginning with prefix. 
    * ``folderize``: Treat listing as directory based; compact keys containing "/" into a single folder
        (a key is a folder if and only if it ends in "/")  
        A folder can then be inspected by setting prefix equal to the folder name
    * ``marker``: Return only keys where key > marker
    * ``max_keys``: Maximum number of keys to return (max 1000). Fewer may be returned
     
    The list will have an attribute, *truncated*, that indicates if the listing is truncated.
    To see the next results, make a subsequent list query with marker set to list[-1]
    
    Use *iterlist* to avoid truncation 
    """
    
    conn = _getcloudnetconnection()
    
    if max_keys > 1000:
        max_keys = 1000

    resp = conn.send_request(_bucket_list_query, {'prefix': prefix,
                                                  'delimiter': '/' if folderize else None,
                                                  'marker': marker,
                                                  'max_keys': max_keys})
    
    files = TruncatableList(resp['files'])
    truncated = resp['truncated']

    files.truncated = truncated
    return files
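A paged listing can be driven by the *truncated* attribute and *marker*. The following is a hypothetical usage sketch; it assumes this function is exposed as cloud.bucket.list (the module path is not shown above):

from cloud import bucket   # assumed module path

def all_keys(prefix=None):
    """Collect every key under prefix by following truncated listings."""
    keys = []
    marker = None
    while True:
        page = bucket.list(prefix=prefix, marker=marker, max_keys=1000)
        keys.extend(page)
        if not page.truncated:
            break
        marker = page[-1]   # resume the next query after the last returned key
    return keys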
Example #2
def get_md5(name, log_missing_file_error = True):
    """Return the md5 checksum of the file named ``name`` stored on PiCloud"""
    conn = _getcloudnetconnection()
    resp = conn.send_request(_file_md5_query, {'name': name},
                             log_cloud_excp = log_missing_file_error)
    md5sum = resp['md5sum']
    return md5sum
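One common use of the checksum is verifying a local copy against the stored file. A minimal sketch, assuming get_md5 is importable from the cloud.files module (an assumption; the snippet above does not show its module):

import hashlib
from cloud.files import get_md5   # assumed import path

def matches_remote(local_path, remote_name):
    """Compare a local file's md5 hexdigest with the checksum PiCloud reports."""
    md5 = hashlib.md5()
    with open(local_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            md5.update(chunk)
    return md5.hexdigest() == get_md5(remote_name)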
Example #3
def exists(name):
    """Check if a file named ``name`` is stored on PiCloud."""
    conn = _getcloudnetconnection()
        
    resp = conn.send_request(_file_exists_query, {'name': name})
    exists = resp['exists']
    return exists
Example #4
def delete(name):
    """Deletes the file named ``name`` from PiCloud."""
    conn = _getcloudnetconnection()

    resp = conn.send_request(_file_delete_query, {'name': name})
    deleted = resp['deleted']
    return deleted
Example #5
def getf(obj_path, prefix=None, start_byte=0, end_byte=None):
    """
    Retrieve the object referenced by ``effective_obj_path`` from PiCloud.
    Return value is a CloudBucketObject (file-like object) that can be read() to 
    retrieve the object's contents 

    An optional byte_range can be specified using ``start_byte`` and ``end_byte``, 
    where only the data between ``start_byte`` and ``end_byte`` is returned and made 
    accessible to the CloudBucketObject.  The returned CloudBucketObject.tell() will be
    initialized to ``start_byte``.
    
    An ``end_byte`` of None or exceeding file size is interpreted as a request to retrieve to end of file.
    """    
    
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    conn = _getcloudnetconnection()

    resp = conn.send_request(_bucket_get_query, {'name': full_obj_path})
    
    ticket = resp['ticket']
    params = resp['params']
    file_size = params['size']
    
    if not start_byte:
        start_byte = 0
        
    if file_size and (not end_byte or end_byte > file_size):
        end_byte = file_size

    cloud_file = CloudBucketObject( params['action'], ticket, file_size, start_byte, end_byte )
    
    return cloud_file
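Because the return value is file-like, a byte range can be read without fetching the whole object. An illustrative sketch (assumes the function above is exposed as cloud.bucket.getf; 'logs/app.log' is a made-up obj_path):

from cloud import bucket   # assumed module path

obj = bucket.getf('logs/app.log', start_byte=0, end_byte=1024)
print(obj.tell())            # per the docstring, starts at start_byte (0 here)
first_kb = obj.read()        # only the requested byte range is available
print('read %d bytes' % len(first_kb))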
Example #6
    def __connect(self):
        """Connect to S3"""
        if self.__http_response:
            self.__http_response.close()
        self.__ticket['Range'] = 'bytes=%s-%s' % (self.__pos, self.__end_byte)
        conn = _getcloudnetconnection()
        self.__http_response = _aws_retryable_post(conn, self.__action, None, self.__ticket)
Example #7
def _file_info(name):
    """
    get information about name
    """
    conn = _getcloudnetconnection()
    
    resp = conn.send_request(_file_exists_query, {'name':name})
    return resp
Example #8
def public_url_folder():
    """Return HTTP path that begins all your public bucket URLs.
    e.g. object 'foo' (if is_public) will be found at
        public_url_folder() + foo
    """
    conn = _getcloudnetconnection()
    resp = conn.send_request(_bucket_public_url_folder_query, {})
    return S3_URL+resp['url']
Example #9
def exists(obj_path, prefix=None):
    """Return boolean indicating if PiCloud bucket object named ``effective_obj_path`` exists"""
    conn = _getcloudnetconnection()
        
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_exists_query, {'name': full_obj_path})
    exists = resp['exists']
    return exists
Example #10
def list():
    """List all files stored on PiCloud."""
    
    conn = _getcloudnetconnection()

    resp = conn.send_request(_file_list_query, {})
    files = resp['files']

    return files
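The cloud.files helpers shown earlier (list, exists, delete) combine into simple housekeeping code. A hypothetical sketch; 'old-report.csv' is a made-up name and the module path is assumed:

from cloud import files   # assumed module path

print(files.list())                      # every file name stored on PiCloud
if files.exists('old-report.csv'):
    if files.delete('old-report.csv'):   # returns the server's 'deleted' flag
        print('removed old-report.csv')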
Example #11
def create_key(username, password):
    """Creates a new api_key. *username* and *password*
    should be your PiCloud login information."""
    
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_create,
                             {},
                             auth=(username, password))
    
    return resp['key']
Example #12
def deactivate_key(username, password, api_key):
    """Deactivates the specified *api_key*. *username* and *password*
    should be your PiCloud login information."""
    
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_deactivate % api_key,
                             {},
                             auth=(username, password))
    
    return True
Example #13
def release(request_id):
    """Release the realtime core request associated with *request_id*. 
    Request must have been satisfied to terminate."""
    
    try:
        int(request_id)
    except ValueError:
        raise TypeError('release_rt_cores requires a numeric request_id')
    
    conn = cloud._getcloudnetconnection()
    conn.send_request(_release_query, {'rid': str(request_id)})
Example #14
def _send_request(request_url, data, jsonize_values=True):
    """Makes a cloud request and returns the results.
    
    * request_url: where the request should be sent
    * data: dictionary of post values relevant to the request
    * jsonize_values: if True (default), then the values of the *data*
        dictionary are jsonized before request is made."""
    if jsonize_values:
        data = _jsonize_values(data)
    conn = cloud._getcloudnetconnection()
    return conn.send_request(request_url, data)
Example #15
def get_key(username, password, api_key):
    """Returns information including api_secretkey, active status, and
    note for the specified *api_key*. *username* and *password* should
    be your PiCloud login information."""
    
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_get % api_key,
                             {},
                             auth=(username, password))
    
    return resp['key']
Example #16
def list_keys(username, password, active_only=False):
    """Returns a list of all api keys. If *active_only* is True, only
    active keys are returned. *username* and *password* should be your
    PiCloud login information."""
    
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_list,
                             {},
                             get_values={'active_only': active_only},
                             auth=(username, password))
    
    return resp['api_keys']
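Together with create_key and get_key above, list_keys supports simple key inspection. The sketch below is hypothetical: the module path, the shape of the returned values, and the credentials are all assumptions.

# USERNAME and PASSWORD are placeholders for your PiCloud login information.
USERNAME, PASSWORD = 'me@example.com', 'not-a-real-password'

new_key = create_key(USERNAME, PASSWORD)           # resp['key'] from the server
print(new_key)

for api_key in list_keys(USERNAME, PASSWORD, active_only=True):
    info = get_key(USERNAME, PASSWORD, api_key)    # api_secretkey, active status, note
    print('%s: %s' % (api_key, info))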
Example #17
def _get_md5(obj_path, prefix=None, log_missing_file_error = True):
    conn = _getcloudnetconnection()
    
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_md5_query, {'name': full_obj_path},
                             log_cloud_excp = log_missing_file_error)
    md5sum = resp['md5sum']
    
    if '-' in md5sum: # multipart; can't rely on md5
        return None
    
    return md5sum
Example #18
def get_key_by_key(api_key, api_secretkey):
    """
    Similar to *get_key*, but access information via api_key credentials
    (api_key and api_secretkey).
    """
    
    conn = cloud._getcloudnetconnection()
    resp = conn.send_request(_key_get % api_key,
                             {},
                             auth=(api_key, api_secretkey))
    
    return resp['key']
Example #19
def _putf(f, obj_path, prefix=None, content_type=None, content_encoding=None):
    """
    helper for putf.
    Accepts arbitrary content_type and content_encoding
    """
    
            
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    
    fsize = 0 # file size. may not be computable 
    
    if isinstance(f, basestring):
        from cStringIO import StringIO        
        f = StringIO(f)
    
    conn = _getcloudnetconnection()         
    
    try:
        #raise IOError
        hex_md5, content_md5, fsize = _compute_md5(f) 
        
    except IOError:  
        raise IOError('File object is not seekable. Cannot transmit')
    
    if fsize > 5000000000:
        raise ValueError('Cannot store bucket objects larger than 5GB on cloud.bucket')
    
    if fsize == 0:
        raise ValueError('Cannot store empty bucket objects')
    
    try:
        cloudLog.debug('bucket object obj_path in client: %s' % full_obj_path)
        # get a file ticket
        resp = conn.send_request(_bucket_new_query, {'name': full_obj_path,
                                                     'content-type' : content_type,
                                                     'content-encoding' : content_encoding,
                                                     'hex-md5' : hex_md5
                                                     })
        ticket = resp['ticket']
        params = resp['params']
        
        url = params['action']
        
        # update ticket
        ticket['file'] = f
        if content_md5:
            ticket['Content-MD5'] = content_md5
                
        resp =  _aws_retryable_post(conn, url, ticket)
        resp.read()
        
    finally:
        f.close()
Example #20
def putf(f, name):
    """Similar to put.
    putf, however, accepts a file object (file, StringIO, etc.) ``f`` instead of a file_path.
    
    .. note::
        
        ``f`` is not rewound. f.read() from current position will be placed on PiCloud
    
    .. warning:: 
    
        If the file object does not correspond to an actual file on disk,
        it will be read entirely into memory before being transferred to PiCloud."""
    
    if '../..' in name:
        raise ValueError('"../.." cannot be in name')
    
    fsize = 0 # file size. may not be computable 
    
    if isinstance(f, basestring):
        fsize = len(f)                        
        from cStringIO import StringIO        
        f = StringIO(f)
    else:
        try:
            fsize = os.fstat(f.fileno()).st_size
        except (AttributeError, OSError):
            pass
    
    if fsize > 5000000000:
        raise ValueError('Cannot store files larger than 5GB on cloud.files')
    
    conn = _getcloudnetconnection()         
    
    try:
        # get a file ticket
        resp = conn.send_request(_file_new_query, {'name': name})
        ticket = resp['ticket']
        params = resp['params']
        
        url = params['action']
        
        # set file in ticket
        ticket['file'] = f
        
        # post file using information in ticket
        ticket['key'] = str(ticket['key'])
        resp =  _aws_retryable_post(conn, url, ticket)
        resp.read()
        
    finally:
        f.close()
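putf accepts either a raw string or a file object, as the isinstance check above shows. A hedged usage sketch (assumes putf is exposed on the cloud.files module; both file names are made up):

from cloud import files          # assumed module path
from cStringIO import StringIO

# a plain string is wrapped in a StringIO internally
files.putf('hello from a plain string', 'greetings.txt')

# any file-like object works; note that putf closes it when done
buf = StringIO('generated,content\n1,2\n')
files.putf(buf, 'generated.csv')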
Example #21
def is_public(obj_path, prefix=None):
    """Determine if the PiCloud bucket object ``effective_obj_path`` 
    is publicly accessible by a URL
    
    Return public URL if it is; otherwise False    
    """
    conn = _getcloudnetconnection()
    
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_is_public_query, {'name': full_obj_path})
    if resp['status']:
        public_url = S3_URL+resp['url']
        return public_url
    else:
        return resp['status']
Example #22
def change_max_duration(request_id, new_max_duration=None):
    """Change the max duration (in hours) of the realtime core request
    associated with *request_id*."""

    try:
        int(request_id)
    except ValueError:
        raise TypeError('change_max_duration requires a numeric request_id')
    
    if new_max_duration != None:
        if not isinstance(new_max_duration, (int, long)):
            raise TypeError('Optional parameter max_duration should be an integer value > 0')
        if new_max_duration <= 0:
            raise TypeError('Optional parameter max_duration should be an integer value > 0')
    
    conn = cloud._getcloudnetconnection()
    
    conn.send_request(_change_max_duration_query, {'rid': str(request_id), 'cap_duration':new_max_duration})
Example #23
def info(obj_path, prefix=None):
    """Return information about the PiCloud bucket object ``effective_obj_path``
    
    Information includes size, created time, last modified time, md5sum, public URL (if any), 
    and any headers set with ``make_public``      
    """
     
    conn = _getcloudnetconnection()
        
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    resp = conn.send_request(_bucket_info_query, {'name': full_obj_path})
    del resp['data']
    if 'url' in resp:
        resp['url'] = S3_URL+resp['url']
    
    return resp
Example #24
def make_public(obj_path, prefix=None, headers={}, reset_headers = False):
    """Makes the PiCloud bucket object ``effective_obj_path`` publicly accessible by a URL
    Returns public URL
    
    Additionally, you can control the HTTP headers that will be in the response to a request 
    for the URL with the ``headers`` dictionary.
    
    Possible standard HTTP headers are:
    
    * content-type
    * content-encoding  
    * content-disposition
    * cache-control    
    All other headers are considered custom and will have x-amz-meta- prepended to them.
    
    Example:
    make_public('foo', headers={'content-type': 'text/x-python', 'purpose': 'basic_script'})
    might return \https://s3.amazonaws.com/pi-user-buckets/ddasy/foo
    
    The headers in the response to a request for \https://s3.amazonaws.com/pi-user-buckets/ddasy/foo 
    will include: 
    
    * content-type: text/x-python
    * x-amz-meta-purpose: basic_script
    
    Clear all custom headers, other than content-type and content-encoding, by 
    setting ``reset_headers`` to True 
    
    .. note:: Default content-type and content-encoding are inferred during the original 
        cloud.bucket.put(..) call from the ``file_path`` and ``obj_path``.     
    """
    conn = _getcloudnetconnection()
    
    full_obj_path = _get_effective_obj_path(obj_path, prefix)
    post_values = {'name' : full_obj_path,
                   'reset_headers' : reset_headers}
    for key, val in headers.items():
        try:
            post_values['bh_' + key] = val.decode('ascii').encode('ascii')
        except (UnicodeDecodeError, UnicodeEncodeError):
            raise TypeError('header values must be ASCII strings')                    
    
    resp = conn.send_request(_bucket_make_public_query, post_values)
    public_url = S3_URL+resp['url']
    return public_url
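A short illustrative sketch tying make_public together with is_public and public_url_folder from the earlier examples; 'report.html' is a made-up object path and the cloud.bucket module path is an assumption:

from cloud import bucket   # assumed module path

url = bucket.make_public('report.html',
                         headers={'content-type': 'text/html',
                                  'cache-control': 'max-age=300'})
print(url)                                  # public URL of the object
print(bucket.is_public('report.html'))      # same URL, or False if not public
print(bucket.public_url_folder() + 'report.html')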
Example #25
def request(type, cores, max_duration=None):
    """Request a number of *cores* of a certain compute resource *type*  
    Returns a dictionary describing the newly created realtime request, with the same format
    as the requests returned by list_rt_cores.
    If specified, request will terminate after being active for *max_duration* hours
    """
    
    if max_duration != None:
        if not isinstance(max_duration, (int, long)):
            raise TypeError('Optional parameter max_duration should be an integer value > 0')
        if max_duration <= 0:
            raise TypeError('Optional parameter max_duration should be an integer value > 0')
    
    conn = cloud._getcloudnetconnection()
    return fix_time_element(conn.send_request(_request_query, 
                                               {'cores': cores,
                                                'type' : type,
                                                'cap_duration': max_duration if max_duration else 0}), 
                             'start_time')
Example #26
def list(request_id=""):
    """Returns a list of dictionaries describing realtime core requests.
    If *request_id* is specified, only show realtime core request with that request_id
    
    The keys within each returned dictionary are:
    
    * request_id: numeric ID associated with the request 
    * type: Type of computation resource this request grants
    * cores: Number of (type) cores this request grants
    * start_time: Time when real time request was satisfied; None if still pending"""
    
    if request_id != "":
        try:
            int(request_id)
        except ValueError:
            raise TypeError('Optional parameter to list_rt_cores must be a numeric request_id')
    
    conn = cloud._getcloudnetconnection()
    rt_list = conn.send_request(_list_query, {'rid': str(request_id)})
    return [fix_time_element(rt,'start_time') for rt in rt_list['requests']]
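The realtime helpers (request, list, release) shown above compose into a basic provisioning loop. A hedged sketch; the polling interval is arbitrary, the functions are assumed to be importable from the same module as the snippets above, and list here means the realtime list() shown above, not the builtin:

import time

req = request('c2', 2, max_duration=4)   # request 2 'c2' cores, capped at 4 hours
rid = req['request_id']

while True:
    status = list(rid)[0]                # only the request with this request_id
    if status['start_time'] is not None:
        break                            # request has been satisfied
    time.sleep(30)                       # arbitrary polling interval

release(rid)                             # only satisfied requests can be released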
Example #27
def getf(name, start_byte=0, end_byte=None):
    """
    Retrieves the file named by ``name`` from PiCloud.
    Return value is a CloudFile (file-like object) that can be read() to retrieve the file's contents 

    A range can be specified through *start_byte* and *end_byte*, where only the data between those two offsets
    will be accessible in the CloudFile.  If start_byte is set, the returned CloudFile.tell() will be start_byte
    
    An end_byte of None or exceeding the file size is interpreted as end of file
    """    
    
    conn = _getcloudnetconnection()

    resp = conn.send_request(_file_get_query, {'name': name})
    
    ticket = resp['ticket']
    params = resp['params']
    file_size = params['size']
    
    if not start_byte:
        start_byte=0
    if file_size and (not end_byte or end_byte > file_size):
        end_byte = file_size

    if not isinstance(start_byte, (int, long)):
        raise TypeError('start_byte must be an integer')
    
    if end_byte and not isinstance(end_byte, (int, long)):
        raise TypeError('end_byte must be an integer')

    if end_byte:
        ticket['Range'] = 'bytes=%s-%s' % tuple(  [start_byte, end_byte]  )

    resp =  _aws_retryable_post(conn, params['action'], None, ticket)
    
    cloud_file = CloudFile( resp, file_size, start_byte, end_byte )
    
    return cloud_file
Example #28
def remove(obj_paths, prefix=None):
    """Removes object(s) named ``effective_obj_paths`` from PiCloud bucket
    
    obj_paths can be a single object or a list of objects
    """
    conn = _getcloudnetconnection()
    
    if not hasattr(obj_paths, '__iter__'):
        obj_paths = [obj_paths]
        
    obj_paths_iter = obj_paths.__iter__()
    
    removed = False
    while True:
        paths_to_remove = builtin_list(islice(obj_paths_iter, 1000))
        if not paths_to_remove:
            break

        full_obj_paths = [_get_effective_obj_path(obj_path, prefix) for obj_path in paths_to_remove]
        resp = conn.send_request(_bucket_remove_query, {'name': full_obj_paths})
        removed = resp['removed']
    
    return removed
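remove accepts a single obj_path or any iterable of them, deleting in batches of 1000 as the islice loop above shows. A minimal sketch (module path and object names are assumptions):

from cloud import bucket   # assumed module path

bucket.remove('tmp/scratch.bin')                              # single object
bucket.remove(['results/part-0000', 'results/part-0001'])     # batch of objects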
Example #29
def map(name, mapper, chunk_size=None, record_reader=None, combiner=None, reducer=None, **kwargs):
    """
    With map, you can process a file stored in cloud.files in parallel. The
    parallelism is achieved by dividing the file specified by *name* into
    chunks of size *chunk_size* (bytes). Each chunk is assigned a sub job. The
    sub job in turn processes just that chunk, allowing for the entire file to
    be processed by as many cores in parallel as there are chunks. We will call
    this type of sub job a "mapper sub job".
    
    If chunk_size is None, it will be automatically set to 1/10th of the size
    of the file.
    
    Map will return a single job IDentifier (jid). The sub jobs that comprise it
    do not have identifiers and, therefore, cannot be accessed directly.
    cloud.info(jid), however, will show you information for relevant sub jobs.
    
    By default, each chunk is split into records (of 0 or more characters) using
    newlines as delimiters. If *record_reader* is specified as a string, each
    chunk is split into records using that as the delimiter.
    
    In the event a record spans two chunks, it is guaranteed that the mapper
    will be called exactly once, with the full record; you do not need to handle
    records split across chunk boundaries yourself.
    
    *mapper* is a function that takes a single argument, a record, and should
    return an iterable of values (a generator). In the simplest case, it can
    return a generator that yields only one value.
    
    Example::
    
        def mapper(record):
            yield record
    
    When no *combiner* or *reducer* is specified, the return value of the
    cloud.files.map job will be roughly equivalent to::
            
            map(mapper, record_reader(file_contents))
    
    A *reducer* is a function that takes in an iterable of values and returns an 
    iterable of values.  The iterable parameter iterates through all the values 
    returned by all the mapper(record) calls. When the reducer is specified,
    *reducer* will result in the creation of one additional sub job. The reducer
    sub job grabs the results of each mapper sub job (iterators), combines them
    into a single iterator, and then passes that iterator into your *reducer*
    function. The return value of the cloud.files.map job will be the iterator
    returned by the *reducer*.
    
    A *combiner*, like a *reducer*, takes in an iterable of values and returns an
    iterable of values. The difference is that the *combiner* is run in each
    mapper sub job, and each one only takes in values that were produced from the
    associated chunk. If a *reducer* is also specified, then the reducer sub job
    grabs the results of each *combiner* run in each mapper sub job.
    
    Example for counting the number of words in a document::
    
        def wordcount_mapper(record):
            yield len(record.split(' '))
            
        def wordcount_reducer(wordcounts):
            yield sum(wordcounts)
            
        jid = cloud.files.map('example_document', wordcount_mapper, reducer=wordcount_reducer)
        
    Result::
        cloud.result(jid)
            >> [# of words]
    
    For advanced users, *record_reader* can also be specified as a function that
    takes in a file-like object (has methods read(), tell(), and seek()), and
    the end_byte for the current chunk. The *record_reader* should return an
    iterable of records.  See default_record_reader for an example.
    
    Additional information exists on our blog and online documentation.
        
        Reserved special *kwargs* (see docs for details):
        
        * _cores:
            Set number of cores your job will utilize. See http://blog.picloud.com/2012/08/31/introducing-multicore-support/
            In addition to having access to more cores, you will have _cores*RAM[_type] where _type is the _type you select
            Possible values depend on what _type you choose:
            
            'c1': 1
            'c2': 1, 2, 4, 8
            'f2': 1, 2, 4, 8, 16
            'm1': 1, 2
            's1': 1        
        * _depends_on:
            An iterable of jids that represents all jobs that must complete successfully 
            before any jobs created by this map function may be run.
        * _depends_on_errors:
            A string specifying how an error with a jid listed in _depends_on should be handled.
            'abort': Set this job to 'stalled'  (Default)
            'ignore': Treat an error as satisfying the dependency            
        * _env:
            A string specifying a custom environment you wish to run your jobs within.
            See environments overview at 
            http://blog.picloud.com/2011/09/26/introducing-environments-run-anything-on-picloud/
        * _fast_serialization:
            This keyword can be used to speed up serialization, at the cost of some functionality.
            This affects the serialization of both the map arguments and return values
            The map function will always be serialized by the enhanced serializer, with debugging features.
            Possible values keyword are:
                        
            0. default -- use cloud module's enhanced serialization and debugging info            
            1. no debug -- Disable all debugging features for arguments            
            2. use cPickle -- Use python's fast serializer, possibly causing PicklingErrors                
        * _kill_process:
                Terminate the Python interpreter *func* runs in after *func* completes, preventing
                the interpreter from being used by subsequent jobs.  See Technical Overview for more info.                            
        * _label: 
            A user-defined string label that is attached to the created jobs. 
            Labels can be used to filter when viewing jobs interactively (i.e.
            on the PiCloud website).        
        * _max_runtime:
            Specify the maximum amount of time (in integer minutes) a job can run. If job runs beyond 
            this time, it will be killed.                     
        * _priority: 
                A positive integer denoting the job's priority. PiCloud tries to run jobs 
                with lower priority numbers before jobs with higher priority numbers.            
        * _profile:
                Set this to True to enable profiling of your code. Profiling information is 
                valuable for debugging, but may slow down your job.
        * _restartable:
                In the very rare event of hardware failure, this flag indicates that the job
                can be restarted if the failure happened in the middle of the job.
                By default, this is true. This should be unset if the job has external state
                (e.g. it modifies a database entry)
        * _type:
                Select the type of compute resources to use.  PiCloud supports four types,
                specified as strings:
                
                'c1'
                    1 compute unit, 300 MB ram, low I/O (default)                    
                'c2'
                    2.5 compute units, 800 MB ram, medium I/O                    
                'm1'                    
                    3.25 compute units, 8 GB ram, high I/O
                's1'
                    variable compute units (2 cu max), 300 MB ram, low I/O, 1 IP per core                    
                                    
                See http://www.picloud.com/pricing/ for pricing information
    """
    
    cloud_obj = _getcloud()
    params = cloud_obj._getJobParameters(mapper, kwargs)    # takes care of kwargs
    
    file_details = _file_info(name)
    if not file_details['exists']:
        raise ValueError('file does not exist on the cloud, or is not yet ready to be accessed')
    file_size = int( file_details['size'] )
    params['file_name'] = name
    
    
    # chunk_size: must be a non-zero integer if given (None means auto: 1/10th of file size)
    if chunk_size is not None:
        if not isinstance(chunk_size, (int, long)) or chunk_size == 0:
            raise Exception('the chunk_size should be a non zero integer value')
        params['chunk_size'] = chunk_size
            
    
    # mapper
    _validate_arguments(mapper, 'mapper')
    
    # record_reader
    if not record_reader:
        record_reader = default_record_reader('\n')
    else:
        if isinstance(record_reader, basestring):
            record_reader = default_record_reader(record_reader)
        else:
            _validate_rr_arguments(record_reader, 'record_reader')
    
    # combiner
    if not combiner:
        def combiner(it):
            for x in it:
                yield x
    else:
        _validate_arguments(combiner, 'combiner')


    func_to_be_sent = _mapper_combiner_wrapper(mapper, name, file_size, record_reader, combiner)
    
    sfunc, sarg, logprefix, logcnt = cloud_obj.adapter.cloud_serialize( func_to_be_sent, 
                                                                    params['fast_serialization'], 
                                                                    [], 
                                                                    logprefix='mapreduce.' )
    
    data = Packer()
    data.add(sfunc)
    params['data'] = data.finish()
    
    # validate reducer & serialize reducer
    if reducer:
        _validate_arguments(reducer, 'reducer')
        reducer = _reducer_wrapper(reducer)
        s_reducer, red_sarg, red_logprefix, red_logcnt = cloud_obj.adapter.cloud_serialize( reducer, params['fast_serialization'], [], logprefix='mapreduce.reducer.' )
        data_red = Packer()
        data_red.add(s_reducer)
        params['data_red'] = data_red.finish()
        
    conn = _getcloudnetconnection()
    conn._update_params(params)
    cloud_obj.adapter.dep_snapshot()
    
    resp = conn.send_request(_filemap_job_query, params)
    
    return resp['jids']
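For the advanced record_reader form described in the docstring (a callable taking the chunk's file-like object and its end_byte), a sketch of a fixed-width reader follows; the record width and the treatment of chunk boundaries are illustrative only, and default_record_reader remains the canonical reference:

def fixed_width_reader(fobj, end_byte, width=128):
    """Yield fixed-width records from the current position up to end_byte."""
    while fobj.tell() < end_byte:
        record = fobj.read(width)
        if not record:
            break
        yield record

# hypothetical call, reusing the wordcount_mapper from the docstring example:
# jid = map('example_document', wordcount_mapper, record_reader=fixed_width_reader)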
Example #30
def sync(source, dest, delete=False):
    """Syncs data between a cloud volumes and the local filesystem.

    Either *source* or *dest* should specify a cloud volume path, but not both.
    A cloud volume path is of the format:

        volume_name:[path-within-volume]

    where path-within-volume cannot be an absolute path (There is no concept of
    the root of the filesystem in a volume: All path specifications are relative
    to the top level of the volume).  Note that the colon is what indicates this
    is a volume path specification.  Local paths should point to a local
    directory or file.  If the local path is a directory, whether the
    directory itself or the contents of the directory are synced depends on the
    presence of a trailing slash.  A trailing slash indicates that the contents
    should be synced, while its absence would lead to the directory itself being
    synced to the volume.  *source* can be a list of paths, all of which should
    either be local paths, or volume paths in the same cloud volume.

    Example::

        sync('~/dataset1', 'myvolume1:')

    will ensure that a directory named 'dataset1' will exist at the top level
    of the cloud volume 'myvolume1', that contains all the contents of
    'dataset1'.  On the other hand,

        sync('~/dataset1/', 'myvolume1:')

    will copy all the contents of 'dataset1' to the top level of 'myvolume1'.
    This behavior mirrors the file-copying tool 'rsync'.

    If *delete* is True, files that exist in *dest* but not in *source* will be
    deleted.  By default, such files will not be removed.
    """
    conn = cloud._getcloudnetconnection()
    retry_attempts = conn.retry_attempts
    dest_is_local = common.is_local_path(dest)
    l_paths, r_paths = (dest, source) if dest_is_local else (source, dest)
    local_paths = common.parse_local_paths(l_paths)
    vol_name, vol_paths = common.parse_remote_paths(r_paths)
    for vol_path in vol_paths:
        if os.path.isabs(vol_path):
            raise cloud.CloudException('Volume path cannot be absolute')

    # acquire syncslot and syncserver info to complete the real remote paths
    success = release = False
    exit_code = -1
    syncserver, syncslot = _acquire_syncslot(vol_name)

    try:
        cloudLog.debug('Acquired syncslot %s on server %s', syncslot, syncserver)
        r_base = '%s@%s:volume/' % (syncslot, syncserver)
        r_paths = ' '.join(['%s%s' % (r_base, v_path) for v_path in vol_paths])
        l_paths = ' '.join(local_paths)
        sync_args = (r_paths, l_paths) if dest_is_local else (l_paths, r_paths)

        for attempt in xrange(retry_attempts):
            exit_code, stdout, stderr = common.rsync_session(*sync_args,
                                                             delete=delete)
            if not exit_code:
                break
            cloudLog.error('sync attempt failed:\n%s', stderr)
            print_stdout(str(stderr))
            print_stdout('Retrying volume sync...')
        else:
            raise Exception('sync failed multiple attempts... '
                            'Please contact PiCloud support')
    except KeyboardInterrupt:
        cloudLog.error('Sync interrupted by keyboard')
        print 'Sync interrupted by keyboard'
    except Exception as e:
        cloudLog.error('Sync errored with:\n%s', e)
        print e
    finally:
        print_stdout('Cleanup...')
        success = not exit_code
        release = success and not dest_is_local
        _send_vol_request('sync_terminate', {'name': vol_name,
                                             'syncslot': syncslot,
                                             'syncserver': syncserver,
                                             'release': release})

    if release:
        print_stdout('Ensuring redundancy...')
        _wait_for_release(vol_name)
    if success:
        print_stdout('Sync successfully completed.')
    else:
        raise cloud.CloudException('Volume sync failed with error code %s. '
                                   'See cloud.log' % exit_code)
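A usage sketch for sync, following the path conventions in the docstring ('myvolume1' and the local directories are placeholders):

# upload the contents of ~/dataset1 to the top level of the volume
sync('~/dataset1/', 'myvolume1:')

# pull the volume's 'models' directory down into the current directory,
# deleting local files that no longer exist on the volume
sync('myvolume1:models', './', delete=True)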