def _handle_response(self, offset_stack, coalesced, response_handler,
                     data_map, next_offset):
    cur_offset_and_size = next_offset[0]
    # FIXME: this should know how many bytes are needed, for clarity.
    data = response_handler.read_body_bytes()
    data_offset = 0
    for c_offset in coalesced:
        if len(data) < c_offset.length:
            # There is no 'relpath' in this scope, so report the transport
            # base instead.
            raise errors.ShortReadvError(self.base, c_offset.start,
                                         c_offset.length, actual=len(data))
        for suboffset, subsize in c_offset.ranges:
            key = (c_offset.start + suboffset, subsize)
            this_data = data[data_offset + suboffset:
                             data_offset + suboffset + subsize]
            # Special case when the data is in-order, rather than packing
            # into a map and then back out again. Benchmarking shows that
            # this has 100% hit rate, but leave in the data_map work just
            # in case.
            # TODO: Could we get away with using buffer() to avoid the
            #       memory copy?  Callers would need to realize they may
            #       not have a real string.
            if key == cur_offset_and_size:
                yield cur_offset_and_size[0], this_data
                cur_offset_and_size = next_offset[0] = offset_stack.next()
            else:
                data_map[key] = this_data
        data_offset += c_offset.length

        # Now that we've read some data, see if we can yield anything back
        while cur_offset_and_size in data_map:
            this_data = data_map.pop(cur_offset_and_size)
            yield cur_offset_and_size[0], this_data
            cur_offset_and_size = next_offset[0] = offset_stack.next()
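
# Illustrative sketch (not part of bzrlib): the ordering technique used in
# _handle_response, reduced to its essentials. Data that arrives in the
# requested order is yielded immediately; anything else is parked in a dict
# until its turn comes. Function and parameter names are hypothetical.
def _yield_in_request_order(requested_keys, arriving_items):
    requested_keys = iter(requested_keys)
    pending = {}
    wanted = requested_keys.next()
    for key, data in arriving_items:
        if key == wanted:
            # Fast path: this is exactly the next key the caller asked for.
            yield wanted[0], data
            wanted = requested_keys.next()
        else:
            # Out of order: remember it for later.
            pending[key] = data
        # Drain anything whose turn has now come.
        while wanted in pending:
            yield wanted[0], pending.pop(wanted)
            wanted = requested_keys.next()

# e.g. list(_yield_in_request_order(
#          [(0, 2), (5, 3)],
#          [((5, 3), 'xyz'), ((0, 2), 'ab')]))
#      -> [(0, 'ab'), (5, 'xyz')]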
def _translate_error(self, resp, orig_path=None):
    """Raise an exception from a response"""
    if resp is None:
        what = None
    else:
        what = resp[0]
    if what == 'ok':
        return
    elif what == 'NoSuchFile':
        if orig_path is not None:
            error_path = orig_path
        else:
            error_path = resp[1]
        raise errors.NoSuchFile(error_path)
    elif what == 'error':
        raise errors.SmartProtocolError(unicode(resp[1]))
    elif what == 'FileExists':
        raise errors.FileExists(resp[1])
    elif what == 'DirectoryNotEmpty':
        raise errors.DirectoryNotEmpty(resp[1])
    elif what == 'ShortReadvError':
        raise errors.ShortReadvError(resp[1], int(resp[2]),
                                     int(resp[3]), int(resp[4]))
    elif what in ('UnicodeEncodeError', 'UnicodeDecodeError'):
        encoding = str(resp[1])  # encoding must always be a string
        val = resp[2]
        start = int(resp[3])
        end = int(resp[4])
        reason = str(resp[5])  # reason must always be a string
        if val.startswith('u:'):
            val = val[2:].decode('utf-8')
        elif val.startswith('s:'):
            val = val[2:].decode('base64')
        if what == 'UnicodeDecodeError':
            raise UnicodeDecodeError(encoding, val, start, end, reason)
        elif what == 'UnicodeEncodeError':
            raise UnicodeEncodeError(encoding, val, start, end, reason)
    elif what == "ReadOnlyError":
        raise errors.TransportNotPossible('readonly transport')
    elif what == "ReadError":
        if orig_path is not None:
            error_path = orig_path
        else:
            error_path = resp[1]
        raise errors.ReadError(error_path)
    elif what == "PermissionDenied":
        if orig_path is not None:
            error_path = orig_path
        else:
            error_path = resp[1]
        raise errors.PermissionDenied(error_path)
    else:
        raise errors.SmartProtocolError(
            'unexpected smart server error: %r' % (resp,))
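
# Illustrative sketch (not part of bzrlib): the value encoding that the
# Unicode* branch above expects on the wire, inferred from the decoder. A
# unicode value travels as 'u:' + UTF-8, a byte string as 's:' + base64; the
# helper name is hypothetical.
def _encode_unicode_error_value(val):
    if isinstance(val, unicode):
        return 'u:' + val.encode('utf-8')
    return 's:' + val.encode('base64')

# Round trip matching _translate_error:
#   _encode_unicode_error_value(u'caf\xe9') == 'u:caf\xc3\xa9'
#   'u:caf\xc3\xa9'[2:].decode('utf-8') == u'caf\xe9'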
def _checked_read(self, size):
    """Read the file checking for short reads.

    The data read is discarded along the way.
    """
    pos = self._pos
    remaining = size
    while remaining > 0:
        data = self._file.read(min(remaining, self._discarded_buf_size))
        remaining -= len(data)
        if not data:
            raise errors.ShortReadvError(self._path, pos, size,
                                         size - remaining)
    self._pos += size
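
# Illustrative sketch (not part of bzrlib): the same bounded read-and-discard
# loop as _checked_read, as a standalone helper. Reading in small chunks keeps
# memory flat while still detecting short reads; the real code raises
# errors.ShortReadvError rather than IOError. The name and 8K buffer size are
# hypothetical.
def _skip_bytes(f, size, bufsize=8192):
    remaining = size
    while remaining > 0:
        data = f.read(min(remaining, bufsize))
        if not data:
            raise IOError('short read: wanted %d bytes, only skipped %d'
                          % (size, size - remaining))
        remaining -= len(data)

# e.g.
#   from StringIO import StringIO
#   _skip_bytes(StringIO('x' * 100), 100)  # fine
#   _skip_bytes(StringIO('x' * 10), 100)   # raises IOError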
def _curl_perform(self, curl, header, more_headers=[]):
    """Perform curl operation and translate exceptions."""
    try:
        # There's no way in http/1.0 to say "must revalidate"; we don't
        # want to force it to always retrieve. So just turn off the
        # default Pragma provided by Curl.
        headers = ['Cache-control: max-age=0',
                   'Pragma: no-cache',
                   'Connection: Keep-Alive']
        curl.setopt(pycurl.HTTPHEADER, headers + more_headers)
        curl.perform()
    except pycurl.error, e:
        url = curl.getinfo(pycurl.EFFECTIVE_URL)
        trace.mutter('got pycurl error: %s, %s, %s, url: %s ',
                     e[0], e[1], e, url)
        if e[0] in (CURLE_COULDNT_RESOLVE_HOST,
                    CURLE_COULDNT_RESOLVE_PROXY,
                    CURLE_COULDNT_CONNECT,
                    CURLE_FTP_WEIRD_SERVER_REPLY,
                    CURLE_GOT_NOTHING,
                    CURLE_SSL_CACERT,
                    CURLE_SSL_CACERT_BADFILE,
                    ):
            raise errors.ConnectionError(
                'curl connection error (%s)\non %s' % (e[1], url))
        elif e[0] == CURLE_RECV_ERROR:
            raise errors.ConnectionReset(
                'curl connection error (%s)\non %s' % (e[1], url))
        elif e[0] == CURLE_PARTIAL_FILE:
            # Pycurl itself has detected a short read. We do not have all
            # the information for the ShortReadvError, but that should be
            # enough
            raise errors.ShortReadvError(
                url, offset='unknown', length='unknown', actual='unknown',
                extra='Server aborted the request')
        raise
def _readv(self, relpath, offsets):
    """Get parts of the file at the given relative path.

    :param offsets: A list of (offset, size) tuples.
    :return: A list or generator of (offset, data) tuples
    """
    # offsets may be a generator, we will iterate it several times, so
    # build a list
    offsets = list(offsets)

    try_again = True
    retried_offset = None
    while try_again:
        try_again = False

        # Coalesce the offsets to minimize the GET requests issued
        sorted_offsets = sorted(offsets)
        coalesced = self._coalesce_offsets(
            sorted_offsets, limit=self._max_readv_combine,
            fudge_factor=self._bytes_to_read_before_seek,
            max_size=self._get_max_size)

        # Turn it into a list, we will iterate it several times
        coalesced = list(coalesced)
        if 'http' in debug.debug_flags:
            mutter('http readv of %s offsets => %s collapsed %s',
                   relpath, len(offsets), len(coalesced))

        # Cache the data read, but only until it's been used
        data_map = {}
        # We will iterate on the data received from the GET requests and
        # serve the corresponding offsets respecting the initial order. We
        # need an offset iterator for that.
        iter_offsets = iter(offsets)
        cur_offset_and_size = iter_offsets.next()

        try:
            for cur_coal, rfile in self._coalesce_readv(relpath, coalesced):
                # Split the received chunk
                for offset, size in cur_coal.ranges:
                    start = cur_coal.start + offset
                    rfile.seek(start, os.SEEK_SET)
                    data = rfile.read(size)
                    data_len = len(data)
                    if data_len != size:
                        raise errors.ShortReadvError(relpath, start, size,
                                                     actual=data_len)
                    if (start, size) == cur_offset_and_size:
                        # The offsets requested are sorted like the
                        # coalesced ones, no need to cache. Win!
                        yield cur_offset_and_size[0], data
                        cur_offset_and_size = iter_offsets.next()
                    else:
                        # Different sorting. We need to cache.
                        data_map[(start, size)] = data

                # Yield everything we can
                while cur_offset_and_size in data_map:
                    # Clean the cached data since we use it
                    # XXX: will break if offsets contains duplicates --
                    # vila20071129
                    this_data = data_map.pop(cur_offset_and_size)
                    yield cur_offset_and_size[0], this_data
                    cur_offset_and_size = iter_offsets.next()

        except (errors.ShortReadvError, errors.InvalidRange,
                errors.InvalidHttpRange, errors.HttpBoundaryMissing), e:
            mutter('Exception %r: %s during http._readv', e, e)
            if (not isinstance(e, errors.ShortReadvError)
                or retried_offset == cur_offset_and_size):
                # We don't degrade the range hint for ShortReadvError since
                # they do not indicate a problem with the server's ability
                # to handle ranges. Except when we fail to get back a
                # required offset twice in a row. In that case, falling
                # back to single range or whole file should help or end up
                # in a fatal exception.
                self._degrade_range_hint(relpath, coalesced, sys.exc_info())
            # Some offsets may already have been processed, so we retry
            # only the unsuccessful ones.
            offsets = [cur_offset_and_size] + [o for o in iter_offsets]
            retried_offset = cur_offset_and_size
            try_again = True
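
# Illustrative sketch (not part of bzrlib): what the coalescing step above
# relies on, in simplified form. Sorted (offset, size) requests that are close
# together (within fudge_factor bytes) are merged into one larger range that
# remembers its sub-ranges, so a single GET can serve several offsets. The
# real _coalesce_offsets also honours 'limit' and 'max_size', which this
# sketch ignores; the name and return shape are hypothetical.
def _coalesce_sketch(sorted_offsets, fudge_factor=0):
    ranges = []  # each entry is [start, end, [(suboffset, size), ...]]
    for offset, size in sorted_offsets:
        if ranges and offset <= ranges[-1][1] + fudge_factor:
            # Close enough to the previous range: extend it.
            cur = ranges[-1]
            cur[2].append((offset - cur[0], size))
            cur[1] = max(cur[1], offset + size)
        else:
            # Too far away: start a new range.
            ranges.append([offset, offset + size, [(0, size)]])
    return ranges

# e.g. _coalesce_sketch([(0, 10), (10, 10), (30, 5)], fudge_factor=5)
#      -> [[0, 20, [(0, 10), (10, 10)]], [30, 35, [(0, 5)]]]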
def request_and_yield_offsets(self, fp):
    """Request the data from the remote machine, yielding the results.

    :param fp: A Paramiko SFTPFile object that supports readv.
    :return: Yield the data requested by the original readv caller, one by
        one.
    """
    requests = self._get_requests()
    offset_iter = iter(self.original_offsets)
    cur_offset, cur_size = offset_iter.next()
    # paramiko .readv() yields strings that are in the order of the
    # requests. So we track the current request to know where the next
    # data is being returned from.
    input_start = None
    last_end = None
    buffered_data = []
    buffered_len = 0

    # This is used to buffer chunks which we couldn't process yet.
    # It is a list of (start, data) tuples.
    data_chunks = []
    # Create an 'unlimited' data stream, so we stop based on requests,
    # rather than just because the data stream ended. This lets us detect
    # short readv.
    data_stream = itertools.chain(fp.readv(requests),
                                  itertools.repeat(None))
    for (start, length), data in itertools.izip(requests, data_stream):
        if data is None:
            # The data stream ran dry while requests remain, so nothing
            # arrived for this request: a short read.
            raise errors.ShortReadvError(self.relpath, start, length, 0)
        if len(data) != length:
            raise errors.ShortReadvError(self.relpath, start, length,
                                         len(data))
        self._report_activity(length, 'read')
        if last_end is None:
            # This is the first request, just buffer it
            buffered_data = [data]
            buffered_len = length
            input_start = start
        elif start == last_end:
            # The data we are reading fits neatly on the previous
            # buffer, so this is all part of a larger coalesced range.
            buffered_data.append(data)
            buffered_len += length
        else:
            # We have an 'interrupt' in the data stream. So we know we are
            # at a request boundary.
            if buffered_len > 0:
                # We haven't consumed the buffer so far, so put it into
                # data_chunks, and continue.
                buffered = ''.join(buffered_data)
                data_chunks.append((input_start, buffered))
            input_start = start
            buffered_data = [data]
            buffered_len = length
        last_end = start + length
        if input_start == cur_offset and cur_size <= buffered_len:
            # Simplify the next steps a bit by transforming buffered_data
            # into a single string. We also have the nice property that
            # when there is only one string ''.join([x]) == x, so there is
            # no data copying.
            buffered = ''.join(buffered_data)
            # Clean out buffered data so that we keep memory
            # consumption low
            del buffered_data[:]
            buffered_offset = 0
            # TODO: We *could* also consider the case where cur_offset is
            #       in the buffered range, even though it doesn't *start*
            #       the buffered range. But for packs we pretty much always
            #       read in order, so you won't get any extra data in the
            #       middle.
            while (input_start == cur_offset
                   and (buffered_offset + cur_size) <= buffered_len):
                # We've buffered enough data to process this request,
                # spit it out
                cur_data = buffered[buffered_offset:
                                    buffered_offset + cur_size]
                # move the direct pointer into our buffered data
                buffered_offset += cur_size
                # Move the start-of-buffer pointer
                input_start += cur_size
                # Yield the requested data
                yield cur_offset, cur_data
                cur_offset, cur_size = offset_iter.next()
            # at this point, we've consumed as much of buffered as we can,
            # so break off the portion that we consumed
            if buffered_offset == len(buffered):
                # No tail to leave behind (compare against the joined
                # string, not buffered_data, which was emptied above)
                buffered_data = []
                buffered_len = 0
            else:
                buffered = buffered[buffered_offset:]
                buffered_data = [buffered]
                buffered_len = len(buffered)
    # now that the data stream is done, close the handle
    fp.close()
    if buffered_len:
        buffered = ''.join(buffered_data)
        del buffered_data[:]
        data_chunks.append((input_start, buffered))
    if data_chunks:
        if 'sftp' in debug.debug_flags:
            mutter('SFTP readv left with %d out-of-order bytes',
                   sum(map(lambda x: len(x[1]), data_chunks)))
        # We've processed all the readv data; at this point, anything we
        # couldn't process is in data_chunks. This doesn't happen often,
        # so this code path isn't optimized.
        # We look up offsets in data_chunks with
        #   bisect_left(data_chunks, (cur_offset,))
        # If a chunk starts exactly at cur_offset we get that chunk,
        # otherwise we get the previous chunk, which may contain
        # cur_offset inside it.
        while True:
            idx = bisect.bisect_left(data_chunks, (cur_offset,))
            if idx < len(data_chunks) and data_chunks[idx][0] == cur_offset:
                # The data starts here
                data = data_chunks[idx][1][:cur_size]
            elif idx > 0:
                # The data is in a portion of a previous page
                idx -= 1
                sub_offset = cur_offset - data_chunks[idx][0]
                data = data_chunks[idx][1]
                data = data[sub_offset:sub_offset + cur_size]
            else:
                # We are missing the page where the data should be found,
                # something is wrong
                data = ''
            if len(data) != cur_size:
                raise AssertionError('We must have miscalculated.'
                    ' We expected %d bytes, but only found %d'
                    % (cur_size, len(data)))
            yield cur_offset, data
            cur_offset, cur_size = offset_iter.next()
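
# Illustrative sketch (not part of bzrlib): the bisect lookup used for the
# out-of-order leftovers above, isolated. data_chunks is a sorted list of
# (start, data) tuples; bisect_left with the single-element key (offset,)
# either lands on a chunk starting exactly at offset, or just past the chunk
# that contains it. Names are hypothetical.
import bisect

def _find_in_chunks(data_chunks, offset, size):
    idx = bisect.bisect_left(data_chunks, (offset,))
    if idx < len(data_chunks) and data_chunks[idx][0] == offset:
        # A chunk starts exactly at the requested offset.
        return data_chunks[idx][1][:size]
    elif idx > 0:
        # The offset falls inside the previous chunk.
        start, chunk = data_chunks[idx - 1]
        sub = offset - start
        return chunk[sub:sub + size]
    return ''

# e.g. _find_in_chunks([(0, 'abcdef'), (10, 'ghij')], 2, 3) == 'cde'
#      _find_in_chunks([(0, 'abcdef'), (10, 'ghij')], 10, 2) == 'gh'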