def put_files(files, serverfilter=local_serverfilter):
    """
    Put a bunch of files to a single server in the cluster (chosen by algorithm).

    This version does not normally get called, but is between put_file and the
    raw-sockets version of put_files in speed.

    Parameters
    ----------
    files : list of tuple
        a list of tuples of the form (<string> filepath, <bytes> data) for the
        files to be uploaded

    serverfilter : str
        the cluster name (optional), to select a specific cluster

    Returns
    -------

    """
    name, info = _chooseServer(serverfilter)
    dir_manager = get_dir_manager(serverfilter)

    for filename, data in files:
        unifiedIO.assert_name_ok(filename)
        url = 'http://%s:%d/%s' % (socket.inet_ntoa(info.address), info.port, filename)

        t = time.time()
        #_last_access_time[name] = t

        s = _getSession(url)
        r = s.put(url, data=data, timeout=1)
        dt = time.time() - t

        if not r.status_code == 200:
            raise RuntimeError('Put failed with %d: %s' % (r.status_code, r.content))

        dir_manager.register_file(filename, url, len(data))
        _lastwritespeed[name] = len(data) / (dt + .001)

        r.close()

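
# Usage sketch (illustrative only): put_files takes (filename, bytes) tuples.
# The paths and the 'my_cluster' serverfilter below are hypothetical and assume
# a cluster advertised under that name is running.
def _example_put_files():  # pragma: no cover
    import numpy as np
    frames = [('_example/frame%03d.npy' % i,
               np.random.rand(64, 64).astype('f4').tobytes())
              for i in range(10)]
    put_files(frames, serverfilter='my_cluster')
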
def put_file(filename, data, serverfilter=local_serverfilter, timeout=10):
    """
    Put a file to the cluster. The server on which the file resides is chosen
    by a crude load-balancing algorithm designed to uniformly distribute data
    across the servers within the cluster. The target file must not exist.

    .. warning:: Putting a file is not strictly safe when run from multiple
        processes, and might result in unexpected behaviour if puts with
        identical filenames are made concurrently (within ~2s). It is up to
        the calling code to ensure that such filename collisions cannot occur.
        In practice this is reasonably easy to achieve when machine-generated
        filenames are used, but implies that interfaces which allow the user
        to specify arbitrary filenames should run through a single user
        interface with external locking (e.g. clusterUI), particularly if
        there is any chance that multiple users will be creating files
        simultaneously.

    Parameters
    ----------
    filename : string
        path to the new file, which must not exist

    data : bytes
        the data to put

    serverfilter : string
        the cluster name (optional)

    timeout : float
        timeout in seconds for http operations. **Warning:** alter from the
        default setting of 10s only with extreme care. If operations are
        timing out it is usually an indication that something else is going
        wrong and you should usually fix this first. The serverless and
        lockless architecture depends on having low latency.

    Returns
    -------

    """
    from . import clusterListing as cl

    if not isinstance(data, bytes):
        raise TypeError('data should be bytes (not a unicode string)')

    unifiedIO.assert_name_ok(filename)

    success = False
    nAttempts = 0

    while not success and nAttempts < 3:
        nAttempts += 1
        name, info = _chooseServer(serverfilter)
        url = 'http://%s:%d/%s' % (socket.inet_ntoa(info.address), info.port, filename)
        logger.debug('put_file: %s' % url)

        t = time.time()
        url = url.encode()
        try:
            s = _getSession(url)
            r = s.put(url, data=data, timeout=timeout)
            dt = time.time() - t

            if not r.status_code == 200:
                raise RuntimeError('Put failed with %d: %s' % (r.status_code, r.content))

            _lastwritespeed[name] = len(data) / (dt + .001)

            if dt > 1:
                logger.warning('put_file(%s) on %s took more than 1s (%3.2f s)' % (filename, url, dt))

            success = True

            # add file to location cache
            cache_key = serverfilter + '::' + filename
            t1 = time.time()
            _locateCache[cache_key] = ([(url, .1), ], t1)

            # modify dir cache
            try:
                dirurl, fn = os.path.split(url)
                dirurl = dirurl + b'/'
                dirL, rt, dt = _dirCache[dirurl]
                if (t - rt) > DIR_CACHE_TIME:
                    pass  # cache entry is expired
                else:
                    dirL[fn] = cl.FileInfo(cl.FILETYPE_NORMAL, len(data))
                    _dirCache[dirurl] = (dirL, rt, dt)
            except KeyError:
                pass

        except requests.ConnectTimeout:
            if nAttempts >= 3:
                logger.error('Timeout attempting to put file: %s, after 3 retries, aborting' % url)
                raise
            else:
                logger.warning('Timeout attempting to put file: %s, retrying' % url)
        finally:
            try:
                r.close()
            except:
                pass

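
# Usage sketch (illustrative only): put_file requires bytes and a target path
# that does not already exist on the cluster. The path and serverfilter are
# hypothetical; a timestamp suffix sidesteps the filename-collision caveat in
# the docstring above.
def _example_put_file():  # pragma: no cover
    payload = u'{"voxel_size": 0.07}'.encode('utf-8')  # must be bytes, not str
    filename = '_example/meta_%d.json' % int(time.time())
    put_file(filename, payload, serverfilter='my_cluster')
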
def put_files(files, serverfilter=local_serverfilter, timeout=30):
    """
    Put a bunch of files to a single server in the cluster (chosen by algorithm).

    This uses a long-lived HTTP/1.1 connection with keep-alive to avoid the
    overhead of creating a new session for each file, and pipelines the puts:
    each file is sent before waiting for the response to the previous one.
    This function exists to facilitate fast streaming.

    As it reads the replies *after* attempting to put all the files, this is
    currently not as safe as put_file (in handling failures we assume that no
    attempts were successful after the first failed file).

    Parameters
    ----------
    files : list of tuple
        a list of tuples of the form (<string> filepath, <bytes> data) for the
        files to be uploaded

    serverfilter : str
        the cluster name (optional), to select a specific cluster

    Returns
    -------

    """
    nRetries = 0
    nChunksRemaining = len(files)

    while nRetries < 3 and nChunksRemaining > 0:
        name, info = _chooseServer(serverfilter)
        #logger.debug('Chose server: %s:%d' % (name, info.port))
        try:
            t = time.time()

            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
            s.settimeout(timeout)

            # connect to the server
            s.connect((socket.inet_ntoa(info.address), info.port))

            datalen = 0
            #_last_access_time[name] = t

            connection = b'keep-alive'

            # pipeline the sends
            nChunksSpooled = 0
            while nChunksRemaining > 0:
                filename, data = files[-nChunksRemaining]
                unifiedIO.assert_name_ok(filename)
                dl = len(data)
                if nChunksRemaining <= 1:
                    connection = b'close'

                header = b'PUT /%s HTTP/1.1\r\nConnection: %s\r\nContent-Length: %d\r\n\r\n' % (
                    filename.encode(), connection, dl)
                s.sendall(header)
                s.sendall(data)

                datalen += dl
                nChunksSpooled += 1
                nChunksRemaining -= 1

            # read back all the replies
            fp = s.makefile('rb', 65536)
            try:
                for i in range(nChunksSpooled):
                    status, reason, msg = _parse_response(fp)
                    if not status == 200:
                        logger.error(('Response %d - status: %d' % (i, status)) + ' msg: ' + str(msg))
                        raise RuntimeError('Error spooling chunk %d: status: %d, msg: %s' % (i, status, str(msg)))
            finally:
                fp.close()

            dt = time.time() - t
            _lastwritespeed[name] = datalen / (dt + .001)

        except socket.timeout:
            if nRetries < 2:
                nRetries += 1
                logger.error('Timeout writing to %s, trying another server for %d remaining files' % (
                    socket.inet_ntoa(info.address), nChunksRemaining))
            else:
                logger.exception('Timeout writing to %s after 3 retries, aborting - DATA WILL BE LOST' %
                                 socket.inet_ntoa(info.address))
                raise
        except socket.error:
            if nRetries < 2:
                nRetries += 1
                logger.exception('Error writing to %s, trying another server for %d remaining files' % (
                    socket.inet_ntoa(info.address), nChunksRemaining))
            else:
                logger.exception('Error writing to %s after 3 retries, aborting - DATA WILL BE LOST' %
                                 socket.inet_ntoa(info.address))
                raise
        finally:
            # The last request carries a 'Connection: close' header, which causes
            # the far end to close the connection after sending all the replies.
            # It is important for the connection to close, otherwise the final
            # receive would block forever.
            s.close()

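
# For reference: both pipelined writers in this module unpack the reply stream
# with _parse_response(fp), which is defined elsewhere in the module. Below is
# a minimal sketch of what such a parser does, assuming (as the call sites
# imply) that it returns a (status, reason, msg) tuple for one HTTP/1.1
# response read from a buffered socket file object. This is an illustration,
# not the module's actual implementation.
def _parse_response_sketch(fp):  # pragma: no cover
    status_line = fp.readline()  # e.g. b'HTTP/1.1 200 OK\r\n'
    _, status, reason = status_line.split(b' ', 2)

    # scan headers for the body length; headers end at a blank line
    content_length = 0
    line = fp.readline()
    while line not in (b'\r\n', b'\n', b''):
        key, _, value = line.partition(b':')
        if key.strip().lower() == b'content-length':
            content_length = int(value.strip())
        line = fp.readline()

    msg = fp.read(content_length) if content_length else b''
    return int(status), reason.strip(), msg
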
def _put_files_on_server(address, port, files, nChunksRemaining=None, dir_manager=None,
                         serverfilter=local_serverfilter):
    """
    Pipelined put of multiple files to a single, explicitly specified server
    (c.f. put_files, which chooses the server itself). Files are registered
    with the directory manager as they are spooled.

    Returns
    -------
    nChunksRemaining : int
        the number of files which were not spooled (0 on success)
    datalen : int
        the total number of bytes written
    """
    if nChunksRemaining is None:
        nChunksRemaining = len(files)

    if dir_manager is None:
        dir_manager = get_dir_manager(serverfilter)

    if not isinstance(address, str):
        address = socket.inet_ntoa(address)

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
    s.settimeout(30)

    # connect to the server
    s.connect((address, port))

    try:
        datalen = 0
        url_ = 'http://%s:%d/' % (address, port)

        connection = b'keep-alive'

        # pipeline the sends
        nChunksSpooled = 0
        while nChunksRemaining > 0:
            filename, data = files[-nChunksRemaining]
            unifiedIO.assert_name_ok(filename)
            dl = len(data)
            if nChunksRemaining <= 1:
                connection = b'close'

            header = b'PUT /%s HTTP/1.1\r\nConnection: %s\r\nContent-Length: %d\r\n\r\n' % (
                filename.encode(), connection, dl)
            s.sendall(header)
            s.sendall(data)

            # register file now (TODO - wait until we get spooling confirmation?)
            url = url_ + filename
            dir_manager.register_file(filename, url, dl)

            datalen += dl
            nChunksSpooled += 1
            nChunksRemaining -= 1

        # read back all the replies
        fp = s.makefile('rb', 65536)
        try:
            for i in range(nChunksSpooled):
                status, reason, msg = _parse_response(fp)
                if not status == 200:
                    logger.error(('Response %d - status: %d' % (i, status)) + ' msg: ' + str(msg))
                    raise RuntimeError('Error spooling chunk %d: status: %d, msg: %s' % (i, status, str(msg)))
        finally:
            fp.close()
    finally:
        # The last request carries a 'Connection: close' header, which causes
        # the far end to close the connection after sending all the replies.
        # It is important for the connection to close, otherwise the final
        # receive would block forever.
        s.close()

    return nChunksRemaining, datalen

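
# Usage sketch (illustrative only): _put_files_on_server targets one known
# server directly rather than choosing one by load balancing. The address,
# port, and paths below are hypothetical; callers are expected to wrap it in
# retry logic analogous to put_files above.
def _example_put_files_on_server():  # pragma: no cover
    files = [('_example/chunk%02d.bin' % i, b'\x00' * 1024) for i in range(4)]
    remaining, nbytes = _put_files_on_server('192.168.1.10', 8080, files,
                                             serverfilter='my_cluster')
    assert remaining == 0  # all chunks spooled
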