Example #1
    def put_files(files, serverfilter=local_serverfilter):
        """
        Put a bunch of files to a single server in the cluster (chosen by algorithm)
        
        This version is not normally called; in speed it sits between put_file and the raw-sockets version of put_files.
        
        Parameters
        ----------
        files : list of tuple
            a list of tuples of the form (<string> filepath, <bytes> data) for the files to be uploaded
            
        serverfilter: str
            the cluster name (optional), to select a specific cluster

        Returns
        -------

        """
        # normalise inputs
        files = [tuple(f) for f in files]
        serverfilter = str(serverfilter)

        name, info = _chooseServer(serverfilter)
        dir_manager = get_dir_manager(serverfilter)

        for filename, data in files:
            unifiedIO.assert_name_ok(filename)
            url = 'http://%s:%d/%s' % (socket.inet_ntoa(
                info.address), info.port, filename)

            t = time.time()
            s = _getSession(url)
            r = s.put(url, data=data, timeout=1)
            dt = time.time() - t
            if r.status_code != 200:
                raise RuntimeError('Put failed with %d: %s' %
                                   (r.status_code, r.content))

            dir_manager.register_file(filename, url, len(data))
            # record approximate write speed (bytes/s) for use in load balancing
            _lastwritespeed[name] = len(data) / (dt + .001)

            r.close()
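
The code above relies on a _getSession helper that is not shown. A minimal sketch of what such a per-host session cache might look like, so that keep-alive connections are reused across puts (the _sessions dict and its behaviour are assumptions, not taken from the source):

import requests
from urllib.parse import urlparse

# hypothetical per-host cache of requests sessions
_sessions = {}

def _getSession(url):
    # return a cached requests.Session for the host of the given URL, creating
    # one on first use; urlparse accepts both str and bytes URLs
    host = urlparse(url).netloc
    if host not in _sessions:
        _sessions[host] = requests.Session()
    return _sessions[host]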
Example #2
def put_file(filename, data, serverfilter=local_serverfilter, timeout=10):
    """
    Put a file to the cluster. The server on which the file resides is chosen by a crude load-balancing algorithm
    designed to uniformly distribute data across the servers within the cluster. The target file must not exist.
    
    .. warning::
        Putting a file is not strictly safe when run from multiple processes, and might result in unexpected behaviour
        if puts with identical filenames are made concurrently (within ~2s). It is up to the calling code to ensure that such
        filename collisions cannot occur. In practice this is reasonably easy to achieve when machine generated filenames
        are used, but implies that interfaces which allow the user to specify arbitrary filenames should run through a
        single user interface with external locking (e.g. clusterUI), particularly if there is any chance that multiple
        users will be creating files simultaneously.
    
    Parameters
    ----------
    filename : string
        path to the new file, which must not exist
    data : bytes
        the data to put
    serverfilter : string
        the cluster name (optional)
    timeout: float
        timeout in seconds for http operations. **Warning:** alter the timeout from its default only with extreme
        care. If operations are timing out, it is usually an indication that something else is going wrong, and you
        should fix that first. The serverless and lockless architecture depends on having low latency.

    Returns
    -------

    """

    from . import clusterListing as cl

    if not isinstance(data, bytes):
        raise TypeError('data should be bytes (not a unicode string)')
    unifiedIO.assert_name_ok(filename)

    success = False
    nAttempts = 0

    while not success and nAttempts < 3:
        nAttempts += 1
        name, info = _chooseServer(serverfilter)

        url = 'http://%s:%d/%s' % (socket.inet_ntoa(
            info.address), info.port, filename)
        logger.debug(repr(url))

        t = time.time()

        # use a bytes url so it matches the bytes keys of the _dirCache entries below
        url = url.encode()
        try:
            s = _getSession(url)
            r = s.put(url, data=data, timeout=timeout)
            dt = time.time() - t
            if r.status_code != 200:
                raise RuntimeError('Put failed with %d: %s' %
                                   (r.status_code, r.content))

            _lastwritespeed[name] = len(data) / (dt + .001)

            if dt > 1:
                logger.warning(
                    'put_file(%s) on %s took more than 1s (%3.2f s)' %
                    (filename, url, dt))

            success = True

            #add file to location cache
            cache_key = serverfilter + '::' + filename
            t1 = time.time()
            _locateCache[cache_key] = ([
                (url, .1),
            ], t1)

            #modify dir cache
            try:
                dirurl, fn = os.path.split(url)
                dirurl = dirurl + b'/'
                dirL, rt, cache_dt = _dirCache[dirurl]
                if (t - rt) > DIR_CACHE_TIME:
                    pass  # cache entry has expired
                else:
                    dirL[fn] = cl.FileInfo(cl.FILETYPE_NORMAL, len(data))
                    _dirCache[dirurl] = (dirL, rt, cache_dt)

            except KeyError:
                pass

        except requests.ConnectTimeout:
            if nAttempts >= 3:
                logger.error(
                    'Timeout attempting to put file: %s, after 3 retries, aborting'
                    % url)
                raise
            else:
                logger.warning('Timeout attempting to put file: %s, retrying' % url)
        finally:
            try:
                r.close()
            except Exception:
                # r may not exist if the request failed before a response was received
                pass
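
put_file delegates server choice to a _chooseServer helper that is not shown above. A minimal sketch of what it might look like, assuming a module-level registry of advertised services and a plain random choice to spread data roughly uniformly (the ServiceInfo tuple, the _services registry, and the selection rule are illustrative assumptions; the real algorithm may, for example, weight servers by the recorded _lastwritespeed values):

import random
import socket
from collections import namedtuple

# hypothetical registry of advertised servers: name -> (packed IPv4, port)
ServiceInfo = namedtuple('ServiceInfo', ['address', 'port'])
_services = {
    'node1.cluster1': ServiceInfo(socket.inet_aton('192.168.1.10'), 8080),
    'node2.cluster1': ServiceInfo(socket.inet_aton('192.168.1.11'), 8080),
}

def _chooseServer(serverfilter=''):
    # consider only servers whose advertised name contains serverfilter,
    # then pick one at random so that writes spread across the cluster
    candidates = [(name, info) for name, info in _services.items()
                  if serverfilter in name]
    return random.choice(candidates)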
Example #3
    def put_files(files, serverfilter=local_serverfilter, timeout=30):
        """
        Put a bunch of files to a single server in the cluster (chosen by algorithm)
        
        This uses a single long-lived HTTP/1.1 connection with keep-alive to avoid the overhead of creating a
        new connection for each file, and pipelines the puts, sending every file before waiting for the
        responses. This function exists to facilitate fast streaming.
        
        As the replies are only read *after* attempting to put all the files, this is currently not as safe as
        put_file (when handling failures we assume that no attempts were successful after the first failed file).
        
        Parameters
        ----------
        files : list of tuple
            a list of tuples of the form (<string> filepath, <bytes> data) for the files to be uploaded
            
        serverfilter: str
            the cluster name (optional), to select a specific cluster

        Returns
        -------

        """

        # normalise inputs
        files = [tuple(f) for f in files]
        serverfilter = str(serverfilter)

        nRetries = 0
        nChunksRemaining = len(files)

        while nRetries < 3 and nChunksRemaining > 0:
            name, info = _chooseServer(serverfilter)
            #logger.debug('Chose server: %s:%d' % (name, info.port))
            try:
                t = time.time()
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
                s.settimeout(30)

                # connect to the server
                s.connect((socket.inet_ntoa(info.address), info.port))

                datalen = 0

                # pipeline the sends: spool every file before reading any replies
                connection = b'keep-alive'

                nChunksSpooled = 0
                while nChunksRemaining > 0:
                    filename, data = files[-nChunksRemaining]
                    unifiedIO.assert_name_ok(filename)
                    dl = len(data)
                    if nChunksRemaining <= 1:
                        # ask the server to close the connection after the last file
                        connection = b'close'

                    header = b'PUT /%s HTTP/1.1\r\nConnection: %s\r\nContent-Length: %d\r\n\r\n' % (
                        filename.encode(), connection, dl)
                    s.sendall(header)
                    s.sendall(data)

                    datalen += dl
                    nChunksSpooled += 1
                    nChunksRemaining -= 1

                # read the pipelined replies; parsing them with httplib.HTTPResponse
                # proved too slow, hence the hand-rolled _parse_response helper

                fp = s.makefile('rb', 65536)
                try:
                    for i in range(nChunksSpooled):
                        status, reason, msg = _parse_response(fp)
                        if status != 200:
                            logger.error('Response %d - status: %d, msg: %s' % (i, status, str(msg)))
                            raise RuntimeError('Error spooling chunk %d: status: %d, msg: %s' % (i, status, str(msg)))
                finally:
                    fp.close()

                dt = time.time() - t
                _lastwritespeed[name] = datalen / (dt + .001)

            except socket.timeout:
                if nRetries < 2:
                    nRetries += 1
                    logger.error(
                        'Timeout writing to %s, trying another server for %d remaining files'
                        % (socket.inet_ntoa(info.address), nChunksRemaining))
                else:
                    logger.exception(
                        'Timeout writing to %s after 3 retries, aborting - DATA WILL BE LOST'
                        % socket.inet_ntoa(info.address))
                    raise

            except socket.error:
                if nRetries < 2:
                    nRetries += 1
                    logger.exception(
                        'Error writing to %s, trying another server for %d remaining files'
                        % (socket.inet_ntoa(info.address), nChunksRemaining))
                else:
                    logger.exception(
                        'Error writing to %s after 3 retries, aborting - DATA WILL BE LOST'
                        % socket.inet_ntoa(info.address))
                    raise

            finally:
                # the final request carries a "Connection: close" header, so the
                # far end closes the connection after sending its replies; if the
                # connection stayed open, the reads above could block forever
                s.close()
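
This example and the next both call a _parse_response helper that is not part of the snippet. A minimal sketch of what it might look like, assuming each reply carries a Content-Length header and no chunked transfer encoding (the implementation details are assumptions, not taken from the source):

def _parse_response(fp):
    # read one HTTP/1.1 response from a binary file-like object wrapping the
    # socket, returning (status, reason, body)
    status_line = fp.readline()  # e.g. b'HTTP/1.1 200 OK\r\n'
    parts = status_line.split(None, 2)
    status = int(parts[1])
    reason = parts[2].strip() if len(parts) > 2 else b''

    # consume headers up to the blank line, noting the body length
    content_length = 0
    line = fp.readline()
    while line not in (b'\r\n', b'\n', b''):
        key, _, value = line.partition(b':')
        if key.lower() == b'content-length':
            content_length = int(value)
        line = fp.readline()

    body = fp.read(content_length)
    return status, reason, body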
Example #4
    def _put_files_on_server(address,
                             port,
                             files,
                             nChunksRemaining=None,
                             dir_manager=None,
                             serverfilter=local_serverfilter):
        if nChunksRemaining is None:
            nChunksRemaining = len(files)

        if dir_manager is None:
            dir_manager = get_dir_manager(serverfilter)

        if not isinstance(address, str):
            address = socket.inet_ntoa(address)

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
        s.settimeout(30)

        # connect to the server
        s.connect((address, port))

        try:
            datalen = 0

            url_ = 'http://%s:%d/' % (address, port)

            # pipeline the sends: spool every file before reading any replies
            connection = b'keep-alive'

            nChunksSpooled = 0
            while nChunksRemaining > 0:
                filename, data = files[-nChunksRemaining]
                unifiedIO.assert_name_ok(filename)
                dl = len(data)
                if nChunksRemaining <= 1:
                    # ask the server to close the connection after the last file
                    connection = b'close'

                header = b'PUT /%s HTTP/1.1\r\nConnection: %s\r\nContent-Length: %d\r\n\r\n' % (
                    filename.encode(), connection, dl)
                s.sendall(header)
                s.sendall(data)

                # register file now (TODO - wait until we get spooling confirmation?)
                url = url_ + filename
                dir_manager.register_file(filename, url, dl)

                datalen += dl
                nChunksSpooled += 1
                nChunksRemaining -= 1

            # read the pipelined replies; parsing them with httplib.HTTPResponse
            # proved too slow, hence the hand-rolled _parse_response helper

            fp = s.makefile('rb', 65536)
            try:
                for i in range(nChunksSpooled):
                    status, reason, msg = _parse_response(fp)
                    if status != 200:
                        logger.error('Response %d - status: %d, msg: %s' % (i, status, str(msg)))
                        raise RuntimeError('Error spooling chunk %d: status: %d, msg: %s' % (i, status, str(msg)))
            finally:
                fp.close()

        finally:
            # the final request carries a "Connection: close" header, so the
            # far end closes the connection after sending its replies; if the
            # connection stayed open, the reads above could block forever
            s.close()

        return nChunksRemaining, datalen
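
For reference, a hypothetical call might look like the following; the address, port, and filenames are placeholders, not values from the source:

# spool ten 1 kB frames to a single (placeholder) node
files = [('series_000/frame_%05d.dat' % i, b'\x00' * 1024) for i in range(10)]
remaining, nbytes = _put_files_on_server('192.168.1.10', 8080, files)
assert remaining == 0 and nbytes == 10 * 1024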