Example #1
0
def tests():
    """Render the tests overview page.

    For each cluster name reported by the database, the scheduled and
    the in-progress tests are looked up. A cluster only appears in the
    corresponding mapping when that lookup returned at least one test.
    Completed tests are fetched once, without a cluster argument.

    Returns whatever render_template produces for 'tests.jinja2.html'.
    """
    clusters = db.get_cluster_names()
    scheduled_by_cluster = {}
    in_progress_by_cluster = {}
    for name in clusters:
        # Scheduled first, then in-progress, one cluster at a time.
        pending = db.get_scheduled_tests(name)
        if len(pending) > 0:
            scheduled_by_cluster[name] = pending
        running = db.get_in_progress_tests(name)
        if len(running) > 0:
            in_progress_by_cluster[name] = running
    finished = db.get_completed_tests()
    template_args = {
        'clusters': clusters,
        'cluster_scheduled_tests': scheduled_by_cluster,
        'cluster_in_progress_tests': in_progress_by_cluster,
        'completed_tests': finished,
    }
    return render_template('tests.jinja2.html', **template_args)
Example #2
0
    def get_work(command):
        """Look for the next test scheduled for this client's cluster.

        Tests still recorded as in progress for the cluster are first
        marked 'failed'. When nothing is scheduled yet, a zmq SUB socket
        is used to wait for notifications, and the scheduled-test query
        is repeated after every receive attempt.
        """
        # Mark any existing in_progress jobs for this cluster as
        # failed. If the cluster is asking for new work, then these
        # got dropped on the floor:
        for test in db.get_in_progress_tests(context['cluster']):
            db.update_test_status(test['test_id'], 'failed')

        # Find the next test scheduled for the client's cluster:
        tests = db.get_scheduled_tests(context['cluster'], limit=1)
        if len(tests) > 0:
            test_id = tests[0]['test_id']
        else:
            # No tests are currently scheduled.
            # Register a zmq listener of notifications of incoming tests, with a timeout.
            # When we see any test scheduled notification for our cluster, redo the query.
            # If timeout reached, redo the query anyway in case we missed the notification.
            def setup_zmq():
                # Builds a fresh SUB socket on a new zmq.Context each time
                # it is called; nothing here closes a previous socket.
                zmq_context = zmq.Context()
                zmq_socket = zmq_context.socket(zmq.SUB)
                zmq_socket.connect('tcp://127.0.0.1:5557')
                # Only messages starting with 'scheduled <cluster> ' are
                # delivered to this socket.
                zmq_socket.setsockopt_string(
                    zmq.SUBSCRIBE,
                    unicode('scheduled {cluster} '.format(
                        cluster=context['cluster'])))
                # Receive timeout (milliseconds per the zmq docs); when it
                # expires, recv raises zmq.error.Again.
                zmq_socket.setsockopt(zmq.RCVTIMEO, 15000)
                return zmq_socket

            zmq_socket = setup_zmq()
            while True:
                try:
                    # NOTE(review): the subscription prefix already holds
                    # two words, so a 'scheduled <cluster> <id>' message
                    # would split into three fields, not two -- confirm
                    # the publisher's message format.
                    cluster, test_id = zmq_socket.recv_string().split()
                except zmq.error.Again:
                    # Timed out without a notification; the finally
                    # clause re-runs the query anyway.
                    pass
                except zmq.error.ZMQError, e:
                    # NOTE(review): zmq.POLLERR is a poll-event flag, not
                    # an errno constant; presumably this matches only
                    # because its value coincides with EINTR -- confirm.
                    # ZMQErrors with any other errno are ignored here.
                    if e.errno == zmq.POLLERR:
                        log.error(e)
                        # Interrupted zmq socket code, reinitialize:
                        # I get this when I resize my terminal.. WTF?
                        zmq_socket = setup_zmq()
                finally:
                    # Runs after every receive attempt, whatever its outcome.
                    tests = db.get_scheduled_tests(context['cluster'], limit=1)
Example #3
0
    def get_work(command):
        """Look for the next test scheduled for this client's cluster.

        Tests still recorded as in progress for the cluster are first
        marked 'failed'. When nothing is scheduled yet, a zmq SUB socket
        is used to wait for notifications, and the scheduled-test query
        is repeated after every receive attempt.
        """
        # Mark any existing in_progress jobs for this cluster as
        # failed. If the cluster is asking for new work, then these
        # got dropped on the floor:
        for test in db.get_in_progress_tests(context['cluster']):
            db.update_test_status(test['test_id'], 'failed')

        # Find the next test scheduled for the client's cluster:
        tests = db.get_scheduled_tests(context['cluster'], limit=1)
        if len(tests) > 0:
            test_id = tests[0]['test_id']
        else:
            # No tests are currently scheduled.
            # Register a zmq listener of notifications of incoming tests, with a timeout.
            # When we see any test scheduled notification for our cluster, redo the query.
            # If timeout reached, redo the query anyway in case we missed the notification.
            def setup_zmq():
                # Builds a fresh SUB socket on a new zmq.Context each time
                # it is called; nothing here closes a previous socket.
                zmq_context = zmq.Context()
                zmq_socket = zmq_context.socket(zmq.SUB)
                zmq_socket.connect('tcp://127.0.0.1:5557')
                # Only messages starting with 'scheduled <cluster> ' are
                # delivered to this socket.
                zmq_socket.setsockopt_string(
                    zmq.SUBSCRIBE, 
                    unicode('scheduled {cluster} '.format(cluster=context['cluster'])))
                # Receive timeout (milliseconds per the zmq docs); when it
                # expires, recv raises zmq.error.Again.
                zmq_socket.setsockopt(zmq.RCVTIMEO, 15000)
                return zmq_socket
            zmq_socket = setup_zmq()
            while True:
                try:
                    # NOTE(review): the subscription prefix already holds
                    # two words, so a 'scheduled <cluster> <id>' message
                    # would split into three fields, not two -- confirm
                    # the publisher's message format.
                    cluster, test_id = zmq_socket.recv_string().split()
                except zmq.error.Again:
                    # Timed out without a notification; the finally
                    # clause re-runs the query anyway.
                    pass
                except zmq.error.ZMQError, e:
                    # NOTE(review): zmq.POLLERR is a poll-event flag, not
                    # an errno constant; presumably this matches only
                    # because its value coincides with EINTR -- confirm.
                    # ZMQErrors with any other errno are ignored here.
                    if e.errno == zmq.POLLERR:
                        log.error(e)
                        # Interrupted zmq socket code, reinitialize:
                        # I get this when I resize my terminal.. WTF?
                        zmq_socket = setup_zmq()
                finally:
                    # Runs after every receive attempt, whatever its outcome.
                    tests = db.get_scheduled_tests(context['cluster'], limit=1)
Example #4
0
def cluster_comms(ws):
    """Websocket to communicate with the test clusters

    Commands are logical actions one end of the socket wishes the
    other end to take. Responses are follow-ups to a command; there
    can be several of them, going back and forth between client and
    server, until the receiving end marks a response as done.

    Command structure:
     {type:'command',
      command_id:'some unique string for this command',
      message:'some message to the other end',
      action:'some action for the receiver to take',
      // Extra parameters:
      foo: ...,
      bar: ...,
     }

    Response structure:
     {type:'response',
      command_id:'the command id this is a response to',
      message:'some message to the other end',
      done: true/false (the responder considers the command complete)
      // Extra parameters:
      foo: ...,
      bar: ...,
     }

    Possible commands:
      * authenticate - one peer asks the other to authenticate
        (the server asks first, then the client asks the server)
      * get_work - client asks for a test
      * test_done - client is done with a test, and sending artifacts
      * cancel_test - server asks client to cancel test
      * shutdown - server asks client to shutdown service

    Protocol:
     Authentication:
      * client initiates connection to this server
      * server sends client a random challenge token
        {type:'command', command_id:'zzz', action:'authenticate', token:'xxxxxxx'}
      * client signs the challenge token with its private key and sends the signature
        {type:'response', command_id:'zzz', cluster:'bdplab', signature:'xxxxxxxx'}
      * server verifies the signature against the token it sent and the public
        key it has on file for the cluster.
      * server sends a 'you are authenticated' response.
        {type:'response', command_id:'zzz', authenticated: true, done:true}
      * client then sends its own 'authenticate' command carrying a token;
        the server signs that token with its own key, responds with the
        signature, and expects the reply to report authenticated: true.

     Task loop:
      * client sends a 'give me work' request.
        {type:'command', command_id:'yyy', action:'get_work'}
      * server sends a 'ok, wait for work' response.
        {type:'response', command_id:'yyy', action:'wait'}
      * server sends a single test to the cluster
        {type:'response', command_id:'yyy', test:{...}}
      * client responds 'ok, received test' response
        {type:'response', command_id:'yyy', test_id:'xxxxxxx'}
      * server updates status of test to in_progress in database
        {type:'response', command_id:'yyy', message:'test_updated', done:true}
      * client sends artifacts via streaming protocol (See below)
      * client sends 'ok, test done, artifacts sent.' request.
        {type:'command', command_id:'llll', action:'test_done', test_id:'xxxxxxx'}
      * server updates status of test to completed
      * server sends a 'ok, test updated' response
        {type:'response', command_id:'llll', test_id:'xxxxxx', message:'test_update', done:true}

     Streaming:
      protocol for streaming raw data: console output, binary artifacts etc.
      * Sending peer sends a "I'm going to send binary data to you" request:
        {type:'command', command_id:'xxx', action:'stream', test_id:'xxxxx',
         kind:"[console|failure|chart|system_logs|stress_logs]", name:'name',
         eof:'$$$EOF$$$', keepalive:'$$$KEEPALIVE$$$'}
      * Receiving peer sends response indicating it's ready to receive the stream:
        {type:'response', command_id:'xxx', action:'ready'}
      * Peer starts sending arbitrary binary data messages.
      * The receiving peer reads binary data. If it encounters $$$KEEPALIVE$$$ as its own
        message, it will omit that data, as it's only meant to keep the socket open.
      * Once $$$EOF$$$ is seen by the receiving peer, in its own message, the receiving
        peer can respond:
        {type:'response', command_id:'xxx', message:'stream_received', done:true}

    """
    # State shared by the nested handlers below: the server's API key
    # (loaded from SERVER_KEY_PATH) and the connected cluster's name,
    # which stays None until authenticate() fills it in.
    context = {'apikey': APIKey.load(SERVER_KEY_PATH),
               'cluster': None}

    def authenticate():
        """Run the mutual challenge/response handshake with the client.

        The server first challenges the client: a random token is sent in
        an 'authenticate' command, and the signature in the reply is
        verified with the public key stored for the cluster the client
        names. The roles then swap: the client sends its own
        'authenticate' command and the server signs the supplied token
        with its own key.

        On return, context['cluster'] holds the cluster name the client
        reported.

        Raises:
            AssertionError: the client's follow-up command is not an
                'authenticate' action.
            UnauthenticatedError: the client did not confirm our signature.
            Exception: whatever verify_message raises for a bad signature
                is re-raised after the client has been notified.
        """
        token_to_sign = random_token()
        cmd = Command.new(ws, action='authenticate', token=token_to_sign)
        response = cmd.send()
        # NOTE(review): the cluster name is recorded before the signature
        # is checked, so it stays set even if verification fails below.
        context['cluster'] = cluster = response['cluster']
        client_pubkey = db.get_pub_key(cluster)
        client_apikey = APIKey(client_pubkey['pubkey'])

        # Verify the client correctly signed the token:
        try:
            client_apikey.verify_message(token_to_sign,
                                         response.get('signature'))
        except Exception:
            # Tell the client why it is rejected, then let the original
            # error propagate. Exception (rather than a bare except) keeps
            # KeyboardInterrupt/SystemExit from being reported to the
            # client as a bad signature.
            response.respond(
                message='Bad Signature of token for authentication', done=True)
            log.error('client provided bad signature for auth token')
            raise

        response.respond(authenticated=True, done=True)

        # Client will ask us to authenticate too:
        command = receive_data(ws)
        # Explicit check instead of `assert`, which is stripped under -O.
        # AssertionError is kept so existing handlers still match.
        action = command.get('action')
        if action != 'authenticate':
            raise AssertionError(
                "Expected an 'authenticate' command from the client, "
                "got action %r" % (action,))
        data = {'signature': context['apikey'].sign_message(command['token'])}
        response = command.respond(**data)
        if response.get('authenticated') != True:
            raise UnauthenticatedError(
                "Our peer could not validate our signed auth token")

    def get_work(command):
        """Pick the id of the next test scheduled for this client's cluster.

        Tests still recorded as in progress for the cluster are first
        marked 'failed'. If a test is already scheduled, its id is taken
        right away. Otherwise a zmq SUB socket is used to wait for
        notifications: after every receive attempt (message, timeout or
        zmq error) the scheduled-test query is repeated, and each time it
        comes back empty the client is sent a 'wait' response.

        command: the client's get_work command; used here to send the
            'wait' responses.
        """
        # Mark any existing in_progress jobs for this cluster as
        # failed. If the cluster is asking for new work, then these
        # got dropped on the floor:
        for test in db.get_in_progress_tests(context['cluster']):
            db.update_test_status(test['test_id'], 'failed')

        # Find the next test scheduled for the client's cluster:
        tests = db.get_scheduled_tests(context['cluster'], limit=1)
        if len(tests) > 0:
            test_id = tests[0]['test_id']
        else:
            # No tests are currently scheduled.
            # Register a zmq listener of notifications of incoming tests, with a timeout.
            # When we see any test scheduled notification for our cluster, redo the query.
            # If timeout reached, redo the query anyway in case we missed the notification.
            def setup_zmq():
                # Builds a fresh SUB socket on a new zmq.Context each time
                # it is called; nothing here closes a previous socket.
                zmq_context = zmq.Context()
                zmq_socket = zmq_context.socket(zmq.SUB)
                zmq_socket.connect('tcp://127.0.0.1:5557')
                # Only messages starting with 'scheduled <cluster> ' are
                # delivered to this socket.
                zmq_socket.setsockopt_string(
                    zmq.SUBSCRIBE, 
                    unicode('scheduled {cluster} '.format(cluster=context['cluster'])))
                # Receive timeout (milliseconds per the zmq docs); when it
                # expires, recv raises zmq.error.Again.
                zmq_socket.setsockopt(zmq.RCVTIMEO, 15000)
                return zmq_socket
            zmq_socket = setup_zmq()
            while True:
                try:
                    # The values unpacked here are never used as-is: the
                    # finally clause takes test_id from the database.
                    # NOTE(review): the subscription prefix already holds
                    # two words, so a 'scheduled <cluster> <id>' message
                    # would split into three fields, not two -- confirm
                    # the publisher's message format.
                    cluster, test_id = zmq_socket.recv_string().split()
                except zmq.error.Again:
                    # Timed out without a notification; the finally
                    # clause re-runs the query anyway.
                    pass
                except zmq.error.ZMQError, e:
                    # NOTE(review): zmq.POLLERR is a poll-event flag, not
                    # an errno constant; presumably this matches only
                    # because its value coincides with EINTR -- confirm.
                    # ZMQErrors with any other errno are ignored here.
                    if e.errno == zmq.POLLERR:
                        log.error(e)
                        # Interrupted zmq socket code, reinitialize:
                        # I get this when I resize my terminal.. WTF?
                        zmq_socket = setup_zmq()
                finally:
                    # Runs after every receive attempt, whatever its outcome.
                    tests = db.get_scheduled_tests(context['cluster'], limit=1)
                    if len(tests) > 0:
                        test_id = tests[0]['test_id']
                        # NOTE(review): a `break` inside `finally` discards
                        # any exception still propagating (e.g. a ValueError
                        # from the unpacking above).
                        break
                    else:
                        # Send no-work-yet message:
                        console_publish(context['cluster'], {'ctl':'WAIT'})
                        command.respond(action='wait', follow_up=False)
Example #5
0
def cluster_comms(ws):
    """Websocket to communicate with the test clusters

    Commands are logical actions one end of the socket wishes the
    other end to take. Responses are follow-ups to a command; there
    can be several of them, going back and forth between client and
    server, until the receiving end marks a response as done.

    Command structure:
     {type:'command',
      command_id:'some unique string for this command',
      message:'some message to the other end',
      action:'some action for the receiver to take',
      // Extra parameters:
      foo: ...,
      bar: ...,
     }

    Response structure:
     {type:'response',
      command_id:'the command id this is a response to',
      message:'some message to the other end',
      done: true/false (the responder considers the command complete)
      // Extra parameters:
      foo: ...,
      bar: ...,
     }

    Possible commands:
      * authenticate - one peer asks the other to authenticate
        (the server asks first, then the client asks the server)
      * get_work - client asks for a test
      * test_done - client is done with a test, and sending artifacts
      * cancel_test - server asks client to cancel test
      * shutdown - server asks client to shutdown service

    Protocol:
     Authentication:
      * client initiates connection to this server
      * server sends client a random challenge token
        {type:'command', command_id:'zzz', action:'authenticate', token:'xxxxxxx'}
      * client signs the challenge token with its private key and sends the signature
        {type:'response', command_id:'zzz', cluster:'bdplab', signature:'xxxxxxxx'}
      * server verifies the signature against the token it sent and the public
        key it has on file for the cluster.
      * server sends a 'you are authenticated' response.
        {type:'response', command_id:'zzz', authenticated: true, done:true}
      * client then sends its own 'authenticate' command carrying a token;
        the server signs that token with its own key, responds with the
        signature, and expects the reply to report authenticated: true.

     Task loop:
      * client sends a 'give me work' request.
        {type:'command', command_id:'yyy', action:'get_work'}
      * server sends a 'ok, wait for work' response.
        {type:'response', command_id:'yyy', action:'wait'}
      * server sends a single test to the cluster
        {type:'response', command_id:'yyy', test:{...}}
      * client responds 'ok, received test' response
        {type:'response', command_id:'yyy', test_id:'xxxxxxx'}
      * server updates status of test to in_progress in database
        {type:'response', command_id:'yyy', message:'test_updated', done:true}
      * client sends artifacts via streaming protocol (See below)
      * client sends 'ok, test done, artifacts sent.' request.
        {type:'command', command_id:'llll', action:'test_done', test_id:'xxxxxxx'}
      * server updates status of test to completed
      * server sends a 'ok, test updated' response
        {type:'response', command_id:'llll', test_id:'xxxxxx', message:'test_update', done:true}

     Streaming:
      protocol for streaming raw data: console output, binary artifacts etc.
      * Sending peer sends a "I'm going to send binary data to you" request:
        {type:'command', command_id:'xxx', action:'stream', test_id:'xxxxx',
         kind:"[console|failure|chart|system_logs|stress_logs]", name:'name',
         eof:'$$$EOF$$$', keepalive:'$$$KEEPALIVE$$$'}
      * Receiving peer sends response indicating it's ready to receive the stream:
        {type:'response', command_id:'xxx', action:'ready'}
      * Peer starts sending arbitrary binary data messages.
      * The receiving peer reads binary data. If it encounters $$$KEEPALIVE$$$ as its own
        message, it will omit that data, as it's only meant to keep the socket open.
      * Once $$$EOF$$$ is seen by the receiving peer, in its own message, the receiving
        peer can respond:
        {type:'response', command_id:'xxx', message:'stream_received', done:true}

    """
    # State shared by the nested handlers below: the server's API key
    # (loaded from SERVER_KEY_PATH) and the connected cluster's name,
    # which stays None until authenticate() fills it in.
    context = {'apikey': APIKey.load(SERVER_KEY_PATH), 'cluster': None}

    def authenticate():
        """Run the mutual challenge/response handshake with the client.

        The server first challenges the client: a random token is sent in
        an 'authenticate' command, and the signature in the reply is
        verified with the public key stored for the cluster the client
        names. The roles then swap: the client sends its own
        'authenticate' command and the server signs the supplied token
        with its own key.

        On return, context['cluster'] holds the cluster name the client
        reported.

        Raises:
            AssertionError: the client's follow-up command is not an
                'authenticate' action.
            UnauthenticatedError: the client did not confirm our signature.
            Exception: whatever verify_message raises for a bad signature
                is re-raised after the client has been notified.
        """
        token_to_sign = random_token()
        cmd = Command.new(ws, action='authenticate', token=token_to_sign)
        response = cmd.send()
        # NOTE(review): the cluster name is recorded before the signature
        # is checked, so it stays set even if verification fails below.
        context['cluster'] = cluster = response['cluster']
        client_pubkey = db.get_pub_key(cluster)
        client_apikey = APIKey(client_pubkey['pubkey'])

        # Verify the client correctly signed the token:
        try:
            client_apikey.verify_message(token_to_sign,
                                         response.get('signature'))
        except Exception:
            # Tell the client why it is rejected, then let the original
            # error propagate. Exception (rather than a bare except) keeps
            # KeyboardInterrupt/SystemExit from being reported to the
            # client as a bad signature.
            response.respond(
                message='Bad Signature of token for authentication', done=True)
            log.error('client provided bad signature for auth token')
            raise

        response.respond(authenticated=True, done=True)

        # Client will ask us to authenticate too:
        command = receive_data(ws)
        # Explicit check instead of `assert`, which is stripped under -O.
        # AssertionError is kept so existing handlers still match.
        action = command.get('action')
        if action != 'authenticate':
            raise AssertionError(
                "Expected an 'authenticate' command from the client, "
                "got action %r" % (action,))
        data = {'signature': context['apikey'].sign_message(command['token'])}
        response = command.respond(**data)
        if response.get('authenticated') != True:
            raise UnauthenticatedError(
                "Our peer could not validate our signed auth token")

    def get_work(command):
        """Pick the id of the next test scheduled for this client's cluster.

        Tests still recorded as in progress for the cluster are first
        marked 'failed'. If a test is already scheduled, its id is taken
        right away. Otherwise a zmq SUB socket is used to wait for
        notifications: after every receive attempt (message, timeout or
        zmq error) the scheduled-test query is repeated, and each time it
        comes back empty the client is sent a 'wait' response.

        command: the client's get_work command; used here to send the
            'wait' responses.
        """
        # Mark any existing in_progress jobs for this cluster as
        # failed. If the cluster is asking for new work, then these
        # got dropped on the floor:
        for test in db.get_in_progress_tests(context['cluster']):
            db.update_test_status(test['test_id'], 'failed')

        # Find the next test scheduled for the client's cluster:
        tests = db.get_scheduled_tests(context['cluster'], limit=1)
        if len(tests) > 0:
            test_id = tests[0]['test_id']
        else:
            # No tests are currently scheduled.
            # Register a zmq listener of notifications of incoming tests, with a timeout.
            # When we see any test scheduled notification for our cluster, redo the query.
            # If timeout reached, redo the query anyway in case we missed the notification.
            def setup_zmq():
                # Builds a fresh SUB socket on a new zmq.Context each time
                # it is called; nothing here closes a previous socket.
                zmq_context = zmq.Context()
                zmq_socket = zmq_context.socket(zmq.SUB)
                zmq_socket.connect('tcp://127.0.0.1:5557')
                # Only messages starting with 'scheduled <cluster> ' are
                # delivered to this socket.
                zmq_socket.setsockopt_string(
                    zmq.SUBSCRIBE,
                    unicode('scheduled {cluster} '.format(
                        cluster=context['cluster'])))
                # Receive timeout (milliseconds per the zmq docs); when it
                # expires, recv raises zmq.error.Again.
                zmq_socket.setsockopt(zmq.RCVTIMEO, 15000)
                return zmq_socket

            zmq_socket = setup_zmq()
            while True:
                try:
                    # The values unpacked here are never used as-is: the
                    # finally clause takes test_id from the database.
                    # NOTE(review): the subscription prefix already holds
                    # two words, so a 'scheduled <cluster> <id>' message
                    # would split into three fields, not two -- confirm
                    # the publisher's message format.
                    cluster, test_id = zmq_socket.recv_string().split()
                except zmq.error.Again:
                    # Timed out without a notification; the finally
                    # clause re-runs the query anyway.
                    pass
                except zmq.error.ZMQError, e:
                    # NOTE(review): zmq.POLLERR is a poll-event flag, not
                    # an errno constant; presumably this matches only
                    # because its value coincides with EINTR -- confirm.
                    # ZMQErrors with any other errno are ignored here.
                    if e.errno == zmq.POLLERR:
                        log.error(e)
                        # Interrupted zmq socket code, reinitialize:
                        # I get this when I resize my terminal.. WTF?
                        zmq_socket = setup_zmq()
                finally:
                    # Runs after every receive attempt, whatever its outcome.
                    tests = db.get_scheduled_tests(context['cluster'], limit=1)
                    if len(tests) > 0:
                        test_id = tests[0]['test_id']
                        # NOTE(review): a `break` inside `finally` discards
                        # any exception still propagating (e.g. a ValueError
                        # from the unpacking above).
                        break
                    else:
                        # Send no-work-yet message:
                        console_publish(context['cluster'], {'ctl': 'WAIT'})
                        command.respond(action='wait', follow_up=False)