def tests():
    """Render the tests overview page.

    For every cluster name known to the database, the scheduled and the
    in-progress tests are looked up; a cluster only gets an entry in the
    corresponding mapping when that lookup is non-empty. The completed
    tests are fetched once, and everything is handed to the
    'tests.jinja2.html' template.

    Returns whatever ``render_template`` returns.
    """
    def _keep_if_any(bucket, key, found):
        # Clusters with nothing to show are left out of the mapping.
        if len(found) > 0:
            bucket[key] = found

    cluster_names = db.get_cluster_names()
    scheduled_by_cluster = {}
    in_progress_by_cluster = {}
    for name in cluster_names:
        # Same lookup order as before: scheduled first, then in-progress.
        _keep_if_any(scheduled_by_cluster, name, db.get_scheduled_tests(name))
        _keep_if_any(in_progress_by_cluster, name,
                     db.get_in_progress_tests(name))

    finished = db.get_completed_tests()
    return render_template(
        'tests.jinja2.html',
        clusters=cluster_names,
        cluster_scheduled_tests=scheduled_by_cluster,
        cluster_in_progress_tests=in_progress_by_cluster,
        completed_tests=finished)
def get_work(command):
    """Pick the next scheduled test for the requesting cluster.

    Any test still recorded as in progress for the cluster is first marked
    'failed'. If a test is already scheduled its id is taken directly;
    otherwise a zmq SUB socket is opened and the schedule is re-queried
    after every notification or receive timeout (RCVTIMEO of 15000).

    The cluster name is read from ``context['cluster']``; ``context`` is
    not defined in this function and must come from the surrounding scope.

    NOTE(review): ``command`` is not used by the code shown here, and the
    wait loop below contains no ``break``/``return`` - it only ends when an
    exception propagates. Confirm against the rest of the handler.
    """
    import errno

    # Mark any existing in_process jobs for this cluster as
    # failed. If the cluster is asking for new work, then these
    # got dropped on the floor:
    for test in db.get_in_progress_tests(context['cluster']):
        db.update_test_status(test['test_id'], 'failed')

    # Find the next test scheduled for the client's cluster:
    tests = db.get_scheduled_tests(context['cluster'], limit=1)
    if len(tests) > 0:
        test_id = tests[0]['test_id']
    else:
        # No tests are currently scheduled.
        # Register a zmq listener of notifications of incoming tests, with a timeout.
        # When we see any test scheduled notification for our cluster, redo the query.
        # If timeout reached, redo the query anyway in case we missed the notification.
        def setup_zmq():
            """Create a SUB socket filtered to this cluster's notifications."""
            zmq_context = zmq.Context()
            zmq_socket = zmq_context.socket(zmq.SUB)
            zmq_socket.connect('tcp://127.0.0.1:5557')
            zmq_socket.setsockopt_string(
                zmq.SUBSCRIBE,
                unicode('scheduled {cluster} '.format(cluster=context['cluster'])))
            zmq_socket.setsockopt(zmq.RCVTIMEO, 15000)
            return zmq_socket

        zmq_socket = setup_zmq()
        try:
            while True:
                try:
                    # NOTE(review): this unpack needs exactly two
                    # whitespace-separated fields; any other shape raises
                    # ValueError, which neither handler below catches.
                    # Confirm the publisher's message format.
                    cluster, test_id = zmq_socket.recv_string().split()
                except zmq.error.Again:
                    # Receive timed out; fall through and re-query.
                    pass
                except zmq.error.ZMQError as e:
                    # Test for an interrupted call with errno.EINTR rather
                    # than a zmq poll-event constant.
                    if e.errno == errno.EINTR:
                        log.error(e)
                        # Interrupted zmq socket code, reinitialize:
                        # I get this when I resize my terminal.. WTF?
                        # Close the old socket first so it is not leaked.
                        zmq_socket.close()
                        zmq_socket = setup_zmq()
                finally:
                    tests = db.get_scheduled_tests(context['cluster'], limit=1)
        finally:
            # Release the subscription socket however the loop is left.
            zmq_socket.close()
def get_work(command):
    """Pick the next scheduled test for the requesting cluster.

    Any test still recorded as in progress for the cluster is first marked
    'failed'. If a test is already scheduled its id is taken directly;
    otherwise a zmq SUB socket is opened and the schedule is re-queried
    after every notification or receive timeout (RCVTIMEO of 15000).

    The cluster name is read from ``context['cluster']``; ``context`` is
    not defined in this function and must come from the surrounding scope.

    NOTE(review): ``command`` is not used by the code shown here, and the
    wait loop below contains no ``break``/``return`` - it only ends when an
    exception propagates. Confirm against the rest of the handler.
    """
    import errno

    # Mark any existing in_process jobs for this cluster as
    # failed. If the cluster is asking for new work, then these
    # got dropped on the floor:
    for test in db.get_in_progress_tests(context['cluster']):
        db.update_test_status(test['test_id'], 'failed')

    # Find the next test scheduled for the client's cluster:
    tests = db.get_scheduled_tests(context['cluster'], limit=1)
    if len(tests) > 0:
        test_id = tests[0]['test_id']
    else:
        # No tests are currently scheduled.
        # Register a zmq listener of notifications of incoming tests, with a timeout.
        # When we see any test scheduled notification for our cluster, redo the query.
        # If timeout reached, redo the query anyway in case we missed the notification.
        def setup_zmq():
            """Create a SUB socket filtered to this cluster's notifications."""
            zmq_context = zmq.Context()
            zmq_socket = zmq_context.socket(zmq.SUB)
            zmq_socket.connect('tcp://127.0.0.1:5557')
            zmq_socket.setsockopt_string(
                zmq.SUBSCRIBE,
                unicode('scheduled {cluster} '.format(cluster=context['cluster'])))
            zmq_socket.setsockopt(zmq.RCVTIMEO, 15000)
            return zmq_socket

        zmq_socket = setup_zmq()
        try:
            while True:
                try:
                    # NOTE(review): this unpack needs exactly two
                    # whitespace-separated fields; any other shape raises
                    # ValueError, which neither handler below catches.
                    # Confirm the publisher's message format.
                    cluster, test_id = zmq_socket.recv_string().split()
                except zmq.error.Again:
                    # Receive timed out; fall through and re-query.
                    pass
                except zmq.error.ZMQError as e:
                    # Test for an interrupted call with errno.EINTR rather
                    # than a zmq poll-event constant.
                    if e.errno == errno.EINTR:
                        log.error(e)
                        # Interrupted zmq socket code, reinitialize:
                        # I get this when I resize my terminal.. WTF?
                        # Close the old socket first so it is not leaked.
                        zmq_socket.close()
                        zmq_socket = setup_zmq()
                finally:
                    tests = db.get_scheduled_tests(context['cluster'], limit=1)
        finally:
            # Release the subscription socket however the loop is left.
            zmq_socket.close()
def cluster_comms(ws):
    """Websocket to communicate with the test clusters

    ``ws`` is the websocket for one client connection; it is handed to
    ``Command.new`` and ``receive_data`` by the helpers defined below.

    Commands are logical actions one end of the socket wishes the
    other end to take. Responses are follow ups to a command, of which
    there can be multiple, back and forth between client and server
    until the receiving end marks a response as Done.

    Command structure:
     {type:'command',
      command_id:'some unique string for this command',
      message:'some message to the other end',
      action:'some action for the receiver to take',
      // Extra parameters:
      foo: ...,
      bar: ...,
     }

    Response structure:
     {type:'response',
      command_id:'the command id this is a response to',
      message:'some message to the other end',
      done: true/false (the responder considers the command complete)
      // Extra parameters:
      foo: ...,
      bar: ...,
     }

    Possible commands:
      * authenticate - server asks client to authenticate
      * get_work - client asks for a test
      * test_done - client is done with a test, and sending artifacts
      * cancel_test - server asks client to cancel test
      * shutdown - server asks client to shutdown service

    Protocol:
     Authentication:
      * client initiates connection to this server
      * server sends client a random challenge token
        {type:'command', command_id='zzzz', action:'authenticate', token:'xxxxxxx'}
      * client signs challenge token with its private key and sends the signature
        {type:'response', command_id='zzz', cluster:'bdplab', signature:'xxxxxxxx'}
      * server verifies the signature is against the token it sent and the public
        key it has on file for the cluster.
      * server sends a 'you are authenticated' response.
        {type:'response', command_id='zzz', authenticated: true, done:true}

     Task loop:
      * client sends a 'give me work' request.
        {type:'command', command_id='yyy', action:'get_work'}
      * server sends an 'ok, wait for work' response.
        {type:'response', command_id='yyy', action:'wait'}
      * server sends a single test to the cluster
        {type:'response', command_id='yyy', test:{...}}
      * client responds 'ok, received test' response
        {type:'response', command_id:'yyy', test_id:'xxxxxxx'}
      * server updates status of test to in_progress in database
        {type:'response', command_id:'yyy', message:'test_updated', done:true}
      * client sends artifacts via streaming protocol (See below)
      * client sends 'ok, test done, artifacts sent.' request.
        {type:'command', command_id:'llll', action:'test_done', test_id:'xxxxxxx'}
      * server updates status of test to completed
      * server sends an 'ok, test updated' response
        {type:'response', command_id:'llll', test_id:'xxxxxx', message='test_update', done:true}

     Streaming:
      protocol for streaming raw data: console output, binary artifacts etc.
      * Sending peer sends an "I'm going to send binary data to you" request:
        {type:'command', command_id='xxx', action:'stream', test_id='xxxxx',
         kind:"[console|failure|chart|system_logs|stress_logs]", name='name',
         eof='$$$EOF$$$', keepalive='$$$KEEPALIVE$$$'}
      * Receiving peer sends response indicating it's ready to receive the stream:
        {type:'response', command_id='xxx', action='ready'}
      * Peer starts sending arbitrary binary data messages.
      * The receiving peer reads binary data. If it encounters $$$KEEPALIVE$$$ as its own message, it will
        omit that data, as it's only meant to keep the socket open.
      * Once $$$EOF$$$ is seen by the receiving peer, in its own message, the receiving peer can respond:
        {type:'response', command_id='xxx', message:'stream_received', done:true}
    """
    # Per-connection state shared by the nested helpers; 'cluster' is
    # filled in by authenticate() from the client's response.
    context = {'apikey': APIKey.load(SERVER_KEY_PATH), 'cluster': None}

    def authenticate():
        """Run the mutual challenge/response handshake over ``ws``.

        Raises whatever ``verify_message`` raises when the client's
        signature is rejected, AssertionError when the client's follow-up
        command is not 'authenticate', and UnauthenticatedError when the
        client does not report our signature as authenticated.
        """
        token_to_sign = random_token()
        cmd = Command.new(ws, action='authenticate', token=token_to_sign)
        response = cmd.send()
        context['cluster'] = cluster = response['cluster']
        client_pubkey = db.get_pub_key(cluster)
        client_apikey = APIKey(client_pubkey['pubkey'])
        # Verify the client correctly signed the token:
        try:
            client_apikey.verify_message(token_to_sign, response.get('signature'))
        except Exception:
            # Tell the peer why, log it, then let the error propagate.
            response.respond(message='Bad Signature of token for authentication', done=True)
            log.error('client provided bad signature for auth token')
            raise
        response.respond(authenticated=True, done=True)

        # Client will ask us to authenticate too:
        command = receive_data(ws)
        # Explicit raise (same exception type as the former assert) so the
        # check is not stripped when Python runs with -O.
        if command.get('action') != 'authenticate':
            raise AssertionError("Expected an 'authenticate' command from the peer")
        data = {'signature': context['apikey'].sign_message(command['token'])}
        response = command.respond(**data)
        if response.get('authenticated') != True:
            raise UnauthenticatedError("Our peer could not validate our signed auth token")

    def get_work(command):
        """Select the next scheduled test for the authenticated cluster.

        Tests still recorded as in progress for the cluster are marked
        'failed' first. When nothing is scheduled, a zmq SUB socket is
        opened and the schedule is re-queried after every notification or
        receive timeout; each empty poll publishes a WAIT control message
        and answers ``command`` with action='wait'.
        """
        import errno

        # Mark any existing in_process jobs for this cluster as
        # failed. If the cluster is asking for new work, then these
        # got dropped on the floor:
        for test in db.get_in_progress_tests(context['cluster']):
            db.update_test_status(test['test_id'], 'failed')

        # Find the next test scheduled for the client's cluster:
        tests = db.get_scheduled_tests(context['cluster'], limit=1)
        if len(tests) > 0:
            test_id = tests[0]['test_id']
        else:
            # No tests are currently scheduled.
            # Register a zmq listener of notifications of incoming tests, with a timeout.
            # When we see any test scheduled notification for our cluster, redo the query.
            # If timeout reached, redo the query anyway in case we missed the notification.
            def setup_zmq():
                """Create a SUB socket filtered to this cluster's notifications."""
                zmq_context = zmq.Context()
                zmq_socket = zmq_context.socket(zmq.SUB)
                zmq_socket.connect('tcp://127.0.0.1:5557')
                zmq_socket.setsockopt_string(
                    zmq.SUBSCRIBE,
                    unicode('scheduled {cluster} '.format(cluster=context['cluster'])))
                zmq_socket.setsockopt(zmq.RCVTIMEO, 15000)
                return zmq_socket

            zmq_socket = setup_zmq()
            try:
                while True:
                    try:
                        # NOTE(review): this unpack needs exactly two
                        # whitespace-separated fields; any other shape
                        # raises ValueError, which neither handler below
                        # catches. Confirm the publisher's message format.
                        cluster, test_id = zmq_socket.recv_string().split()
                    except zmq.error.Again:
                        # Receive timed out; fall through and re-query.
                        pass
                    except zmq.error.ZMQError as e:
                        # Test for an interrupted call with errno.EINTR
                        # rather than a zmq poll-event constant.
                        if e.errno == errno.EINTR:
                            log.error(e)
                            # Interrupted zmq socket code, reinitialize:
                            # I get this when I resize my terminal.. WTF?
                            # Close the old socket first so it is not leaked.
                            zmq_socket.close()
                            zmq_socket = setup_zmq()
                    finally:
                        # NOTE(review): leaving a finally clause through
                        # `break` discards any exception still in flight,
                        # so an error raised above is dropped whenever a
                        # test is found. Confirm this is relied upon
                        # before restructuring.
                        tests = db.get_scheduled_tests(context['cluster'], limit=1)
                        if len(tests) > 0:
                            test_id = tests[0]['test_id']
                            break
                        else:
                            # Send no-work-yet message:
                            console_publish(context['cluster'], {'ctl':'WAIT'})
                            command.respond(action='wait', follow_up=False)
            finally:
                # Release the subscription socket however the loop is left.
                zmq_socket.close()
def cluster_comms(ws): """Websocket to communicate with the test clusters Commands are logical actions one end of the socket wishes the other end to take. Responses are follow ups to a command, which there can be multiple, back and forth between client and server until the receiving end marks a response as Done. Command structure: {type:'command', command_id:'some unique string for this command', message:'some message to the other end', action:'some action for the receiver to take', // Extra parameters: foo: ..., bar: ..., } Response structure: {type:'response', command_id:'the command id this is a response to', message:'some message to the other end', done: true/false (the responder considers the command complete) // Extra parameters: foo: ..., bar: ..., } Possible commands: * authenticate - server asks client to authenticate * get_work - client asks for a test * test_done - client is done with a test, and sending artifacts * cancel_test - server asks client to cancel test * shutdown - server asks client to shutdown service Protocol: Authentication: * client initiates connection to this server * server sends client a random challenge token {type:'command', command_id='zzzz', action:'authenticate', token:'xxxxxxx'} * client signs challenge token with it's private key ands sends the signature {type:'response', command_id='zzz', cluster:'bdplab', signature:'xxxxxxxx'} * server verifies the signature is against the token it sent and the public key it has on file for the cluster. * server sends a 'you are authenticated' response. {type:'response', command_id='zzz', authenticated: true, done:true} Task loop: * client sends a 'give me work' request. {type:'command', command_id='yyy', action:'get_work'} * server sends a 'ok, wait for work' response. 
{type:'response', command_id='yyy', action:'wait'} * server sends a single test to the cluster {type:'response', command_id='yyy', test:{...}} * client responds 'ok, received test' response {type:'response', command_id:'yyy', test_id:'xxxxxxx'} * server updates status of test to in_progress in database {type:'response', command_id:'yyy', message:'test_updated', done:true} * client sends artifacts via streaming protocol (See below) * client sends 'ok, test done, artifacts sent.' request. {type:'command', command_id:'llll', action:'test_done', test_id:'xxxxxxx'} * server updates status of test to completed * server sends an 'ok, test updated' response {type:'response', command_id:'llll', test_id:'xxxxxx', message='test_update', done:true} Streaming: protocol for streaming raw data: console output, binary artifacts etc. * Sending peer sends an "I'm going to send binary data to you" request: {type:'command', command_id='xxx', action:'stream', test_id='xxxxx', kind:"[console|failure|chart|system_logs|stress_logs]", name='name', eof='$$$EOF$$$', keepalive='$$$KEEPALIVE$$$'} * Receiving peer sends response indicating it's ready to receive the stream: {type:'response', command_id='xxx', action='ready'} * Peer starts sending arbitrary binary data messages. * The receiving peer reads binary data. If it encounters $$$KEEPALIVE$$$ as its own message, it will omit that data, as it's only meant to keep the socket open. 
* Once $$$EOF$$$ is seen by the receiving peer, in it's own message, the receiving peer can respond: {type:'response', command_id='xxx', message:'stream_received', done:true} """ context = {'apikey': APIKey.load(SERVER_KEY_PATH), 'cluster': None} def authenticate(): token_to_sign = random_token() cmd = Command.new(ws, action='authenticate', token=token_to_sign) response = cmd.send() context['cluster'] = cluster = response['cluster'] client_pubkey = db.get_pub_key(cluster) client_apikey = APIKey(client_pubkey['pubkey']) # Verify the client correctly signed the token: try: client_apikey.verify_message(token_to_sign, response.get('signature')) except: response.respond( message='Bad Signature of token for authentication', done=True) log.error('client provided bad signature for auth token') raise response.respond(authenticated=True, done=True) # Client will ask us to authenticate too: command = receive_data(ws) assert command.get('action') == 'authenticate' data = {'signature': context['apikey'].sign_message(command['token'])} response = command.respond(**data) if response.get('authenticated') != True: raise UnauthenticatedError( "Our peer could not validate our signed auth token") def get_work(command): # Mark any existing in_process jobs for this cluster as # failed. If the cluster is asking for new work, then these # got dropped on the floor: for test in db.get_in_progress_tests(context['cluster']): db.update_test_status(test['test_id'], 'failed') # Find the next test scheduled for the client's cluster: tests = db.get_scheduled_tests(context['cluster'], limit=1) if len(tests) > 0: test_id = tests[0]['test_id'] else: # No tests are currently scheduled. # Register a zmq listener of notifications of incoming tests, with a timeout. # When we see any test scheduled notification for our cluster, redo the query. # If timeout reached, redo the query anyway in case we missed the notification. 
def setup_zmq(): zmq_context = zmq.Context() zmq_socket = zmq_context.socket(zmq.SUB) zmq_socket.connect('tcp://127.0.0.1:5557') zmq_socket.setsockopt_string( zmq.SUBSCRIBE, unicode('scheduled {cluster} '.format( cluster=context['cluster']))) zmq_socket.setsockopt(zmq.RCVTIMEO, 15000) return zmq_socket zmq_socket = setup_zmq() while True: try: cluster, test_id = zmq_socket.recv_string().split() except zmq.error.Again: pass except zmq.error.ZMQError, e: if e.errno == zmq.POLLERR: log.error(e) # Interrupted zmq socket code, reinitialize: # I get this when I resize my terminal.. WTF? zmq_socket = setup_zmq() finally: tests = db.get_scheduled_tests(context['cluster'], limit=1) if len(tests) > 0: test_id = tests[0]['test_id'] break else: # Send no-work-yet message: console_publish(context['cluster'], {'ctl': 'WAIT'}) command.respond(action='wait', follow_up=False)