Example 1
def __init__(self, queue, leader, replica_factor=2, replica_offset=5, interface=None,
             port=6001, cluster_port=6000):
    if interface is None:
        interface = socket.gethostbyname(socket.gethostname())
    self.interface = interface
    self.port = port

    self.dispatcher = DispatchClient(interface, self._dispatcher_event)
    self.cluster = ClusterManager(leader, callback=self._cluster_update,
                                  interface=interface, port=cluster_port)
    self.backend = gevent.server.StreamServer((interface, port), self._backend_server)
    self.peers = set()
    self.connections = {}

    self.queue = queue
    self.scheduled = {}
    self.scheduled_acks = {}
    self.schedules = 0

    self.replica_factor = replica_factor
    self.replica_offset = replica_offset
Example 2
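import socket
import time

import gevent
import gevent.queue
import gevent.server
import gevent.socket

# Project-local imports (DispatchClient, ClusterManager, Task, ScheduleError,
# util, constants) come from the surrounding codebase and are not shown here.
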
class DistributedScheduler(object):
    def __init__(self, queue, leader, replica_factor=2, replica_offset=5, interface=None, 
        port=6001, cluster_port=6000):
        if interface is None:
            interface = socket.gethostbyname(socket.gethostname())
        self.interface = interface
        self.port = port
        
        self.dispatcher = DispatchClient(interface, self._dispatcher_event)
        self.cluster = ClusterManager(leader, callback=self._cluster_update, 
                            interface=interface, port=cluster_port)
        self.backend = gevent.server.StreamServer((interface, port), self._backend_server)
        self.peers = set()
        self.connections = {}
        
        self.queue = queue
        self.scheduled = {}
        self.scheduled_acks = {}
        self.schedules = 0
        
        self.replica_factor = replica_factor
        self.replica_offset = replica_offset

        
    def start(self):
        self.dispatcher.start()
        self.backend.start()
        self.cluster.start()
    
    def schedule(self, task):
        host_list = list(self.peers)
        # This implements round-robin N replication for picking which hosts to
        # send the task to. Each schedule call moves along the cluster ring by
        # one, then picks N hosts, where N is the replication factor.
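        # Example: with 3 peers and replica_factor=2, the call made when
        # self.schedules == 4 picks indexes 4 % 3 and 5 % 3, i.e. hosts 1 and 2.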
        replication_factor = min(self.replica_factor, len(host_list))
        host_ids = [(self.schedules + n) % len(host_list) for n in xrange(replication_factor)]
        hosts = [host_list[id] for id in host_ids]
        task.replica_hosts = hosts
        self.scheduled_acks[task.id] = gevent.queue.Queue()
        for host in hosts:
            self.connections[host].send('schedule:%s\n' % task.serialize())
            task.replica_offset += self.replica_offset
        try:
            # TODO: document, wrap this whole operation in timeout
            return all([self.scheduled_acks[task.id].get(timeout=2) for h in hosts])
        except gevent.queue.Empty:
            raise ScheduleError("not all hosts acked")
        finally:
            self.schedules += 1 
            self.scheduled_acks.pop(task.id)

    
    def _cluster_update(self, hosts):
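        # Callback from ClusterManager: diff the new host set against the current
        # peers and spawn greenlets to connect to / disconnect from the changed hosts.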
        add_hosts = hosts - self.peers
        remove_hosts = self.peers - hosts
        for host in remove_hosts:
            print "disconnecting from peer %s" % host
            gevent.spawn(self._remove_peer, host)
        for host in add_hosts:
            print "connecting to peer %s" % (host)
            gevent.spawn(self._add_peer, host)
        self.peers = hosts
    
    def _add_peer(self, host):
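        # Open a connection to the peer's backend port and consume
        # "scheduled:<task_id>" acks until the connection drops.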
        client = gevent.socket.create_connection((host, self.port), source_address=(self.interface, 0))
        self.connections[host] = client
        for line in util.line_protocol(client):
            ack, task_id = line.split(':', 1)
            if ack == 'scheduled' and task_id in self.scheduled_acks:
                self.scheduled_acks[task_id].put(True)
        print "disconnected from peer %s" % host
        self._remove_peer(host)
    
    def _remove_peer(self, host):
        if host in self.connections:
            peer = self.connections.pop(host)
            try:
                peer.shutdown(0)
            except:
                pass

    def _dispatcher_event(self, event, payload):
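        # Handle task lifecycle events from the local dispatcher and keep the
        # replicas that hold copies of the task in sync.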
        if event == 'start':
            task = self.scheduled[payload]
            eta = int(time.time() + constants.WORKER_TIMEOUT)
            self._sendto_replicas(task, 'reschedule:%s:%s\n' % (task.id, eta))
        
        elif event == 'success':
            task = self.scheduled[payload]
            self._sendto_replicas(task, 'cancel:%s\n' % task.id)
            self.scheduled.pop(task.id)
        
        elif event == 'failure':
            task_id, reason = payload.split(':', 1)
            self.scheduled.pop(task_id)
            print "FAILURE %s: %s" % (task_id, reason)
    
    def _sendto_replicas(self, task, message):
        other_replica_hosts = set(task.replica_hosts) - set([self.interface])
        for host in other_replica_hosts:
            if host in self.connections:
                self.connections[host].send(message)
    
    def _backend_server(self, socket, address):
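        # Line-protocol handler for messages from peer schedulers:
        # schedule, cancel, and reschedule.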
        for line in util.line_protocol(socket):
            action, payload = line.split(':', 1)
            
            if action == 'schedule':
                task = Task.unserialize(payload)
                task.schedule(self.dispatcher)
                self.scheduled[task.id] = task
                socket.send('scheduled:%s\n' % task.id)
                print "scheduled: %s" % task.id
            
            elif action == 'cancel':
                task_id = payload
                print "canceled: %s" % task_id
                self.scheduled.pop(task_id).cancel()
            
            elif action == 'reschedule':
                task_id, eta = payload.split(':', 1)
                eta = int(eta)
                print "rescheduled: %s for %s" % (task_id, eta)
                self.scheduled[task_id].reschedule(self.dispatcher, eta)
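
A minimal usage sketch for the class above, assuming the surrounding application already provides a work queue and the cluster leader's address; the leader IP shown here is a placeholder, and the commented schedule call stands in for a Task instance built elsewhere.

queue = gevent.queue.Queue()                                  # shared work queue (assumed)
scheduler = DistributedScheduler(queue, leader='10.0.0.1')    # leader address is a placeholder
scheduler.start()

# With a Task instance built elsewhere in the application:
#     if scheduler.schedule(task):
#         print "task %s accepted by %d replicas" % (task.id, len(task.replica_hosts))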
Example 3
import zmq
import time
import threading
import json
import socket
from cluster import ClusterManager, Node

def stop_all():
    clusterManager.stop()
    ctx.term()

def print_status():
    print "Local node: {}, running: {}, connected: {}".format(
        clusterManager.node.name, clusterManager.running(), clusterManager.connected())
    print "Connected Nodes: {}".format(clusterManager.nodes)

ctx = zmq.Context(1)
clusterManager = ClusterManager(ctx, 5560, 5570)
clusterManager.start()
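
A short follow-up sketch using the helpers defined above; the five-second polling interval is an arbitrary choice.

# Report cluster state periodically, then shut everything down on Ctrl-C.
try:
    while True:
        print_status()
        time.sleep(5)   # arbitrary polling interval
except KeyboardInterrupt:
    stop_all()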