Ejemplo n.º 1
0
    def __init__(self, networkmgr, etcdclient, addr, mode):
        """Prepare the system bridge, load node lists and wait for all nodes.

        networkmgr -- stored on the instance as self.networkmgr
        etcdclient -- stored on the instance as self.etcd
        addr       -- address of this node (only logged here)
        mode       -- 'new' recreates the 'docklet-br' bridge; any other
                      value requires the bridge to exist already

        The process exits with status 1 when the bridge is missing in
        non-'new' mode, or when the run-node list read at startup is not
        empty. The call blocks until every entry of self.allnodes is also
        found in self.runnodes.
        """
        self.addr = addr
        logger.info ("begin initialize on %s" % self.addr)
        self.networkmgr = networkmgr
        self.etcd = etcdclient
        self.mode = mode

        logger.info ("initialize network")

        # 'docklet-br' is given no IP address: every user has a gateway.
        bridge = 'docklet-br'
        if self.mode == 'new':
            # start from a freshly created bridge
            if netcontrol.bridge_exists(bridge):
                netcontrol.del_bridge(bridge)
            netcontrol.new_bridge(bridge)
        elif not netcontrol.bridge_exists(bridge):
            logger.error("docklet-br not found")
            sys.exit(1)

        # read both node lists through the _nodelist_etcd helper
        self.allnodes = self._nodelist_etcd("allnodes")
        self.runnodes = self._nodelist_etcd("runnodes")
        logger.info ("all nodes are: %s" % self.allnodes)
        logger.info ("run nodes are: %s" % self.runnodes)
        if len(self.runnodes) > 0:
            logger.error ("init runnodes is not null, need to be clean")
            sys.exit(1)

        # rpc list starts empty
        self.rpcs = []

        # background thread that watches for newly joining nodes
        logger.info ("start thread to watch new nodes ...")
        watcher = threading.Thread(target=self._watchnewnode)
        self.thread_watchnewnode = watcher
        watcher.start()

        # Poll every 50 ms until all known nodes are listed as running.
        # NOTE(review): presumably _watchnewnode fills self.runnodes - confirm.
        while not all(node in self.runnodes for node in self.allnodes):
            time.sleep(0.05)
        logger.info("all nodes necessary joins ...")
        logger.info ("run nodes are: %s" % self.runnodes)
Ejemplo n.º 2
0
    def __init__(self, networkmgr, etcdclient, addr, mode):
        """Prepare the system bridge, load node lists and wait for all nodes.

        networkmgr -- stored on the instance as self.networkmgr
        etcdclient -- stored on the instance as self.etcd
        addr       -- address of this node (only logged here)
        mode       -- 'new' recreates the 'docklet-br' bridge; any other
                      value requires the bridge to exist already

        The process exits with status 1 when the bridge is missing in
        non-'new' mode, or when the run-node list read at startup is not
        empty. The call blocks until every entry of self.allnodes is also
        found in self.runnodes.
        """
        self.addr = addr
        logger.info("begin initialize on %s" % self.addr)
        self.networkmgr = networkmgr
        self.etcd = etcdclient
        self.mode = mode

        logger.info("initialize network")

        # 'docklet-br' is given no IP address: every user has a gateway.
        bridge = 'docklet-br'
        if self.mode == 'new':
            # start from a freshly created bridge
            if netcontrol.bridge_exists(bridge):
                netcontrol.del_bridge(bridge)
            netcontrol.new_bridge(bridge)
        elif not netcontrol.bridge_exists(bridge):
            logger.error("docklet-br not found")
            sys.exit(1)

        # read both node lists through the _nodelist_etcd helper
        self.allnodes = self._nodelist_etcd("allnodes")
        self.runnodes = self._nodelist_etcd("runnodes")
        logger.info("all nodes are: %s" % self.allnodes)
        logger.info("run nodes are: %s" % self.runnodes)
        if len(self.runnodes) > 0:
            logger.error("init runnodes is not null, need to be clean")
            sys.exit(1)

        # rpc list starts empty
        self.rpcs = []

        # background thread that watches for newly joining nodes
        logger.info("start thread to watch new nodes ...")
        watcher = threading.Thread(target=self._watchnewnode)
        self.thread_watchnewnode = watcher
        watcher.start()

        # Poll every 50 ms until all known nodes are listed as running.
        # NOTE(review): presumably _watchnewnode fills self.runnodes - confirm.
        while not all(node in self.runnodes for node in self.allnodes):
            time.sleep(0.05)
        logger.info("all nodes necessary joins ...")
        logger.info("run nodes are: %s" % self.runnodes)
Ejemplo n.º 3
0
 def del_usrgwbr(self, username, uid, nodemgr):
     """Delete the gateway and bridge of a user and drop the stored record.

     username -- key into self.usrgws and suffix of the etcd key
     uid      -- appended to 'docklet-br-' to form the bridge name
     nodemgr  -- used (ip_to_rpc) to reach the host when it is not the master

     Returns [False, message] when username has no entry in self.usrgws,
     otherwise [True, message] after the deletion calls were issued. The
     results of the del_gw / del_bridge calls are not inspected.
     """
     if username not in self.usrgws:
         return [False, "user does't have gateway or user doesn't exist."]

     gateway_host = self.usrgws[username]
     logger.info("Delete user %s(%s) gateway on %s" %(username, str(uid), gateway_host))

     # Act locally through netcontrol when the gateway is on the master,
     # otherwise through the object returned by nodemgr.ip_to_rpc.
     if gateway_host == self.masterip:
         target = netcontrol
     else:
         target = nodemgr.ip_to_rpc(gateway_host)
     bridge = 'docklet-br-'+str(uid)
     target.del_gw(bridge, username)
     target.del_bridge(bridge)

     # forget the gateway both in memory and in etcd
     del self.usrgws[username]
     self.etcd.delkey("network/usrgws/"+username)
     return [True, 'delete user\' gateway success']
Ejemplo n.º 4
0
 def del_usrgwbr(self, username, uid, nodemgr):
     """Delete the gateway and bridge of a user and drop the stored record.

     username -- key into self.usrgws and suffix of the etcd key
     uid      -- appended to 'docklet-br-' to form the bridge name
     nodemgr  -- used (ip_to_rpc) to reach the host when it is not the master

     Returns [False, message] when username has no entry in self.usrgws,
     otherwise [True, message] after the deletion calls were issued. The
     results of the del_gw / del_bridge calls are not inspected.
     """
     if username not in self.usrgws:
         return [False, "user does't have gateway or user doesn't exist."]

     gateway_host = self.usrgws[username]
     logger.info("Delete user %s(%s) gateway on %s" %(username, str(uid), gateway_host))

     # Act locally through netcontrol when the gateway is on the master,
     # otherwise through the object returned by nodemgr.ip_to_rpc.
     if gateway_host == self.masterip:
         target = netcontrol
     else:
         target = nodemgr.ip_to_rpc(gateway_host)
     bridge = 'docklet-br-'+str(uid)
     target.del_gw(bridge, username)
     target.del_bridge(bridge)

     # forget the gateway both in memory and in etcd
     del self.usrgws[username]
     self.etcd.delkey("network/usrgws/"+username)
     return [True, 'delete user\' gateway success']
Ejemplo n.º 5
0
 def check_switch(self, switchid):
     """Ensure the virtual switch exists and is recorded in self.__switches.

     switchid is converted with int() before use.

     Returns [True, message] on success, or [False, message] when deleting
     or creating the bridge reports failure.
     """
     switchid = int(switchid)

     # In 'new' mode a bridge that exists but is not tracked yet is removed
     # first (presumably a leftover from an earlier run - confirm).
     untracked = switchid not in self.__switches
     if untracked and self.__mode == 'new' and netcontrol.bridge_exists(switchid):
         removed, _ = netcontrol.del_bridge(switchid)
         if not removed:
             return [False, "del virtual switch failed with mode new"]

     # create the bridge when it is (now) missing
     if not netcontrol.bridge_exists(switchid):
         created, _ = netcontrol.new_bridge(switchid)
         if not created:
             return [False, "add virtual switch failed"]

     # record the switch once
     if switchid not in self.__switches:
         self.__switches.append(switchid)
     return [True, "check virtual switch okay"]
Ejemplo n.º 6
0
 def check_switch_with_gre(self, switchid):
     """Ensure the virtual switch exists and has a GRE link to the master.

     When this node is the master itself nothing is done and True is
     returned. Otherwise switchid is converted with int(), the bridge is
     (re)created as needed, recorded in self.__switches, and a GRE tunnel
     to the master is set up if netcontrol reports none.

     Returns True on the master shortcut, [True, message] on success and
     [False, message] when a netcontrol operation reports failure.
     """
     # NOTE(review): this shortcut returns a bare True while every other
     # path returns a [status, message] list - confirm callers cope with both.
     if self.__addr == self.__master:
         return True

     switchid = int(switchid)

     # In 'new' mode a bridge that exists but is not tracked yet is removed
     # first (presumably a leftover from an earlier run - confirm).
     untracked = switchid not in self.__switches
     if untracked and self.__mode == 'new' and netcontrol.bridge_exists(switchid):
         removed, _ = netcontrol.del_bridge(switchid)
         if not removed:
             return [False, "del virtual switch failed with mode new"]

     # create the bridge when it is (now) missing
     if not netcontrol.bridge_exists(switchid):
         created, _ = netcontrol.new_bridge(switchid)
         if not created:
             return [False, "add virtual switch failed"]

     # record the switch once
     if switchid not in self.__switches:
         self.__switches.append(switchid)

     # add the GRE tunnel to the master unless one is already present
     if not netcontrol.gre_exists(switchid, self.__master):
         linked, _ = netcontrol.setup_gre(switchid, self.__master, switchid)
         if not linked:
             return [False, "setup gre to master failed"]
     return [True, "check virtual switch okay"]
Ejemplo n.º 7
0
    def __init__(self, networkmgr, etcdclient, addr, mode):
        """Prepare the system bridge, reconnect to running nodes and wait for all.

        networkmgr -- stored on the instance as self.networkmgr
        etcdclient -- stored on the instance as self.etcd
        addr       -- address of this node (only logged here)
        mode       -- 'new' recreates the 'docklet-br' bridge; any other
                      value requires the bridge to exist already

        Entries under machines/runnodes whose value is 'ok' are added to
        self.runnodes and get an XML-RPC proxy in self.rpcs, using the port
        from the WORKER_PORT setting. The process exits with status 1 when
        the bridge is missing in non-'new' mode. The call blocks until every
        entry of self.allnodes is also found in self.runnodes.
        """
        self.addr = addr
        logger.info ("begin initialize on %s" % self.addr)
        self.networkmgr = networkmgr
        self.etcd = etcdclient
        self.mode = mode
        self.workerport = env.getenv('WORKER_PORT')

        logger.info ("initialize network")

        # 'docklet-br' is given no IP address: every user has a gateway.
        bridge = 'docklet-br'
        if self.mode == 'new':
            # start from a freshly created bridge
            if netcontrol.bridge_exists(bridge):
                netcontrol.del_bridge(bridge)
            netcontrol.new_bridge(bridge)
        elif not netcontrol.bridge_exists(bridge):
            logger.error("docklet-br not found")
            sys.exit(1)

        # rpc list starts empty
        self.rpcs = []

        # Used in recovery mode: pick up the nodes already marked as running.
        self.allnodes = self._nodelist_etcd("allnodes")
        self.runnodes = []
        _status, entries = self.etcd.listdir("machines/runnodes")
        for entry in entries:
            # the node ip is the last path component of the etcd key
            nodeip = entry['key'].rsplit('/', 1)[1]
            if entry['value'] != 'ok':
                continue
            logger.info ("running node %s" % nodeip)
            self.runnodes.append(nodeip)
            proxy = xmlrpc.client.ServerProxy("http://%s:%s" % (nodeip, self.workerport))
            self.rpcs.append(proxy)
            logger.info ("add %s:%s in rpc client list" % (nodeip, self.workerport))

        logger.info ("all nodes are: %s" % self.allnodes)
        logger.info ("run nodes are: %s" % self.runnodes)

        # background thread that watches for newly joining nodes
        logger.info ("start thread to watch new nodes ...")
        watcher = threading.Thread(target=self._watchnewnode)
        self.thread_watchnewnode = watcher
        watcher.start()

        # Poll every 50 ms until all known nodes are listed as running.
        # NOTE(review): presumably _watchnewnode fills self.runnodes - confirm.
        while not all(node in self.runnodes for node in self.allnodes):
            time.sleep(0.05)
        logger.info("all nodes necessary joins ...")
        logger.info ("run nodes are: %s" % self.runnodes)
Ejemplo n.º 8
0
    def __init__(self, etcdclient, addr, port):
        """Bring up a worker: register in etcd, prepare storage, RPC and network.

        Steps, in order:
          1. write "waiting" to machines/runnodes/<addr> and read it back;
          2. compare the etcd "token" key with <FS_PREFIX>/global/token;
          3. select 'recovery' mode when this node's key is listed under
             machines/allnodes, otherwise 'new' mode;
          4. 'new': refuse to continue if containers are recorded globally,
             delete all containers and create the LVM group;
             'recovery': recover the LVM group and check all containers;
          5. create the XML-RPC server and the monitor collectors;
          6. unless this node is also the master, prepare 'docklet-br' and
             the GRE tunnel to the master.

        Fatal checks log an error and call sys.exit(1); a failed container
        check in recovery mode is only logged.

        etcdclient -- etcd client providing getkey / setkey / listdir
        addr       -- address of this worker, used in etcd keys and for RPC
        port       -- RPC port; converted with int() before use
        """
        self.addr = addr
        self.port = port
        logger.info ("begin initialize on %s" % self.addr)

        self.fspath = env.getenv('FS_PREFIX')
        self.poolsize = env.getenv('DISKPOOL_SIZE')

        self.etcd = etcdclient
        self.master = self.etcd.getkey("service/master")[1]
        self.mode=None

        # waiting state is preserved for compatible.
        runnode_key = "machines/runnodes/"+self.addr
        self.etcd.setkey(runnode_key, "waiting")
        # get this node's key to judge how to init.
        [status, _] = self.etcd.getkey(runnode_key)
        if status:
            self.key = generatekey("machines/allnodes/"+self.addr)
        else:
            logger.error("get key failed. %s" % runnode_key)
            sys.exit(1)

        # check token to check global directory
        [status, token_1] = self.etcd.getkey("token")
        # the context manager closes the token file once the line is read
        with open(self.fspath+"/global/token", 'r') as tokenfile:
            token_2 = tokenfile.readline().strip()
        if token_1 != token_2:
            logger.error("check token failed, global directory is not a shared filesystem")
            sys.exit(1)
        logger.info ("worker registered and checked the token")

        # worker search all run nodes to judge how to init
        # If the node in all node list, we will recover it.
        # Otherwise, this node is new added in.
        [status, alllist] = self.etcd.listdir("machines/allnodes")
        if any(node['key'] == self.key for node in alllist):
            value = 'init-recovery'
        else:
            value = 'init-new'
        logger.info("worker start in "+value+" mode")

        Containers = container.Container(self.addr, etcdclient)
        if value == 'init-new':
            logger.info ("init worker with mode:new")
            self.mode='new'
            # check global directory do not have containers on this worker
            [both, _, onlyglobal] = Containers.diff_containers()
            if len(both+onlyglobal) > 0:
                logger.error ("mode:new will clean containers recorded in global, please check")
                sys.exit(1)
            [status, _] = Containers.delete_allcontainers()
            if not status:
                logger.error ("delete all containers failed")
                sys.exit(1)
            # create new lvm VG at last
            new_group("docklet-group",self.poolsize,self.fspath+"/local/docklet-storage")
        elif value == 'init-recovery':
            logger.info ("init worker with mode:recovery")
            self.mode='recovery'
            # recover lvm VG first
            recover_group("docklet-group",self.fspath+"/local/docklet-storage")
            [status, _] = Containers.check_allcontainers()
            if status:
                logger.info ("all containers check ok")
            else:
                # deliberately non-fatal: the worker keeps starting
                logger.info ("not all containers check ok")
        else:
            logger.error ("worker init mode:%s not supported" % value)
            sys.exit(1)

        # initialize rpc
        # the server address is (ip-addr, port); logRequests=False : not print rpc log
        logger.info ("initialize rpcserver %s:%d" % (self.addr, int(self.port)))
        self.rpcserver = ThreadXMLRPCServer((self.addr, int(self.port)), allow_none=True, logRequests=False)
        self.rpcserver.register_introspection_functions()
        self.rpcserver.register_instance(Containers)
        self.rpcserver.register_function(monitor.workerFetchInfo)

        # init collector to collect monitor infomation
        self.con_collector = monitor.Container_Collector()
        self.hosts_collector = monitor.Collector()

        # initialize the network
        # if worker and master run on the same node, reuse bridges
        #                     don't need to create new bridges
        if (self.addr == self.master):
            logger.info ("master also on this node. reuse master's network")
        else:
            logger.info ("initialize network")
            # 'docklet-br' of worker do not need IP Addr.
            if self.mode == 'new':
                if netcontrol.bridge_exists('docklet-br'):
                    netcontrol.del_bridge('docklet-br')
                netcontrol.new_bridge('docklet-br')
            else:
                if not netcontrol.bridge_exists('docklet-br'):
                    logger.error("docklet-br not found")
                    sys.exit(1)
            logger.info ("setup GRE tunnel to master %s" % self.master)
            if not netcontrol.gre_exists('docklet-br', self.master):
                netcontrol.setup_gre('docklet-br', self.master)
Ejemplo n.º 9
0
    def __init__(self, etcdclient, addr, port):
        """Bring up a worker: register in etcd, prepare storage, RPC and network.

        Steps, in order:
          1. write "waiting" to machines/runnodes/<addr> and read it back;
          2. compare the etcd "token" key with <FS_PREFIX>/global/token;
          3. select 'recovery' mode when this node's key is listed under
             machines/allnodes, otherwise 'new' mode;
          4. 'new': refuse to continue if containers are recorded globally,
             delete all containers and create the LVM group;
             'recovery': recover the LVM group and check all containers;
          5. create the XML-RPC server;
          6. unless this node is also the master, prepare 'docklet-br' and
             the GRE tunnel to the master.

        Fatal checks log an error and call sys.exit(1); a failed container
        check in recovery mode is only logged.

        etcdclient -- etcd client providing getkey / setkey / listdir
        addr       -- address of this worker, used in etcd keys and for RPC
        port       -- RPC port; converted with int() before use
        """
        self.addr = addr
        self.port = port
        logger.info("begin initialize on %s" % self.addr)

        self.fspath = env.getenv('FS_PREFIX')
        self.poolsize = env.getenv('DISKPOOL_SIZE')

        self.etcd = etcdclient
        self.master = self.etcd.getkey("service/master")[1]
        self.mode = None

        # waiting state is preserved for compatible.
        runnode_key = "machines/runnodes/" + self.addr
        self.etcd.setkey(runnode_key, "waiting")
        # get this node's key to judge how to init.
        [status, _] = self.etcd.getkey(runnode_key)
        if status:
            self.key = generatekey("machines/allnodes/" + self.addr)
        else:
            logger.error("get key failed. %s" % runnode_key)
            sys.exit(1)

        # check token to check global directory
        [status, token_1] = self.etcd.getkey("token")
        # the context manager closes the token file once the line is read
        with open(self.fspath + "/global/token", 'r') as tokenfile:
            token_2 = tokenfile.readline().strip()
        if token_1 != token_2:
            logger.error(
                "check token failed, global directory is not a shared filesystem"
            )
            sys.exit(1)
        logger.info("worker registered and checked the token")

        # worker search all run nodes to judge how to init
        # If the node in all node list, we will recover it.
        # Otherwise, this node is new added in.
        [status, alllist] = self.etcd.listdir("machines/allnodes")
        if any(node['key'] == self.key for node in alllist):
            value = 'init-recovery'
        else:
            value = 'init-new'
        logger.info("worker start in " + value + " mode")

        Containers = container.Container(self.addr, etcdclient)
        if value == 'init-new':
            logger.info("init worker with mode:new")
            self.mode = 'new'
            # check global directory do not have containers on this worker
            [both, _, onlyglobal] = Containers.diff_containers()
            if len(both + onlyglobal) > 0:
                logger.error(
                    "mode:new will clean containers recorded in global, please check"
                )
                sys.exit(1)
            [status, _] = Containers.delete_allcontainers()
            if not status:
                logger.error("delete all containers failed")
                sys.exit(1)
            # create new lvm VG at last
            new_group("docklet-group", self.poolsize,
                      self.fspath + "/local/docklet-storage")
        elif value == 'init-recovery':
            logger.info("init worker with mode:recovery")
            self.mode = 'recovery'
            # recover lvm VG first
            recover_group("docklet-group",
                          self.fspath + "/local/docklet-storage")
            [status, _] = Containers.check_allcontainers()
            if status:
                logger.info("all containers check ok")
            else:
                # deliberately non-fatal: the worker keeps starting
                logger.info("not all containers check ok")
        else:
            logger.error("worker init mode:%s not supported" % value)
            sys.exit(1)

        # initialize rpc
        # the server address is (ip-addr, port); logRequests=False : not print rpc log
        logger.info("initialize rpcserver %s:%d" % (self.addr, int(self.port)))
        self.rpcserver = ThreadXMLRPCServer((self.addr, int(self.port)),
                                            allow_none=True,
                                            logRequests=False)
        self.rpcserver.register_introspection_functions()
        self.rpcserver.register_instance(Containers)
        self.rpcserver.register_function(monitor.workerFetchInfo)

        # initialize the network
        # if worker and master run on the same node, reuse bridges
        #                     don't need to create new bridges
        if (self.addr == self.master):
            logger.info("master also on this node. reuse master's network")
        else:
            logger.info("initialize network")
            # 'docklet-br' of worker do not need IP Addr.
            if self.mode == 'new':
                if netcontrol.bridge_exists('docklet-br'):
                    netcontrol.del_bridge('docklet-br')
                netcontrol.new_bridge('docklet-br')
            else:
                if not netcontrol.bridge_exists('docklet-br'):
                    logger.error("docklet-br not found")
                    sys.exit(1)
            logger.info("setup GRE tunnel to master %s" % self.master)
            if not netcontrol.gre_exists('docklet-br', self.master):
                netcontrol.setup_gre('docklet-br', self.master)
Ejemplo n.º 10
0
    def __init__(self, etcdclient, addr, port):
        """Bring up a worker: register with the master, prepare storage, RPC, network.

        Steps, in order:
          1. write "waiting" to machines/runnodes/<addr>, then read the key up
             to three times (0.1 s apart) until its value starts with "init";
          2. compare the etcd "token" key with <FS_PREFIX>/global/token;
          3. act on the value read back: 'init-new' deletes all containers
             and creates the LVM group, 'init-recovery' recovers the LVM
             group and checks all containers;
          4. create the XML-RPC server;
          5. unless this node is also the master, prepare 'docklet-br' and
             the GRE tunnel to the master.

        Fatal checks log an error and call sys.exit(1); a failed container
        check in recovery mode is only logged.

        etcdclient -- etcd client providing getkey / setkey
        addr       -- address of this worker, used in etcd keys and for RPC
        port       -- RPC port; converted with int() before use
        """
        self.addr = addr
        self.port = port
        logger.info ("begin initialize on %s" % self.addr)

        self.fspath = env.getenv('FS_PREFIX')
        self.poolsize = env.getenv('DISKPOOL_SIZE')

        self.etcd = etcdclient
        self.master = self.etcd.getkey("service/master")[1]
        self.mode=None

        # register self to master
        runnode_key = "machines/runnodes/"+self.addr
        self.etcd.setkey(runnode_key, "waiting")
        for attempt in range (0, 3):
            [status, value] = self.etcd.getkey(runnode_key)
            if value.startswith("init"):
                break
            # master wakesup every 0.1s  to check register
            # (format fixed: the original "% r" printed repr(addr) followed
            # by "egister" and embedded the continuation-line indentation)
            logger.debug("worker %s register to master failed %d time, sleep %fs"
                         % (self.addr, attempt+1, 0.1))
            time.sleep(0.1)

        if value.startswith("init"):
            # check token to check global directory
            [status, token_1] = self.etcd.getkey("token")
            # the context manager closes the token file once the line is read
            with open(self.fspath+"/global/token", 'r') as tokenfile:
                token_2 = tokenfile.readline().strip()
            if token_1 != token_2:
                logger.error("check token failed, global directory is not a shared filesystem")
                sys.exit(1)
        else:
            logger.error ("worker register in machines/runnodes failed, maybe master not start")
            sys.exit(1)
        logger.info ("worker registered in master and checked the token")

        Containers = container.Container(self.addr, etcdclient)
        if value == 'init-new':
            logger.info ("init worker with mode:new")
            self.mode='new'
            # check global directory do not have containers on this worker
            [both, _, onlyglobal] = Containers.diff_containers()
            if len(both+onlyglobal) > 0:
                logger.error ("mode:new will clean containers recorded in global, please check")
                sys.exit(1)
            [status, _] = Containers.delete_allcontainers()
            if not status:
                logger.error ("delete all containers failed")
                sys.exit(1)
            # create new lvm VG at last
            new_group("docklet-group",self.poolsize,self.fspath+"/local/docklet-storage")
        elif value == 'init-recovery':
            logger.info ("init worker with mode:recovery")
            self.mode='recovery'
            # recover lvm VG first
            recover_group("docklet-group",self.fspath+"/local/docklet-storage")
            [status, _] = Containers.check_allcontainers()
            if status:
                logger.info ("all containers check ok")
            else:
                # deliberately non-fatal: the worker keeps starting
                logger.info ("not all containers check ok")
        else:
            logger.error ("worker init mode:%s not supported" % value)
            sys.exit(1)

        # initialize rpc
        # the server address is (ip-addr, port)
        logger.info ("initialize rpcserver %s:%d" % (self.addr, int(self.port)))
        self.rpcserver = ThreadXMLRPCServer((self.addr, int(self.port)), allow_none=True)
        self.rpcserver.register_introspection_functions()
        self.rpcserver.register_instance(Containers)

        # initialize the network
        # if worker and master run on the same node, reuse bridges
        #                     don't need to create new bridges
        if (self.addr == self.master):
            logger.info ("master also on this node. reuse master's network")
        else:
            logger.info ("initialize network")
            # 'docklet-br' of worker do not need IP Addr.
            if self.mode == 'new':
                if netcontrol.bridge_exists('docklet-br'):
                    netcontrol.del_bridge('docklet-br')
                netcontrol.new_bridge('docklet-br')
            else:
                if not netcontrol.bridge_exists('docklet-br'):
                    logger.error("docklet-br not found")
                    sys.exit(1)
            logger.info ("setup GRE tunnel to master %s" % self.master)
            if not netcontrol.gre_exists('docklet-br', self.master):
                netcontrol.setup_gre('docklet-br', self.master)
Ejemplo n.º 11
0
    def __init__(self, networkmgr, etcdclient, addr, mode):
        """Prepare the system bridge, reconnect to running nodes and wait for all.

        networkmgr -- stored on the instance as self.networkmgr
        etcdclient -- stored on the instance as self.etcd
        addr       -- address of this node (only logged here)
        mode       -- 'new' recreates the 'docklet-br' bridge; any other
                      value requires the bridge to exist already

        Entries under machines/runnodes whose value is 'ok' are added to
        self.runnodes and get an XML-RPC proxy in self.rpcs, using the port
        from the WORKER_PORT setting. The process exits with status 1 when
        the bridge is missing in non-'new' mode. The call blocks until every
        entry of self.allnodes is also found in self.runnodes.
        """
        self.addr = addr
        logger.info ("begin initialize on %s" % self.addr)
        self.networkmgr = networkmgr
        self.etcd = etcdclient
        self.mode = mode
        self.workerport = env.getenv('WORKER_PORT')

        logger.info ("initialize network")

        # 'docklet-br' is given no IP address: every user has a gateway.
        bridge = 'docklet-br'
        if self.mode == 'new':
            # start from a freshly created bridge
            if netcontrol.bridge_exists(bridge):
                netcontrol.del_bridge(bridge)
            netcontrol.new_bridge(bridge)
        elif not netcontrol.bridge_exists(bridge):
            logger.error("docklet-br not found")
            sys.exit(1)

        # rpc list starts empty
        self.rpcs = []

        # pick up the nodes already marked as running in etcd
        self.allnodes = self._nodelist_etcd("allnodes")
        self.runnodes = []
        _status, entries = self.etcd.listdir("machines/runnodes")
        for entry in entries:
            # the node ip is the last path component of the etcd key
            nodeip = entry['key'].rsplit('/', 1)[1]
            if entry['value'] != 'ok':
                continue
            logger.info ("running node %s" % nodeip)
            self.runnodes.append(nodeip)
            proxy = xmlrpc.client.ServerProxy("http://%s:%s" % (nodeip, self.workerport))
            self.rpcs.append(proxy)
            logger.info ("add %s:%s in rpc client list" % (nodeip, self.workerport))

        logger.info ("all nodes are: %s" % self.allnodes)
        logger.info ("run nodes are: %s" % self.runnodes)

        # background thread that watches for newly joining nodes
        logger.info ("start thread to watch new nodes ...")
        watcher = threading.Thread(target=self._watchnewnode)
        self.thread_watchnewnode = watcher
        watcher.start()

        # Poll every 50 ms until all known nodes are listed as running.
        # NOTE(review): presumably _watchnewnode fills self.runnodes - confirm.
        while not all(node in self.runnodes for node in self.allnodes):
            time.sleep(0.05)
        logger.info("all nodes necessary joins ...")
        logger.info ("run nodes are: %s" % self.runnodes)