def __init__(self):
    """Initialize the TaskController gRPC servicer.

    Connects to etcd, resolves the master address, prepares the batch
    container network (gateway, ip pool) and GPU bookkeeping, then starts
    the periodic report loop.  Exits the process (sys.exit(1)) when etcd
    or the master address cannot be obtained.
    """
    rpc_pb2_grpc.WorkerServicer.__init__(self)
    # cluster-wide configuration comes from the environment
    etcdaddr = env.getenv("ETCD")
    logger.info("using ETCD %s" % etcdaddr)
    clustername = env.getenv("CLUSTER_NAME")
    logger.info("using CLUSTER_NAME %s" % clustername)
    # init etcdlib client; a failure here is fatal for the controller
    try:
        self.etcdclient = etcdlib.Client(etcdaddr, prefix=clustername)
    except Exception:
        logger.error("connect etcd failed, maybe etcd address not correct...")
        sys.exit(1)
    else:
        logger.info("etcd connected")
    # get master ip and report port; getkey returns [success_flag, value]
    [success, masterip] = self.etcdclient.getkey("service/master")
    if not success:
        logger.error("Fail to get master ip address.")
        sys.exit(1)
    else:
        self.master_ip = masterip
        logger.info("Get master ip address: %s" % (self.master_ip))
    self.master_port = env.getenv('BATCH_MASTER_PORT')
    self.imgmgr = imagemgr.ImageMgr()
    self.fspath = env.getenv('FS_PREFIX')
    self.confpath = env.getenv('DOCKLET_CONF')
    # pending task messages for the master, guarded by msgslock
    self.taskmsgs = []
    self.msgslock = threading.Lock()
    self.report_interval = 2  # seconds between reports to the master
    self.lock = threading.Lock()
    self.mount_lock = threading.Lock()
    # batch container network: gateway address and CIDR pool, e.g. "a.b.c.d/nn"
    self.cons_gateway = env.getenv('BATCH_GATEWAY')
    self.cons_ips = env.getenv('BATCH_NET')
    logger.info("Batch gateway ip address %s" % self.cons_gateway)
    logger.info("Batch ip pools %s" % self.cons_ips)
    # host-bit count of the pool and its integer base address
    self.cidr = 32 - int(self.cons_ips.split('/')[1])
    self.ipbase = ip_to_int(self.cons_ips.split('/')[0])
    # free host offsets; 0/1 and the broadcast offset are excluded
    # (presumably reserved for network/gateway — TODO confirm against allocator)
    self.free_ips = []
    for i in range(2, (1 << self.cidr) - 1):
        self.free_ips.append(i)
    logger.info("Free ip addresses pool %s" % str(self.free_ips))
    # gpu_status maps gpu id -> owner task ("" means free), guarded by gpu_lock
    self.gpu_lock = threading.Lock()
    self.gpu_status = {}
    gpus = gputools.get_gpu_status()
    for gpu in gpus:
        self.gpu_status[gpu['id']] = ""
    # kick off the background report thread
    self.start_report()
    logger.info('TaskController init success')
def __init__(self):
    """Initialize the TaskWorker gRPC servicer.

    Connects to etcd, resolves the master and local worker addresses,
    removes leftover batch containers, prepares message/GPU bookkeeping,
    then starts the periodic report loop.  Exits the process
    (sys.exit(1)) when etcd or the master address cannot be obtained.
    """
    rpc_pb2_grpc.WorkerServicer.__init__(self)
    # cluster-wide configuration comes from the environment
    etcdaddr = env.getenv("ETCD")
    logger.info("using ETCD %s" % etcdaddr)
    clustername = env.getenv("CLUSTER_NAME")
    logger.info("using CLUSTER_NAME %s" % clustername)
    # init etcdlib client; a failure here is fatal for the worker
    try:
        self.etcdclient = etcdlib.Client(etcdaddr, prefix=clustername)
    except Exception:
        logger.error("connect etcd failed, maybe etcd address not correct...")
        sys.exit(1)
    else:
        logger.info("etcd connected")
    # get master ip and report port; getkey returns [success_flag, value]
    [success, masterip] = self.etcdclient.getkey("service/master")
    if not success:
        logger.error("Fail to get master ip address.")
        sys.exit(1)
    else:
        self.master_ip = masterip
        logger.info("Get master ip address: %s" % (self.master_ip))
    self.master_port = env.getenv('BATCH_MASTER_PORT')
    # get worker ip from the configured network device
    self.worker_ip = getip(env.getenv('NETWORK_DEVICE'))
    logger.info("Worker ip is :%s" % self.worker_ip)
    self.imgmgr = imagemgr.ImageMgr()
    self.fspath = env.getenv('FS_PREFIX')
    self.confpath = env.getenv('DOCKLET_CONF')
    # clean up batch containers left over from a previous run
    self.rm_all_batch_containers()
    # pending task messages for the master, guarded by msgslock
    self.taskmsgs = []
    self.msgslock = threading.Lock()
    self.report_interval = 2  # seconds between reports to the master
    self.lock = threading.Lock()
    self.mount_lock = threading.Lock()
    # gpu_status maps gpu id -> owner task ("" means free), guarded by gpu_lock
    self.gpu_lock = threading.Lock()
    self.gpu_status = {}
    gpus = gputools.get_gpu_status()
    for gpu in gpus:
        self.gpu_status[gpu['id']] = ""
    # kick off the background report thread
    self.start_report()
    logger.info('TaskWorker init success')
def __init__(self, addr, etcdclient):
    """Store the node address and etcd handle, then load path settings.

    Paths for the docklet lib/conf/fs trees come from the environment;
    the in-container run directories and the lxc store are fixed.
    """
    getenv = env.getenv
    self.addr = addr
    self.etcd = etcdclient
    # host-side directories configured through the environment
    self.libpath = getenv('DOCKLET_LIB')
    self.confpath = getenv('DOCKLET_CONF')
    self.fspath = getenv('FS_PREFIX')
    # fixed in-container locations: jupyter run dir, root home, lxc store
    self.rundir = "/home/jupyter"
    self.nodehome = "/root"
    self.lxcpath = "/var/lib/lxc"
    # helpers for image management and operation history
    self.imgmgr = imagemgr.ImageMgr()
    self.historymgr = History_Manager()
def __init__(self, nodemgr, networkmgr, etcdclient, addr, mode, distributedgw='False'):
    """Initialize the VclusterMgr.

    Args:
        nodemgr: node manager used to reach worker nodes.
        networkmgr: network manager for cluster networking.
        etcdclient: connected etcd client.
        addr: address this manager runs on (used for logging/ownership).
        mode: 'new' for a fresh start (requires clean state) or
              'recovery' to restore existing clusters; anything else is fatal.
        distributedgw: whether gateways are distributed ('True'/'False' string).

    Exits the process (sys.exit(1)) when mode is 'new' but leftover
    cluster data exists, or when mode is unsupported.
    """
    self.mode = mode
    self.distributedgw = distributedgw
    self.nodemgr = nodemgr
    self.imgmgr = imagemgr.ImageMgr()
    self.networkmgr = networkmgr
    self.addr = addr
    self.etcd = etcdclient
    self.defaultsize = env.getenv("CLUSTER_SIZE")
    self.fspath = env.getenv("FS_PREFIX")
    self.clusterid_locks = threading.Lock()
    # check database: probe the tables; if the schema is missing, create it.
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate.
    try:
        Container.query.all()
        PortMapping.query.all()
        VCluster.query.all()
    except Exception:
        # create database
        db.create_all()
    logger.info("vcluster start on %s" % (self.addr))
    if self.mode == 'new':
        logger.info("starting in new mode on %s" % (self.addr))
        # check if all clusters data are deleted in httprest.py:
        # both the DB and every user's hosts directory must be empty
        clean = True
        usersdir = self.fspath + "/global/users/"
        vclusters = VCluster.query.all()
        if len(vclusters) != 0:
            clean = False
        for user in os.listdir(usersdir):
            if len(os.listdir(usersdir + user + "/hosts")) > 0:
                clean = False
        if not clean:
            logger.error("clusters files not clean, start failed")
            sys.exit(1)
    elif self.mode == "recovery":
        logger.info("starting in recovery mode on %s" % (self.addr))
        self.recover_allclusters()
    else:
        logger.error("not supported mode:%s" % self.mode)
        sys.exit(1)
G_networkmgr.printpools() G_cloudmgr = cloudmgr.CloudMgr() # start NodeMgr and NodeMgr will wait for all nodes to start ... G_nodemgr = nodemgr.NodeMgr(G_networkmgr, etcdclient, addr=ipaddr, mode=mode) logger.info("nodemgr started") distributedgw = env.getenv("DISTRIBUTED_GATEWAY") G_vclustermgr = vclustermgr.VclusterMgr(G_nodemgr, G_networkmgr, etcdclient, ipaddr, mode, distributedgw) logger.info("vclustermgr started") G_imagemgr = imagemgr.ImageMgr() logger.info("imagemgr started") G_releasemgr = releasemgr.ReleaseMgr(G_vclustermgr, G_ulockmgr) G_releasemgr.start() logger.info("releasemgr started") logger.info("startting to listen on: ") masterip = env.getenv('MASTER_IP') logger.info("using MASTER_IP %s", masterip) masterport = env.getenv('MASTER_PORT') logger.info("using MASTER_PORT %d", int(masterport)) G_historymgr = History_Manager() master_collector = monitor.Master_Collector(G_nodemgr,