Example #1
0
    def __init__(self):
        rpc_pb2_grpc.WorkerServicer.__init__(self)
        etcdaddr = env.getenv("ETCD")
        logger.info("using ETCD %s" % etcdaddr)

        clustername = env.getenv("CLUSTER_NAME")
        logger.info("using CLUSTER_NAME %s" % clustername)

        # init etcdlib client
        try:
            self.etcdclient = etcdlib.Client(etcdaddr, prefix=clustername)
        except Exception:
            logger.error(
                "connect etcd failed, maybe etcd address not correct...")
            sys.exit(1)
        else:
            logger.info("etcd connected")

        # get master ip and report port
        [success, masterip] = self.etcdclient.getkey("service/master")
        if not success:
            logger.error("Fail to get master ip address.")
            sys.exit(1)
        else:
            self.master_ip = masterip
            logger.info("Get master ip address: %s" % (self.master_ip))
        self.master_port = env.getenv('BATCH_MASTER_PORT')

        self.imgmgr = imagemgr.ImageMgr()
        self.fspath = env.getenv('FS_PREFIX')
        self.confpath = env.getenv('DOCKLET_CONF')

        self.taskmsgs = []
        self.msgslock = threading.Lock()
        self.report_interval = 2

        self.lock = threading.Lock()
        self.mount_lock = threading.Lock()
        self.cons_gateway = env.getenv('BATCH_GATEWAY')
        self.cons_ips = env.getenv('BATCH_NET')
        logger.info("Batch gateway ip address %s" % self.cons_gateway)
        logger.info("Batch ip pools %s" % self.cons_ips)

        self.cidr = 32 - int(self.cons_ips.split('/')[1])
        self.ipbase = ip_to_int(self.cons_ips.split('/')[0])
        self.free_ips = []
        for i in range(2, (1 << self.cidr) - 1):
            self.free_ips.append(i)
        logger.info("Free ip addresses pool %s" % str(self.free_ips))

        self.gpu_lock = threading.Lock()
        self.gpu_status = {}
        gpus = gputools.get_gpu_status()
        for gpu in gpus:
            self.gpu_status[gpu['id']] = ""

        self.start_report()
        logger.info('TaskController init success')
Example #2
0
    def __init__(self):
        rpc_pb2_grpc.WorkerServicer.__init__(self)
        etcdaddr = env.getenv("ETCD")
        logger.info ("using ETCD %s" % etcdaddr )

        clustername = env.getenv("CLUSTER_NAME")
        logger.info ("using CLUSTER_NAME %s" % clustername )

        # init etcdlib client
        try:
            self.etcdclient = etcdlib.Client(etcdaddr, prefix = clustername)
        except Exception:
            logger.error ("connect etcd failed, maybe etcd address not correct...")
            sys.exit(1)
        else:
            logger.info("etcd connected")

        # get master ip and report port
        [success,masterip] = self.etcdclient.getkey("service/master")
        if not success:
            logger.error("Fail to get master ip address.")
            sys.exit(1)
        else:
            self.master_ip = masterip
            logger.info("Get master ip address: %s" % (self.master_ip))
        self.master_port = env.getenv('BATCH_MASTER_PORT')

        # get worker ip
        self.worker_ip = getip(env.getenv('NETWORK_DEVICE'))
        logger.info("Worker ip is :%s"%self.worker_ip)

        self.imgmgr = imagemgr.ImageMgr()
        self.fspath = env.getenv('FS_PREFIX')
        self.confpath = env.getenv('DOCKLET_CONF')
        self.rm_all_batch_containers()

        self.taskmsgs = []
        self.msgslock = threading.Lock()
        self.report_interval = 2

        self.lock = threading.Lock()
        self.mount_lock = threading.Lock()

        self.gpu_lock = threading.Lock()
        self.gpu_status = {}
        gpus = gputools.get_gpu_status()
        for gpu in gpus:
            self.gpu_status[gpu['id']] = ""

        self.start_report()
        logger.info('TaskWorker init success')
Example #3
0
    def __init__(self, addr, etcdclient):
        self.addr = addr
        self.etcd = etcdclient
        self.libpath = env.getenv('DOCKLET_LIB')
        self.confpath = env.getenv('DOCKLET_CONF')
        self.fspath = env.getenv('FS_PREFIX')
        # set jupyter running dir in container
        self.rundir = "/home/jupyter"
        # set root running dir in container
        self.nodehome = "/root"

        self.lxcpath = "/var/lib/lxc"
        self.imgmgr = imagemgr.ImageMgr()
        self.historymgr = History_Manager()
Example #4
0
    def __init__(self, nodemgr, networkmgr, etcdclient, addr, mode, distributedgw='False'):
        self.mode = mode
        self.distributedgw = distributedgw
        self.nodemgr = nodemgr
        self.imgmgr = imagemgr.ImageMgr()
        self.networkmgr = networkmgr
        self.addr = addr
        self.etcd = etcdclient
        self.defaultsize = env.getenv("CLUSTER_SIZE")
        self.fspath = env.getenv("FS_PREFIX")
        self.clusterid_locks = threading.Lock()

        # check database
        try:
            Container.query.all()
            PortMapping.query.all()
            VCluster.query.all()
        except:
            # create database
            db.create_all()

        logger.info ("vcluster start on %s" % (self.addr))
        if self.mode == 'new':
            logger.info ("starting in new mode on %s" % (self.addr))
            # check if all clusters data are deleted in httprest.py
            clean = True
            usersdir = self.fspath+"/global/users/"
            vclusters = VCluster.query.all()
            if len(vclusters) != 0:
                clean = False
            for user in os.listdir(usersdir):
                if len(os.listdir(usersdir+user+"/hosts")) > 0:
                    clean = False
            if not clean:
                logger.error ("clusters files not clean, start failed")
                sys.exit(1)
        elif self.mode == "recovery":
            logger.info ("starting in recovery mode on %s" % (self.addr))
            self.recover_allclusters()
        else:
            logger.error ("not supported mode:%s" % self.mode)
            sys.exit(1)
Example #5
0
    G_networkmgr.printpools()

    G_cloudmgr = cloudmgr.CloudMgr()

    # start NodeMgr and NodeMgr will wait for all nodes to start ...
    G_nodemgr = nodemgr.NodeMgr(G_networkmgr,
                                etcdclient,
                                addr=ipaddr,
                                mode=mode)
    logger.info("nodemgr started")
    distributedgw = env.getenv("DISTRIBUTED_GATEWAY")
    G_vclustermgr = vclustermgr.VclusterMgr(G_nodemgr, G_networkmgr,
                                            etcdclient, ipaddr, mode,
                                            distributedgw)
    logger.info("vclustermgr started")
    G_imagemgr = imagemgr.ImageMgr()
    logger.info("imagemgr started")

    G_releasemgr = releasemgr.ReleaseMgr(G_vclustermgr, G_ulockmgr)
    G_releasemgr.start()
    logger.info("releasemgr started")

    logger.info("startting to listen on: ")
    masterip = env.getenv('MASTER_IP')
    logger.info("using MASTER_IP %s", masterip)

    masterport = env.getenv('MASTER_PORT')
    logger.info("using MASTER_PORT %d", int(masterport))

    G_historymgr = History_Manager()
    master_collector = monitor.Master_Collector(G_nodemgr,